Example #1
    def setup(self):
        self.variables = {}

        filenames = []
        if not hasattr(self.config, 'enable'):
            for fn in os.listdir(os.path.join(home, 'modules')):
                if fn.endswith('.py') and not fn.startswith('_'):
                    filenames.append(os.path.join(home, 'modules', fn))
        else:
            for fn in self.config.enable:
                filenames.append(os.path.join(home, 'modules', fn + '.py'))

        if hasattr(self.config, 'extra'):
            for fn in self.config.extra:
                if os.path.isfile(fn):
                    filenames.append(fn)
                elif os.path.isdir(fn):
                    for n in os.listdir(fn):
                        if n.endswith('.py') and not n.startswith('_'):
                            filenames.append(os.path.join(fn, n))

        tools.setup(self)

        modules = {}
        excluded_modules = getattr(self.config, 'exclude', [])

        for filename in filenames:
            name = os.path.basename(filename)[:-3]
            if name in excluded_modules: continue

            try:
                module_loader = importlib.machinery.SourceFileLoader(
                    name, filename)
                module = module_loader.load_module()
            except Exception as e:
                trace = traceback.format_exc()
                logger.error("Error loading %s module:\n%s" % (name, trace))
                continue

            if module_control(self, module, 'setup'):
                self.register(module)
                modules[name] = module

        self.modules = modules

        if modules:
            logger.info('Registered modules: ' +
                        ', '.join(sorted(modules.keys())))
        else:
            logger.warning("Couldn't find any modules")

        self.bind_commands()
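Example #1 only registers a module when module_control(self, module, 'setup') succeeds, but that helper is not part of the snippet. A minimal sketch of what such a hook runner could look like, assuming it simply calls the module's setup function and rejects the module when the call fails (the name, signature and behaviour are guesses, not the original implementation):

import logging
import traceback

logger = logging.getLogger(__name__)

def module_control(bot, module, func_name):
    # Call an optional hook (e.g. 'setup') on a freshly loaded module and
    # report whether the module should be registered.
    func = getattr(module, func_name, None)
    if func is None:
        return True  # no hook defined, accept the module as-is
    try:
        func(bot)
        return True
    except Exception:
        trace = traceback.format_exc()
        logger.error("Error in %s.%s:\n%s" % (module.__name__, func_name, trace))
        return False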
Example #2
    def setup(self): 
        self.variables = {}

        filenames = []
        if not hasattr(self.config, 'enable'): 
            for fn in os.listdir(os.path.join(home, 'modules')): 
                if fn.endswith('.py') and not fn.startswith('_'): 
                    filenames.append(os.path.join(home, 'modules', fn))
        else: 
            for fn in self.config.enable: 
                filenames.append(os.path.join(home, 'modules', fn + '.py'))

        if hasattr(self.config, 'extra'): 
            for fn in self.config.extra: 
                if os.path.isfile(fn): 
                    filenames.append(fn)
                elif os.path.isdir(fn): 
                    for n in os.listdir(fn): 
                        if n.endswith('.py') and not n.startswith('_'): 
                            filenames.append(os.path.join(fn, n))

        tools.setup(self)

        modules = {}
        excluded_modules = getattr(self.config, 'exclude', [])

        for filename in filenames: 
            name = os.path.basename(filename)[:-3]
            if name in excluded_modules: continue

            try:
                module_loader = importlib.machinery.SourceFileLoader(name, filename)
                module = module_loader.load_module()
            except Exception as e: 
                trace = traceback.format_exc()
                logger.error("Error loading %s module:\n%s" % (name, trace))
                continue

            if module_control(self, module, 'setup'):
                self.register(module)
                modules[name] = module

        self.modules = modules

        if modules: 
            logger.info('Registered modules: ' + ', '.join(sorted(modules.keys())))
        else:
            logger.warning("Couldn't find any modules")

        self.bind_commands()
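Most of the remaining examples share the same preamble: tools.setup(param_file) returns a parameter object p, a base_path and an output_dir, and tools.get_logger attaches a file logger. The real helper is project specific and not shown on this page; the following sketch only illustrates the shape those snippets rely on, assuming the parameters live in a JSON file (the Params class and every other name here are illustrative, not the original code):

import json
import os
import time

class Params(dict):
    # minimal stand-in for the parameter object used below; it behaves
    # like a dict but also offers as_dict(), which some snippets call
    def as_dict(self):
        return dict(self)

def setup(param_file=None):
    # Read run parameters from a JSON file, derive a base path and
    # create a per-run output directory (purely illustrative).
    param_file = param_file or 'params.json'
    with open(param_file) as f:
        p = Params(json.load(f))
    base_path = p.get('base_path',
                      os.path.dirname(os.path.abspath(param_file)))
    output_dir = os.path.join(base_path, 'output',
                              time.strftime('%Y%m%d-%H%M%S'))
    os.makedirs(output_dir)
    return p, base_path, output_dir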
Example #3
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    preprocess = []

    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]
        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if p['input'].endswith('.json'):
        cor = JsonCorpus(path.join(base_path, p['input']),
                         no_below=p['no_below'],
                         no_above=p['no_above'],
                         preprocess=preprocess)
    else:
        cor = TextFilesCorpus(path.join(base_path, p['input']),
                      no_below=p['no_below'],
                      no_above=p['no_above'],
                      preprocess=preprocess)

    MmCorpus.serialize(path.join(output_dir, p['corpus_name']), cor, progress_cnt=10000)
    cor.dictionary.save(path.join(output_dir, p['dict_name']))
Example #4
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # in test case
    if param_file:
        files = [path.join(base_path, p['wiki_txt'])]
    else:
        files = glob.glob(path.join(base_path, p['wiki_txt']) + '*.txt')

    out = codecs.open(os.path.join(output_dir, 'wiki.json'), mode='w', encoding='utf-8')

    headline = re.compile('\[\[(.*)\]\]')
    level2 = re.compile('== (.*) ==')

    t0 = time.time()
    c = 0
    res = {}

    for file in files:
        print 'work on: %s' % file
        with codecs.open(file, encoding='utf-8') as f:
            for line in f:

                # ignore linebreaks
                if line == '\n':
                    continue

                # if headline found
                if headline.search(line):
                    if len(res) > 0:
                        out.write(json.dumps(res, encoding='utf-8', ensure_ascii=False) + '\n')
                    topic = headline.search(line).groups()[0]
                    res = {topic: {}}
                    sub = None

                elif level2.search(line):
                    sub = level2.search(line).groups()[0]
                else:
                    if not sub:
                        res[topic].setdefault('desc', []).append(line.strip())
                    else:
                        res[topic].setdefault(sub, []).append(line.strip())
        c += 1
        print 'average execution time: %f' % ((time.time() - t0) / c)
    out.write(json.dumps(res, encoding='utf-8', ensure_ascii=False) + '\n')

    print time.time() - t0
Example #5
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    inp = codecs.open(os.path.join(p['base_path'],
                                   p['corpora_path'],
                                   p['corpus_name']),
                      mode='r', encoding='utf-8')
    out = codecs.open(os.path.join(output_dir,
                                   p['result_name']),
                      mode='w', encoding='utf-8')
    pair = re.compile('\d\.(\w+):(\w+)')
    exclude = set(string.punctuation)

    line_count = 0
    res = []

    for line in inp:
        
        # skip empty lines
        if line == "\n":
            continue
        
        # finished one entry
        if line_count % 5 == 0:
            print pair.search(line).groups()
            res.append({'terms': pair.search(line).groups(),
                        'sentences': [],
                        'sentences_tagged': [],
                        'values': []})

        # annotate sentence and add it to result
        if line_count % 5 == 1 or line_count % 5 == 2:
            res[-1]['sentences'].append(line.strip())
            cleaned = "".join(ch for ch in line.strip() if ch not in exclude)
            tagged = tools.tag(cleaned, p['senna_path'])
            res[-1]['sentences_tagged'].append(tagged)

        # add the ratings
        if line_count % 5 == 3 or line_count % 5 == 4:
            res[-1]['values'].append(float(line))

        line_count += 1
    
    # store the output
    json.dump(res, out, indent=2)
Example #6
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('load the articles..')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle')))

    logger.info('load dictionary and models')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['model_label'],
                                           'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model')))
    if int(p['num_topics']) > lsi.num_topics:
        logger.error('model too small')
    lsi.num_topics = int(p['num_topics'])

    data = {}
    for topic, entries in wiki.iteritems():
        logger.info('working on: %s' % topic)

        data[topic] = {}
        data[topic]['keys'] = []
        vecs = []
        data[topic]['ratings'] = []
        for key, val in entries.iteritems():
            data[topic]['keys'].append(key)
            vecs.append(lsi[pre[dictionary.doc2bow(val['text'])]])
            data[topic]['ratings'].append(val['rating'])
        data[topic]['vecs'] = np.squeeze(np.array(vecs)[:, :, 1:2]).T

        U, d, _ = np.linalg.svd(data[topic]['vecs'], full_matrices=False)
        data[topic]['U'] = U
        data[topic]['d'] = d

    f = open(os.path.join(output_dir, "data.pickle"), 'wb')
    pickle.dump(data, f)
Example #7
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path,
                                      p['folder_path'],
                                      "*.txt"))
    for f in filelist:

        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-16")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are ommited" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))

        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search('\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                    (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word

            else:
                text = page.edit()

            # check for missing wikipedia articles
            if  text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD',
                                    data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    #add human rating to the wikipedia data
    not_found = []
    with open(os.path.join(sparql_path, p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = int(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii
    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
Example #8
def system_check():
    return t.setup()
Example #9
def main(profile="default"):
    """Main function that is run to start application"""
    global SCORE, PROFILE

    # run setup, check if everything is alright
    setup()

    # load profile
    PROFILE = load_profile(profile_name=profile)

    # initialize board and score
    board = [[0]*4 for i in range(4)]
    SCORE = 0

    # initialize pygame
    pygame.init()
    fpsClock = pygame.time.Clock()
    screen = pygame.display.set_mode((600, 600))
    pygame.display.set_caption("x800 - a 2048 clone")

    # back button surface
    back_button = pygame.font.SysFont("monospace", 25).render(" <> Back to menu", 1, PROFILE["text_color"], PROFILE["bg_color"])

    # load default tile set
    tile_set = load_package(PROFILE["tile_set_name"])

    # set first random tile
    board[randrange(4)][randrange(4)] = choice([2, 4])

    # variable to track the state of the game
    game_state = "normal"

    # main loop
    while True:

        if game_over(board):          
            if pop_up(screen, fpsClock, msg="Play again? (y/n)"):

                # reset game
                board = [[0]*4 for i in range(4)] # reset board
                SCORE = 0 # reset score
                board[randrange(4)][randrange(4)] = choice([2, 4]) # set 1st random tile 
                screen.fill(PROFILE["bg_color"]) # fill with background color

            else:
                terminate()

        elif game_won(board) and game_state != "won+":
            game_state = "won"
            if pop_up(screen, fpsClock, msg="You winner! Continue? (y/n)"):
                game_state = "won+"

            else:

                # reset game
                board = [[0]*4 for i in range(4)] # reset board
                SCORE = 0 # reset score
                board[randrange(4)][randrange(4)] = choice([2, 4]) # set 1st random tile 
                screen.fill(PROFILE["bg_color"]) # fill with background color

        # update best score if needed
        if SCORE > PROFILE["best_score"]: PROFILE["best_score"] = SCORE

        # event loop
        for event in pygame.event.get():
            if event.type == QUIT: terminate()

            # when a key is unpressed move the board
            elif event.type == KEYUP:
                if event.key == K_UP:
                    board = move(board, "UP")

                elif event.key == K_DOWN:
                    board = move(board, "DOWN")

                elif event.key == K_LEFT:
                    board = move(board, "LEFT")

                elif event.key == K_RIGHT:
                    board = move(board, "RIGHT")

            # when the mouse is clicked
            elif event.type == MOUSEBUTTONUP:
                # if button was clicked
                if back_button.get_rect(bottomleft=screen.get_rect().bottomleft).collidepoint(pygame.mouse.get_pos()):
                    terminate(load=True)
        
        screen.fill(PROFILE["bg_color"]) # background color

        # blit the board surface to the center of the screen
        board_surf = get_board_surf(board, tile_set)
        board_pos = board_surf.get_rect(center=screen.get_rect().center)
        screen.blit(board_surf, board_pos)
        
        # get score surface and blit to the top
        score_surf = get_score_surf(SCORE, best=PROFILE["best_score"], msg=PROFILE["score_msg"])
        score_pos = score_surf.get_rect(midbottom=board_pos.midtop)
        screen.blit(score_surf, score_pos)

        # get and blit greeting text on top of the score
        greeting_surf = get_user_surf()
        greeting_pos = greeting_surf.get_rect(midbottom=score_pos.midtop)
        screen.blit(greeting_surf, greeting_pos)

        # blit back button surface
        screen.blit(back_button, back_button.get_rect(bottomleft=(screen.get_rect().bottomleft)))
        
        # update screen and game clock
        pygame.display.flip()
        fpsClock.tick(60) # FPS = 60
Example #10
    def connect(self):
        # Initiate the connection to Twitter Streaming API
        return TwitterStream(auth=tools.setup())
Example #11
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path,
                           p['result_path'],
                           p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []

    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]
        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary

        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []


    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search('data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)
    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)


    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0,1]
    print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
Example #12
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))
Example #13
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('loading models and dictionary')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['model_label'],
                                           'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model')))
    lsi.num_topics = p['num_topics']

    logger.info('load wikipedia articles')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle')))

    times = np.zeros((1, len(wiki)))
    count = 0
    for query_key, query in wiki.iteritems():
        logger.info("working on: %s" % query_key)
        n = len(query)
        human = [val['rating'] for val in query.itervalues()]

        t0 = time.time()
        corpus = [lsi[pre[dictionary.doc2bow(val['text'])]]
                    for val in query.itervalues()]
        sim_res = MatrixSimilarity(corpus)[corpus]
        sim_res.save(path.join(output_dir, 'sim_' + query_key))
        avg = np.mean(sim_res, axis=0)
        idx = np.argsort(avg)
        times[0, count] = time.time() - t0
        count += 1

        # compute correlation with human rating
        res = np.zeros((n, 1))
        for i in range(n):
            human_r = [human[j] for j in idx[i:]]
            res[i, 0] = np.mean(human_r)

        # plot correlation
        fig = plt.figure()
        ax = fig.add_subplot(3, 1, 1)
        ax.plot(res)

        ax = fig.add_subplot(3, 1, 2)
        ratings = [val['rating'] for val in query.itervalues()]
        ax.scatter(avg[idx], [ratings[i] for i in idx])

        # plot similarity distribution
        ax = fig.add_subplot(3, 1, 3)
        ax.bar(range(n), avg[idx])

        # Set the x tick labels to the group_labels defined above and rotate
        ax.set_xticks(range(n))
        k = [key + ' ' + str(query[key]['rating']) for key in query.keys()]
        ax.set_xticklabels([k[i] for i in idx])
        fig.autofmt_xdate()
        plt.savefig(path.join(output_dir, query_key + '.' + p['format']))
        plt.close()
    logger.info('average similarity calculation time: %f' % np.mean(times))
Example #14
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics

    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):

        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #15
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path,
                                           p['corpus_path'],
                                           p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #16
        save_profile(self.profile_to_save, self.profile_to_save["user"])
        
        StartMenu().run()

    def show_preview(self):
        """Calls function preview from the tools module"""

        preview(self.get_profile_dict())

    def build(self):
        """Packs widgets"""

        # put widgets in grid
        for i, widget in enumerate(self.widgets):
            widget["label"].grid(row=i, column=0)
            widget["entry"].grid(row=i, column=1)

        # put the save and preview buttons on the bottom
        self.save_button.grid(row=6, column=0)
        self.preview_button.grid(row=6, column=1)

    def run(self):
        """Starts menu application"""
        
        self.build()
        self.root.mainloop()

if __name__ == "__main__":
    setup() # make sure everything is ok
    StartMenu().run()
Example #17
    def connect(self):
        # Initiate the connection to Twitter REST API
        return Twitter(auth=tools.setup())