def construct_vocab():
    # Build word-, character-, and target-level vocabularies from the
    # training file, plus word/char vocabularies from the test file.
    f = open(cfg.PATH_TO_ENG_Y_TRAIN)
    g = open(cfg.PATH_TO_ENG_Y_TEST)
    word_dic = Dictionary()
    char_dic = Dictionary()
    target_dic = Dictionary()
    # Register the special tokens first so they receive the lowest ids.
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    target_dic.add_documents([["UNK", "BOW"]])

    line = f.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        target_dic.add_documents([sentence])
        target_dic.add_documents([get_chars(line)])
        line = f.readline()
    f.close()

    line = g.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        line = g.readline()
    g.close()
    return (list(word_dic.itervalues()), list(char_dic.itervalues()),
            list(target_dic.itervalues()))

def construct_test(tagger):
    # Tokenize the raw test text, write the space-joined tokens to
    # cfg.PATH_TO_X_TEST, and build word/char vocabularies over it.
    f = open(cfg.PATH_TO_VGR_domain_text2)
    g = open(cfg.PATH_TO_X_TEST, 'w')
    line = f.readline()
    word_dic = Dictionary()
    char_dic = Dictionary()
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    while line:
        sentence = _tokenize(line, tagger)
        g.write(" ".join(sentence) + "\n")
        word_dic.add_documents([sentence])
        char_dic.add_documents([list(line)])
        line = f.readline()
    f.close()
    g.close()
    return list(word_dic.itervalues()), list(char_dic.itervalues())
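
Both functions lean on the same small slice of the gensim Dictionary API: create an empty Dictionary, grow it with add_documents(), and read the vocabulary back out. A minimal, self-contained sketch of just those calls, with a made-up two-document corpus (the cfg paths and the _tokenize/get_chars helpers above are project-specific and not reproduced):

from gensim.corpora import Dictionary

docs = [["the", "cat", "sat"], ["the", "dog", "ran"]]

d = Dictionary()
d.add_documents([["UNK", "EOS"]])  # register special tokens first
d.add_documents(docs)              # grow the vocabulary in place

print(d.token2id)        # token -> integer id mapping
print(list(d.values()))  # vocabulary as plain strings; the snippets
                         # above use the itervalues() alias instead
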
Example #3
def main():
    logger.info('-' * 80)
    logger.info('Loading data')
    corpus = load_corpus(args.dataset_dir)

    logger.info('-' * 80)
    logger.info('Make dictionary')

    dictionary = Dictionary(corpus)
    # Filter out words that occur in fewer than TOKEN_MIN_DOCS documents,
    # or in more than TOKEN_MAX_DOCS_FRAC of all documents.
    dictionary.filter_extremes(no_below=TOKEN_MIN_DOCS,
                               no_above=TOKEN_MAX_DOCS_FRAC)

    vocab_path = os.path.join(args.dump_dir, 'vocab.txt')
    with open(vocab_path, 'w') as f:
        f.write("\n".join(dictionary.itervalues()) + '\n')

    # Bag-of-words representation of the documents.
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    logger.info(f'Number of unique tokens: {len(dictionary)}')
    logger.info(f'Number of documents: {len(bow_corpus)}')

    logger.info('-' * 80)
    logger.info('Training model')

    callbacks = []
    if 'perplexity' in args.callbacks:
        perplexity_metric = PerplexityMetric(corpus=bow_corpus)
        callbacks.append(perplexity_metric)
    if 'coherence' in args.callbacks:
        coherence_metric = CoherenceMetric(texts=corpus,
                                           dictionary=dictionary,
                                           coherence=args.coherence,
                                           topn=args.topn)
        callbacks.append(coherence_metric)

    model_path = os.path.join(args.dump_dir, 'lda.model')
    if args.model == 'lda':
        model = LdaModel(corpus=bow_corpus,
                         num_topics=args.num_topics,
                         id2word=dictionary,
                         passes=args.num_epochs,
                         update_every=1,
                         eval_every=args.eval_every,
                         iterations=args.iterations,
                         alpha='auto',
                         eta='auto',
                         chunksize=args.batch_size,
                         callbacks=callbacks,
                         log_dir=args.log_dir,
                         model_dir=model_path)
    elif args.model == 'multicore_lda':
        model = LdaMulticore(corpus=bow_corpus,
                             num_topics=args.num_topics,
                             id2word=dictionary,
                             passes=args.num_epochs,
                             eval_every=args.eval_every,
                             iterations=args.iterations,
                             eta='auto',
                             chunksize=args.batch_size,
                             workers=args.workers,
                             callbacks=callbacks,
                             log_dir=args.log_dir,
                             model_dir=model_path)
    elif args.model == 'mallet_lda':
        model = LdaMallet(args.mallet_path,
                          corpus=bow_corpus,
                          num_topics=args.num_topics,
                          id2word=dictionary,
                          workers=args.workers,
                          prefix=os.path.join(args.dump_dir, 'mallet_'),
                          iterations=args.iterations)
    elif args.model == 'gensim_lda':
        model = GensimLdaModel(corpus=bow_corpus,
                               num_topics=args.num_topics,
                               id2word=dictionary,
                               passes=args.num_epochs,
                               update_every=1,
                               eval_every=args.eval_every,
                               iterations=args.iterations,
                               alpha='auto',
                               eta='auto',
                               chunksize=args.batch_size)
    elif args.model == 'gensim_multicore_lda':
        model = GensimLdaMulticore(corpus=bow_corpus,
                                   num_topics=args.num_topics,
                                   id2word=dictionary,
                                   passes=args.num_epochs,
                                   eval_every=args.eval_every,
                                   iterations=args.iterations,
                                   eta='auto',
                                   chunksize=args.batch_size,
                                   workers=args.workers)

    model.save(model_path)

    logger.info('-' * 80)

    if args.model != 'mallet_lda':
        top_topics = model.top_topics(texts=corpus, coherence='c_v')
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / args.num_topics
        logger.info(f'Average topic coherence: {avg_topic_coherence:.4f}.')
        for topic_idx, (topic_words, topic_score) in enumerate(top_topics):
            logger.info(f'Topic #{topic_idx} ({topic_score:.4f}): ' +
                        " ".join((t[1] for t in topic_words[:5])))
        logger.info(
            f'Perplexity: {np.exp2(-model.log_perplexity(bow_corpus)):.4f}')
    else:
        pprint(model.show_topics(formatted=False))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model,
                                         texts=corpus,
                                         dictionary=dictionary,
                                         coherence=args.coherence,
                                         topn=args.topn)
    coherence_lda = coherence_model_lda.get_coherence()
    logger.info(f'Coherence : {coherence_lda:.4f}')
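
Stripped of the custom model wrappers and callbacks, the pipeline in main() boils down to a handful of stock gensim calls. A runnable toy version with an invented three-document corpus and arbitrary hyperparameters standing in for the args values (filter_extremes is skipped here because it would empty a corpus this small):

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

corpus = [["human", "interface", "computer"],
          ["survey", "user", "computer", "system"],
          ["graph", "trees", "minors", "survey"]]

dictionary = Dictionary(corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]  # bag-of-words

model = LdaModel(corpus=bow_corpus, num_topics=2, id2word=dictionary,
                 passes=5, alpha='auto', eta='auto')

cm = CoherenceModel(model=model, texts=corpus, dictionary=dictionary,
                    coherence='c_v')
print(f'Coherence: {cm.get_coherence():.4f}')
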
Example #4
            output_file_dist = os.path.join(curr_dir, f'distance_{args.smoothing}_{args.mu}')
        else:
            output_file_dist = os.path.join(curr_dir, 'distance_js')

        logging.info(f'creating the dictionary for {curr_iter}...')

        if curr_iter == 1:
            dict_file = os.path.join(curr_dir, 'dict.model')
            if os.path.exists(dict_file):
                logging.info(f'loading dictionary file from: {dict_file}')
                dictionary = Dictionary.load(dict_file)
            else:
                dictionary = Dictionary(dark_text)
                dictionary.add_documents(clean_text)

                word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')}
                _dict = Dictionary([[word] for word in word_dict.keys()])
                dictionary.merge_with(_dict)

                dictionary = filter_dict(args.vocab_size, dictionary, chain(word_dict.values(), word_dict.keys()))
                dictionary.save(dict_file)

        else:
            dict_file_prev = os.path.join(prev_dir, 'dict.model')
            dict_file = os.path.join(curr_dir, 'dict.model')
            mrr_file = os.path.join(prev_dir, 'ranking_list.csv')

            dictionary = Dictionary.load(dict_file_prev)
            word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')}
            words = get_dark_words_prev(mrr_file)
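
The underscore handling is the core trick in this snippet: every token stored as '_word' also gets its bare form registered through merge_with(), so both variants end up with ids in a single dictionary. A minimal sketch of that pattern with invented tokens (dark_text, filter_dict, and the ranking files are project-specific):

from gensim.corpora import Dictionary

dictionary = Dictionary([["_exploit", "market", "_vendor"]])

# map bare form -> underscored form for every '_'-prefixed token
word_dict = {w[1:]: w for w in dictionary.values() if w.startswith('_')}

bare = Dictionary([[w] for w in word_dict])  # one pseudo-document per bare form
dictionary.merge_with(bare)                  # adds ids for 'exploit', 'vendor'

print(sorted(dictionary.values()))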