Python Dictionary.id2tokenの例

プログラミング言語: Python

名前空間/パッケージ名: gensim.corpora.dictionary

クラス/型: Dictionary

メソッド/関数: id2token

hotexamples.comのコード掲載数: 5

Python Dictionary.id2token - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのgensim.corpora.dictionary.Dictionary.id2tokenの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

Dictionary(30)

items(30)

save(30)

doc2bow(30)

filter_extremes(30)

load(30)

add_documents(30)

get(23)

load_from_text(19)

from_corpus(16)

doc2idx(12)

compactify(9)

save_as_text(8)

keys(6)

token2id(4)

itervalues(4)

id2token(4)

filter_tokens(3)

values(3)

merge_with(2)

num_docs(2)

num_nnz(2)

num_pos(2)

dfs(2)

iteritems(1)

from_documents(1)

filter_n_most_frequent(1)

filterExtremes(1)

patch_with_special_tokens(1)

corpus_id2orig_id(1)

corpus(1)

コード例 #1

ファイルを表示

ファイル: preprocess_single_word_lda_data.py プロジェクト: reubenharry/Compositional-Latent-Dirichlet-Allocation

def text_to_num(texts):
    """Encode cleaned texts as lists of 1-based token ids.

    Every text is passed through ``clean`` and a ``Dictionary`` is built
    from the results. The ids stored in ``token2id`` are left untouched;
    the corpus and the ``id2token`` mapping both use ``token2id[...] + 1``.

    Returns a tuple ``(corpus, token2id, id2token)`` where ``corpus`` holds
    one list of shifted ids per cleaned text.
    """
    cleaned = [clean(raw) for raw in texts]
    common_dictionary = Dictionary(cleaned)
    token2id = common_dictionary.token2id

    # Reverse mapping, keyed by the shifted (id + 1) value.
    common_dictionary.id2token = {
        token_id + 1: token for token, token_id in token2id.items()
    }

    # Same +1 shift is applied to every word of every document.
    common_corpus = [
        [token2id[word] + 1 for word in document] for document in cleaned
    ]

    return common_corpus, token2id, common_dictionary.id2token

コード例 #2

ファイルを表示

ファイル: preprocess.py プロジェクト: kensk8er/MsTweetAnalysis

def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """
    Lemmatize and filter every entry of ``corpora`` and build a dictionary from the result.

    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: mapping (iterated with ``.items()``) from an original id to the content
        handed to ``clean_text``; entries whose value is None are skipped
    :param stopwords: container of words to drop (checked with ``in``)
    :param allowed_pos: forwarded to ``lemmatize`` as ``allowed_tags``
    :param max_doc: iteration stops once more than this many entries have been visited
        (entries with a None value are counted as well)
    :param no_above: forwarded to ``dictionary.filter_extremes``
    :param no_below: forwarded to ``dictionary.filter_extremes``
    :param keep_n: forwarded to ``dictionary.filter_extremes``
    :return: the dictionary, with three extra attributes set on it:
        ``corpus_id2orig_id`` (original ids of the processed entries, in processing order),
        ``corpus`` (``doc2bow`` output for every processed entry) and
        ``id2token`` (``revdict`` applied to ``token2id``)
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        # count is incremented before the None check, so skipped entries
        # also count toward max_doc
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        # progress indicator: '\r' rewrites the same console line and the
        # trailing comma suppresses the newline (Python 2 print statement)
        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            # each token is split on '/' into exactly two parts; only the
            # first part (the word) is kept, the second (pos) is unused
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # keep only words that are not stop words, are 2 to 15 characters
        # long, and pass str.islower()
        corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        # remember which original id each processed entry came from
        corpus_id2orig_id.append(index)

    # terminate the progress line
    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    # the bag-of-words corpus is built after filtering, so it uses the
    # filtered and compactified dictionary
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary

コード例 #3

ファイルを表示

    def setUp(self):
        """Build the toy fixture shared by the tests of this module.

        The values are deliberately tiny so the expected results can be
        checked by hand; see the modules for the mathematical formulas.
        """
        self.measure = 'nlr'
        self.gamma = 1

        self.topics = [np.array([1, 2])]
        # Result from s_one_set segmentation:
        self.segmentation = [[
            (1, np.array([1, 2])),
            (2, np.array([1, 2])),
        ]]

        # Dictionary with a hand-written id -> token mapping.
        fake_dictionary = Dictionary()
        fake_dictionary.id2token = {1: 'fake', 2: 'tokens'}

        # Accumulator whose internal state is set directly instead of
        # being accumulated from texts.
        accumulator = text_analysis.InvertedIndexAccumulator(
            {1, 2}, fake_dictionary)
        accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        accumulator._num_docs = 5
        self.accumulator = accumulator

コード例 #4

ファイルを表示

ファイル: pytopia2gensim.py プロジェクト: dkorenci/doc-topic-coherence

def pytopia2gensimDict(dict_):
    """
    Create a gensim dictionary from a pytopia dictionary.

    This is necessary since building of gensim models requires a gensim
    dictionary, but pytopia model builders must be able to receive a generic
    pytopia Dictionary as parameter.

    :param dict_: pytopia dictionary (or something ``resolve`` turns into
        one); iterating it yields tokens and indexing it with a token
        yields that token's index
    :return: a ``GensimDict`` whose ``token2id``, ``id2token``, ``dfs``,
        ``num_docs``, ``num_pos`` and ``num_nnz`` are set directly, as if
        a single document containing every token once had been processed
    """
    dict_ = resolve(dict_)
    # (token, index) pairs, ordered by index
    toki = sorted(((tok, dict_[tok]) for tok in dict_), key=lambda ti: ti[1])
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    # gensim keys document frequencies by token id (not by token string),
    # so dfs must use the same ids as token2id / id2token
    gdict.dfs = {i: 1 for _, i in toki}
    gdict.num_docs = 1  # number of documents processed
    gdict.num_pos = len(toki)  # total number of corpus positions
    gdict.num_nnz = len(toki)  # total number of non-zeroes in the BOW matrix
    return gdict

コード例 #5

ファイルを表示

from gensim.models import CoherenceModel

# NOTE: LdaMallet, pd, bow_docs, ngram_docs and dictionary are not defined in
# this snippet; they must already be in scope when it runs.

# MALLET launcher and the prefix under which the wrapper writes its files.
path_to_mallet_binary = 'd:/mallet-2.0.8/bin/mallet'
output_path = 'd:/code/gc_text_analysis/mallet_output/'
num_topics = 140

# Train the MALLET LDA wrapper on the bag-of-words corpus.
model = LdaMallet(
    path_to_mallet_binary,
    corpus=bow_docs,
    workers=4,
    iterations=2000,
    num_topics=num_topics,
    id2word=dictionary,
    prefix=output_path,
)

model.save('gc_lda_model.pkl')

# Build the reverse (id -> token) mapping explicitly from token2id.
dictionary.id2token = {
    token_id: token for token, token_id in dictionary.token2id.items()
}

# Pair each token with its entry in dictionary.dfs and order the pairs by
# that count, largest first. The loop variable is named token_id rather than
# id so the builtin id() is never shadowed.
words_freq = sorted(
    ((dictionary.id2token[token_id], doc_freq)
     for token_id, doc_freq in dictionary.dfs.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])

# Score the trained model with the 'c_v' coherence measure.
coherence_model_lda = CoherenceModel(
    model=model,
    texts=ngram_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

# Top 10 words of every topic, as unformatted (non-string) entries.
topics = model.show_topics(
    num_topics=num_topics,
    num_words=10,
    log=False,
    formatted=False,
)