Example no. 1
0
def cal_idf():
    """Fit TF-IDF statistics over the NLTK Brown corpus and pickle them.

    For every Brown file this builds (a) a flat list of its word tokens
    and (b) a single string with one lowercased sentence per line.  The
    strings train a scikit-learn TfidfVectorizer (pickled to
    'idf_vectorizer'); the token lists train a gensim TfidfModel
    (pickled to 'brown_tfidf').
    """
    total_wordlists = []  # per-document token lists
    doc_sents = []        # per-document newline-joined sentence strings
    for f in brown.fileids():
        print(f)
        doc_wordlist = []
        sent_lines = []
        for sent in brown.sents(fileids=[f]):
            # Lowercase for the vectorizer; join avoids quadratic '+='.
            sent_lines.append(' '.join(w.lower() for w in sent))
            # NOTE(review): tokens are kept in their original case here,
            # unlike the lowercased strings above -- confirm this
            # asymmetry between the two models is intended.
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append('\n'.join(sent_lines) + '\n')
    print('start caling tfidf')

    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(min_df=1)
    vectorizer.fit(doc_sents)
    # pickle requires a binary-mode file; 'with' guarantees it is closed.
    with open('idf_vectorizer', 'wb') as fout:
        pickle.dump(vectorizer, fout)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    with open('brown_tfidf', 'wb') as fout:
        pickle.dump(tfidf, fout)
Example no. 2
0
def train_model_by_wordlists(wordlists, num_topics=5, iterations=100, passes=10, is_tfidf=False):
    """Train a gensim LDA topic model over per-document word lists.

    Args:
        wordlists: list of token lists, one per document.
        num_topics: number of LDA topics to learn.
        iterations: gensim LDA training iterations.
        passes: number of passes over the corpus.
        is_tfidf: when True, re-weight the bag-of-words corpus with
            TF-IDF before training.

    Returns:
        The trained gensim LdaModel.
    """
    dictionary, bow_corpus = basic_utils.get_corpus_by_lists(wordlists)
    training_corpus = bow_corpus
    if is_tfidf:
        # Replace raw counts with TF-IDF weights before fitting LDA.
        training_corpus = models.TfidfModel(bow_corpus)[bow_corpus]
    return models.LdaModel(training_corpus, id2word=dictionary,
                           num_topics=num_topics, iterations=iterations,
                           passes=passes)