Example 1
def create_topic():
    # Read the corpus; one line per document
    corpus = []
    for line in open(documentfile, 'r').readlines():
        corpus.append(line.strip())
    # Convert the texts into a term-frequency matrix; element a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()
    print(X.shape)
    # LDA algorithm
    model = lda.LDA(n_topics=5, n_iter=500, random_state=1)
    model.fit(np.asanyarray(X))
    topic_word = model.topic_word_

    # print(topic_word)
    # n_top_words = 8
    # vocab = vectorizer.get_feature_names()
    # for i, topic_dist in enumerate(topic_word):
    #     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # Document-topic distribution
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))

    # Print the most likely topic for the first 10 documents
    label = []
    for n in range(10):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))

    # Plot the word distributions of the first two topics
    f, ax = plt.subplots(2, 1, figsize=(6, 6), sharex=True)
    for i, k in enumerate([0, 1]):  # two topics
        ax[i].stem(topic_word[k, :],
                   linefmt='b-',
                   markerfmt='bo',
                   basefmt='w-')
        ax[i].set_xlim(-2, 20)
        ax[i].set_ylim(0, 1)
        ax[i].set_ylabel("Prob")
        ax[i].set_title("topic {}".format(k))
    ax[1].set_xlabel("word")
    plt.tight_layout()
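A few shape and normalization checks could be appended at the end of create_topic. This is only a sketch, relying on the local names above (X, topic_word, doc_topic) and on the fact that the lda package returns row-normalized probability matrices:

# Sketch: sanity checks on the fitted model (names taken from create_topic above)
assert topic_word.shape == (5, X.shape[1])        # one word distribution per topic
assert doc_topic.shape == (X.shape[0], 5)         # one topic distribution per document
assert np.allclose(topic_word.sum(axis=1), 1.0)   # each topic sums to 1 over the vocabulary
assert np.allclose(doc_topic.sum(axis=1), 1.0)    # each document sums to 1 over the topics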
Example 2
    def test_lda_zero_iter(self):
        dtm = self.dtm
        model = self.model
        doc_topic = self.doc_topic
        n_topics = self.n_topics
        random_seed = self.random_seed

        # fit a new model with 0 iterations
        n_iter = 0
        model_new = lda.LDA(n_topics=n_topics,
                            n_iter=n_iter,
                            random_state=random_seed)
        doc_topic_new = model_new.fit_transform(dtm)
        self.assertIsNotNone(model_new)
        self.assertIsNotNone(doc_topic_new)
        self.assertLess(model_new.loglikelihood(), model.loglikelihood())
        self.assertFalse((doc_topic_new == doc_topic).all())
Example 3
 def train_lda_model(self, n_topics):
     """
     function: train the LDA model
     :param n_topics: number of topics
     :return: words_in_topic, the top words per topic, and perplexity
     """
     model = lda.LDA(n_topics=n_topics, n_iter=self.n_iter, random_state=1)
     model.fit(self.vsm_model)  # fit on the VSM (document-term) matrix
     topic_word = model.topic_word_
     loglikelihood = model.loglikelihoods_
     perplexity = loglikelihood.pop() * (-1.0) / self.vocabulary.__len__() * self.n_topics
     n_top_words = self.n_top_words  # number of top words to keep per topic
     words_in_topic = dict()
     for i, topic_dict in enumerate(topic_word):
         topic_words = np.array(self.vocabulary)[np.argsort(topic_dict)][:-(n_top_words+1):-1]
         words_in_topic[i] = topic_words
     return words_in_topic, perplexity
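The perplexity above is a custom heuristic built from the final log-likelihood. For comparison, the conventional corpus perplexity divides the negative log-likelihood by the total token count; a sketch, assuming vsm_model is the document-term count matrix and model is the fitted LDA instance from train_lda_model:

import numpy as np

n_tokens = np.asarray(vsm_model).sum()                  # total number of word occurrences
perplexity = np.exp(-model.loglikelihood() / n_tokens)  # standard LDA corpus perplexity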
Example 4
def test_get_topic_word_relevance(dtm, n_topics, lambda_):
    dtm = np.array(dtm)
    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    doc_lengths = tmtoolkit.bow.bow_stats.get_doc_lengths(dtm)

    rel_mat = model_stats.get_topic_word_relevance(model.topic_word_,
                                                   model.doc_topic_,
                                                   doc_lengths, lambda_)

    assert rel_mat.shape == (n_topics, dtm.shape[1])
    assert all(
        isinstance(x, float) and not np.isnan(x) for x in rel_mat.flatten())
Example 5
def ldaModel(texts, topics, iters, nWords, documents):
    #vocab, dtm = getVocab(texts, documents)
    bi_dtm, bi_reducedTextDic, bi_vocab = bagOfWords(texts, documents, True, 0,
                                                     False, False)
    uni_dtm, uni_reducedTextDic, uni_vocab = bagOfWords(
        texts, documents, False, 0, False, False)

    dtm = []
    for sub in range(len(uni_dtm)):
        dtm.append(np.concatenate((uni_dtm[sub], bi_dtm[sub])))
    dtm = np.asarray(dtm)
    vocab = uni_vocab + bi_vocab

    dtm = bi_dtm
    vocab = bi_vocab
    # limit to those that appear in TDs at least once?
    mean_occ = np.mean(dtm, axis=0)
    cleaned_dtm = []
    for sub in range(len(dtm)):
        temp_cleaned = []
        for occ in range(len(mean_occ)):
            if mean_occ[occ] >= 0:
                temp_cleaned.append(dtm[sub][occ])
        cleaned_dtm.append(np.asarray(temp_cleaned))
    cleaned_dtm = np.asarray(cleaned_dtm)

    dtm = cleaned_dtm

    model = lda.LDA(n_topics=topics, n_iter=iters, random_state=1)
    model.fit(dtm)
    topic_word = model.topic_word_
    n_top_words = nWords
    topic_words = {}
    for i, topic_dist in enumerate(topic_word):
        topic_words[i] = np.array(vocab)[np.argsort(
            topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words[i])))
    doc_topic = model.doc_topic_

    for i in range(len(texts)):
        print("{} (top topic: {})".format(texts[i], doc_topic[i].argmax()))

    probs = np.array(doc_topic)
    meanProbs = np.mean(probs, axis=0)

    return topic_words, meanProbs, probs, vocab, dtm
Example 6
def main():
    print 'start getting vocabulary'
    if os.path.exists('../data/index_to_word.json') and os.path.exists('../data/word_to_index.json'):
        index_to_word, word_to_index = load_word_index()
    else:
        init_word_index()
        index_to_word, word_to_index = load_word_index()
    print 'finish getting vocabulary'

    print 'start getting doc size'
    doc_num = doc_count()
    print 'finish getting doc size'

    word_num = len(index_to_word)

    print 'start generating train data'
    X = sparse.lil_matrix((doc_num, word_num), dtype=np.int32)
    # with open('../data/arxiv_word_category_nltk.csv', 'rb') as fin:
    #     fin.readline()
    #     reader = csv.reader(fin)
    #     for i, (paper_id, words, category) in enumerate(reader):
    #         words = json.loads(words)
    #         for w in words:
    #             w = w.lower()
    #             if w in word_to_index:
    #                 X[i, word_to_index[w]] += 1
    with open('../data/arxiv_categories_words_fasttext.txt', 'rb') as fin:
        for i, line in enumerate(fin):
            words = line.split()
            for w in words:
                if not w.startswith('__label__'):
                    w = w.lower()
                    if w in word_to_index:
                        X[i, word_to_index[w]] += 1
    print 'finish generating train data'

    print 'start training'
    model = lda.LDA(n_topics=50)
    model.fit(X)
    print 'finish training'

    print 'start saving result'
    np.save('topic_word.np', model.topic_word_)
    np.save('doc_topic.np', model.doc_topic_)
    print 'finish saving result'
Example 7
def lda_generate_model(headlines, bodies):

    X = []

    # get train data
    templist = []
    # clean_headlines = []
    # clean_bodies = []
    # for headline in headlines:
    #     clean_headlines.append(clean(headline))
    # for body in bodies:
    #     clean_bodies.append(clean(body))
    clean_headlines = headlines
    clean_bodies = bodies

    # get test data
    test_stances = test.test
    test_dataset = data
    test_headlines, test_bodies = [], []
    for stance in test_stances:  #test has not been cleaned
        test_headlines.append(stance['Headline'])
        test_bodies.append(test_dataset.body[stance['Body ID']])

    # add train & test
    clean_headlines = list(set(clean_headlines))
    clean_bodies = list(set(clean_bodies))
    test_headlines = list(set(test_headlines))
    templist = clean_headlines + clean_bodies + test_headlines

    cv = CountVectorizer()
    cv_fit = cv.fit_transform(templist)
    cv_fit = cv_fit.toarray()

    model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(cv_fit)

    # get lda dict
    vec_dict = OrderedDict()
    doc_topic = model.doc_topic_
    i = 0
    for i in range(len(templist)):
        vec_dict[templist[i]] = doc_topic[i]

    print("lda_generate_model complete!")
    return vec_dict
Example 8
 def infer_topics(self, num_topics=10, algorithm='variational', **kwargs):
     self.nb_topics = num_topics
     lda_model = None
     topic_document = None
     if algorithm == 'variational':
         lda_model = LDA(n_topics=num_topics, learning_method='batch')
         topic_document = lda_model.fit_transform(
             self.corpus.sklearn_vector_space)
     elif algorithm == 'gibbs':
         lda_model = lda.LDA(n_topics=num_topics, n_iter=500)
         topic_document = lda_model.fit_transform(
             self.corpus.sklearn_vector_space)
     else:
         raise ValueError(
             "algorithm must be either 'variational' or 'gibbs', got '%s'" %
             algorithm)
     self.topic_word_matrix = []
     self.document_topic_matrix = []
     vocabulary_size = len(self.corpus.vocabulary)
     row = []
     col = []
     data = []
     for topic_idx, topic in enumerate(lda_model.components_):
         for i in range(vocabulary_size):
             row.append(topic_idx)
             col.append(i)
             data.append(topic[i])
     self.topic_word_matrix = coo_matrix(
         (data, (row, col)),
         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     row = []
     col = []
     data = []
     doc_count = 0
     for doc in topic_document:
         topic_count = 0
         for topic_weight in doc:
             row.append(doc_count)
             col.append(topic_count)
             data.append(topic_weight)
             topic_count += 1
         doc_count += 1
     self.document_topic_matrix = coo_matrix(
         (data, (row, col)),
         shape=(self.corpus.size, self.nb_topics)).tocsr()
Example 9
def lda_reduction(dataArray, k, get="feature-latent"):
    #print  ("dataArray", dataArray.shape)
    sparseDataArray = lil_matrix(dataArray)

    model = lda.LDA(n_topics=k, n_iter=200)
    model.fit(sparseDataArray)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    doc_topic = model.doc_topic_
    # print ("topic_word", topic_word.shape)
    # print ("dpc_topic", doc_topic.shape)

    if get == "feature-latent":
        #print ("topic_word:", topic_word.shape)
        #print ("doc_topic:", doc_topic.shape)
        #return topic_word
        return np.matmul(dataArray.transpose(), doc_topic)
    else:
        return np.matmul(dataArray, topic_word.transpose())
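A hypothetical usage sketch for lda_reduction: the random count matrix only stands in for real data, and any get value other than "feature-latent" falls through to the document-level projection.

import numpy as np

X = np.random.randint(0, 5, size=(20, 50))            # 20 documents x 50 features
feat_latent = lda_reduction(X, k=3)                   # (50, 3): feature-by-topic weights
doc_latent = lda_reduction(X, k=3, get="doc-latent")  # (20, 3): document-by-topic weights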
Example 10
def test_save_load_ldamodel_pickle():
    pfile = 'tests/data/test_pickle_unpickle_ldamodel.pickle'

    dtm = np.array([[0, 1], [2, 3], [4, 5], [6, 0]])
    doc_labels = ['doc_' + str(i) for i in range(dtm.shape[0])]
    vocab = ['word_' + str(i) for i in range(dtm.shape[1])]

    model = lda.LDA(2, n_iter=1)
    model.fit(dtm)

    lda_utils.common.save_ldamodel_to_pickle(pfile, model, vocab, doc_labels)

    unpickled = lda_utils.common.load_ldamodel_from_pickle(pfile)

    assert np.array_equal(model.doc_topic_, unpickled['model'].doc_topic_)
    assert np.array_equal(model.topic_word_, unpickled['model'].topic_word_)
    assert vocab == unpickled['vocab']
    assert doc_labels == unpickled['doc_labels']
Example 11
def lda_location():
    corpus = position_list

    # Convert the texts into a term-frequency matrix; element a[i][j] is the count of word j
    # in document i. sklearn's CountVectorizer handles the tokenization.
    vectorizer = CountVectorizer()
    # Count the frequency of each word in each document
    x = vectorizer.fit_transform(corpus)

    # toarray() and todense() give the same result here: the word counts laid out against the vocabulary.
    weight = x.toarray()
    model = lda.LDA(n_topics=10, n_iter=500, random_state=1)
    model.fit(
        numpy.asarray(weight))  # model.fit_transform(X) is also available
    # Document-topic distribution
    doc_topic = model.doc_topic_
    a = doc_topic
    numpy.savetxt('C:/Users/MyPC/Desktop/doc_location.csv', a,
                  delimiter=',')  # save the resulting document-topic distribution
Example 12
 def lda_topic_models(self, num_topics, num_iter, min_occ, docs):
     """ Extract LDA topic models """
     cvectorizer = CountVectorizer(min_df=min_occ, stop_words="english")
     cvz = cvectorizer.fit_transform(docs)
     lda_model = lda.LDA(n_topics=num_topics, n_iter=num_iter)
     X_topics = lda_model.fit_transform(cvz)
     _lda_keys = []
     for i in xrange(X_topics.shape[0]):
         _lda_keys.append(X_topics[i].argmax())
     topic_summaries = []
     topic_word = lda_model.topic_word_  # all topic words
     n_top_words = 5
     vocab = cvectorizer.get_feature_names()
     for i, topic_dist in enumerate(topic_word):
         topic_words = np.array(vocab)[np.argsort(
             topic_dist)][:-(n_top_words + 1):-1]  # get!
         topic_summaries.append(' '.join(topic_words))
     return topic_summaries
Example 13
def getItemTopic(trainset):

    item_review_df = getItemReview(trainset)
    doc_clean_set = [doc_clean(doc) for doc in item_review_df['reviews']]
    corpus, diction = getDict(doc_clean_set, len(diction))
    X = sparse2dense(corpus, diction)

    model = lda.LDA(n_topics=5, n_iter=20, random_state=1)
    model.fit(X)
    doc_topic = model.doc_topic_
    doc_topic_df = pd.DataFrame(doc_topic)

    # Build the item_vector dictionary
    item_id_l = list(item_review_df['item'])
    doc_topic_l = doc_topic_df.as_matrix().tolist()

    item_vector_dict = dict(zip(item_id_l, doc_topic_l))
    return item_vector_dict
Example 14
 def fitLDA(self, nTopics, nTopWords):  #Fit LDA model
     topicsList = []
     tdm = textmining.TermDocumentMatrix(
         tokenizer=textmining.simple_tokenize_remove_stopwords)
     for index, row in self.typeData.iterrows():
         if isinstance(row["Title/Description"], basestring):
             tdm.add_doc(row["Title/Description"])
     temp = list(tdm.rows(cutoff=1))
     vocab = tuple(temp[0])
     X = np.array(temp[1:])
     self.model = lda.LDA(n_topics=nTopics, n_iter=500, random_state=1)
     self.model.fit_transform(X)
     topicWord = self.model.topic_word_  # model.components_ also works
     topWords = nTopWords
     for i, topic_dist in enumerate(topicWord):
         topicWords = np.array(vocab)[np.argsort(topic_dist)][:-topWords:-1]
         topicsList.append(topicWords)
     return topicsList
Example 15
def test_get_marginal_topic_distrib(dtm, n_topics):
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    doc_lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    marginal_topic_distr = model_stats.marginal_topic_distrib(model.doc_topic_, doc_lengths)

    assert marginal_topic_distr.shape == (n_topics,)
    assert np.isclose(marginal_topic_distr.sum(), 1.0)
    assert all(0 <= v <= 1 for v in marginal_topic_distr)
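For reference, a sketch of the quantity this test exercises, under the assumption that tmtoolkit defines the marginal topic distribution as the document-length-weighted average of the rows of doc_topic_:

# p(d) is proportional to document length; p(t) = sum_d p(t | d) * p(d)
p_d = doc_lengths / doc_lengths.sum()
manual_distr = (model.doc_topic_ * p_d[:, np.newaxis]).sum(axis=0)
# manual_distr should agree with marginal_topic_distr up to floating-point error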
Example 16
def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
    dtm = np.array(dtm)
    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    vocab = np.array([chr(65 + i) for i in range(dtm.shape[1])
                      ])  # this only works for few words
    doc_lengths = tmtoolkit.bow.bow_stats.get_doc_lengths(dtm)

    topic_labels = model_stats.generate_topic_labels_from_top_words(
        model.topic_word_,
        model.doc_topic_,
        doc_lengths,
        vocab,
        lambda_=lambda_)
    assert isinstance(topic_labels, list)
    assert len(topic_labels) == n_topics

    for i, l in enumerate(topic_labels):
        assert isinstance(l, six.string_types)
        parts = l.split('_')
        assert len(parts) >= 2
        assert int(parts[0]) == i + 1
        assert all(w in vocab for w in parts[1:])

    topic_labels_2 = model_stats.generate_topic_labels_from_top_words(
        model.topic_word_,
        model.doc_topic_,
        doc_lengths,
        vocab,
        lambda_=lambda_,
        n_words=2)
    assert isinstance(topic_labels_2, list)
    assert len(topic_labels_2) == n_topics

    for i, l in enumerate(topic_labels_2):
        assert isinstance(l, six.string_types)
        parts = l.split('_')
        assert len(parts) == 3
        assert int(parts[0]) == i + 1
        assert all(w in vocab for w in parts[1:])
Example 17
def run_tm_and_dump(cand_year, files):
    db = get_cands_data('thesis_db.xls', DATA_LEN)
    engLines = get_translated_text("Translated_text.txt")

    engLines = engLines[:DATA_LEN]

    index = 0
    reviewed_cands = []
    cand_ids = []
    index2cand = {}
    run_text = []
    errors = [0, 0, 0, 0, 0, 0]
    for line in engLines:
        this_cand_id = db.ID_coded[index]
        if this_cand_id not in reviewed_cands:
            reviewed_cands.append(this_cand_id)
            cand = get_cand(db, engLines, index, [cand_year], errors)
            if cand is not None:
                run_text.append(line)
                cand_ids.append(cand.id)
                index2cand[index] = cand
        index = index + 1

    lem_text = get_data_lemmatized(run_text)
    id2word, corpus = text2corpus(lem_text)
    X2 = corpus2nparray(corpus, id2word)

    users, words, cands = get_users_and_words(db, engLines, cand_year,
                                              files[0])
    X = get_np_array(db, engLines, users, words, cand_year)

    model = lda.LDA(n_topics=N, n_iter=ITERATIONS, random_state=1)
    model.fit(X2)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 8

    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(words)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    #dump_tm(db, model.doc_topic_, users, cands, files)
    rr = 6
Example 18
def lda_model(doc, topic, iterator=500):
    # Returns the vocabulary list and the trained lda model
    word_set = set()
    print("Building the document-word matrix")
    # First build the vocabulary
    for d in doc:
        document = doc[d]
        document_word_list = document.split(" ")
        for w in document_word_list:
            word_set.add(w)

    # Build the document matrix
    N = len(doc)
    V = len(word_set)
    data = []
    word_list = list(word_set)
    for d in doc:
        """
        Store (word index, count) tuples for the words present in each document.
        N is the number of documents, V the vocabulary size;
        the tuples are later expanded into an N*V numpy matrix.
        """
        document = doc[d]
        document_word_list = document.split(" ")
        simple_list = []

        # count of each vocabulary word in this document
        for i in range(len(word_list)):
            c = document_word_list.count(word_list[i])
            if c > 0:
                simple_list.append((i, document_word_list.count(word_list[i])))
        data.append(tuple(simple_list))

    # Build the matrix
    dtm = np.zeros((N, V), dtype=np.intc)
    for i, doc in enumerate(data):
        for v, cnt in doc:
            np.testing.assert_equal(dtm[i, v], 0)  # sanity check against double-writing
            dtm[i, v] = cnt

    print("训练lda模型")
    model = lda.LDA(n_topics=topic, n_iter=iterator, random_state=1)
    model.fit(dtm)
    return word_list, model
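A minimal usage sketch with a hypothetical toy document dict; topic and iterator are kept small only so the example runs quickly:

docs = {
    "d1": "apple banana apple",
    "d2": "banana cherry banana",
    "d3": "apple cherry cherry",
}
vocab, model = lda_model(docs, topic=2, iterator=50)
print(vocab)                    # the word list backing the matrix columns
print(model.doc_topic_.shape)   # (3, 2): one topic distribution per document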
Example 19
    def score(self, source_corpus, target_corpus, weighting=None, pool=None):
        start = time.time()
        self.vector_extractor.estimate_idf(source_corpus, target_corpus)
        print "IDF extimation took %s seconds" % (time.time() - start)
        start = time.time()
        if self.lda_dim > 0:
            import lda
            doc_matrix = self.vector_extractor.extract(source_corpus +
                                                       target_corpus)
            lda_model = lda.LDA(n_topics=self.lda_dim,
                                n_iter=1500,
                                random_state=1,
                                refresh=200)
            lda_model.fit(doc_matrix.astype(int))
            del doc_matrix

        source_matrix = self.vector_extractor.extract(source_corpus)
        target_matrix = self.vector_extractor.extract(target_corpus)

        if self.lda_dim > 0:
            source_matrix = source_matrix.astype(int)
            target_matrix = target_matrix.astype(int)
            source_matrix = lda_model.transform(source_matrix)
            target_matrix = lda_model.transform(target_matrix)

        print "Extraction took %s seconds" % (time.time() - start)
        print "Nonzero source: ", len(source_matrix.nonzero()[0])
        print "Nonzero target: ", len(target_matrix.nonzero()[0])
        print "< 0 source: ", type(source_matrix).sum(source_matrix < 0)
        print "< 0 target: ", type(target_matrix).sum(target_matrix < 0)

        start = time.time()
        del self.vector_extractor
        n_jobs = 1
        if pool is not None:
            n_jobs = len(pool._pool)
        sys.stderr.write("Scoring using %s and %d jobs\n" %
                         (self.metric, n_jobs))
        d = 1 - pairwise_distances(
            source_matrix, target_matrix, metric=self.metric, n_jobs=n_jobs)
        # should not happen since tf-idf entries are non-negative
        print "< 0 d: ", np.sum(d < 0)
        print "Scoring took %s seconds" % (time.time() - start)
        return d
Example 20
def _test_LDA(l, path1, file='', data_samples=[], term=0):
    n_topics = 10
    n_top_words = 10
    if term == 7:
        n_top_words = 10
    elif term == 50:
        n_top_words = 100
    elif term == 100:
        n_top_words = 1000
    elif term == 200:
        n_top_words = 10000
    elif term == 400:
        n_top_words = 10000

    fileB = []
    fileB.append(file)
    #filepath = '/home/amrit/GITHUB/Pits_lda/dataset/'
    topics = []
    for j, file1 in enumerate(fileB):
        for i in range(10):
            #data_samples = readfile1(filepath + str(file1))

            # shuffling the list
            shuffle(data_samples)

            tf_vectorizer = CountVectorizer(max_df=0.95,
                                            min_df=2,
                                            stop_words='english')
            tf = tf_vectorizer.fit_transform(data_samples)

            lda1 = lda.LDA(n_topics=int(l[0]), alpha=l[1], eta=l[2], n_iter=10)

            lda1.fit_transform(tf)

            # print("done in %0.3fs." % (time() - t0))
            tf_feature_names = tf_vectorizer.get_feature_names()
            topics.extend(
                get_top_words(lda1,
                              path1,
                              tf_feature_names,
                              n_top_words,
                              i=i,
                              file1=file1))
    return topics
Example 21
def get_train_test_lda(topic):
    model = VGG16(include_top=False, pooling='avg')

    x_train, y_train, x_test, y_test = load()

    x_train = x_train.astype('float32')
    x_train /= 255

    y_train = y_train.astype('int64')

    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')

    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    # X_train = model.predict(x_train)
    # X_test = model.predict(x_test)

    for k in topic:
        X_iter = X_train

        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic

        x = x2
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))

        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)

        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))

    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)
Example 22
def get_topic_labels(n_topics, n_top_words, n_cand_labels, label_min_df,
                     n_labels, lda_random_state, lda_n_iter):
    """
    Refer the arguments to `create_parser`
    """
    print("Loading docs and preprocessing (cvalue etc) for lda input...")
    # docs = get_lda_input_from_corpus_folder(CORPUS_PATH)
    # docs = load_line_corpus(corpus_path)
    docs = pickle.load(open('./data/lda_input_docs_finalized.pickle', 'rb'))

    print("Generate candidate bigram labels(with POS filtering)...")
    finder = BigramLabelFinder(min_freq=label_min_df)
    cand_labels = finder.find(docs, top_n=n_cand_labels)

    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculate the PMI scores...")

    pmi_cal = PMICalculator(doc2word_vectorizer=WordCountVectorizer(
        min_df=5, stop_words=load_stopwords(STOP_WORDS_FILES)),
                            doc2label_vectorizer=LabelCountVectorizer())

    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics,
                    n_iter=lda_n_iter,
                    random_state=lda_random_state)
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        top_word_ids = np.argsort(topic_dist)[:-n_top_words:-1]
        topic_words = [pmi_cal.index2word_[id_] for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(topic_models=model.topic_word_,
                               pmi_w2l=pmi_w2l,
                               index2label=pmi_cal.index2label_,
                               label_models=None,
                               k=n_labels)
Example 23
def create_topic_vectors(path_json, save_path):    
    files, vectorizer, count_matrix, datas = convert_jsoncorpus_to_count_matrix(path_json)
    vocab = vectorizer.get_feature_names()

    print "Number of feature: ", len(vectorizer.get_feature_names())
    print "Features:", len(vocab), vocab[2500:2510], "..."
    print "Size of count matrix: ", count_matrix.shape
    print "Number of files: ", len(files)

    model = lda.LDA(n_topics=10, n_iter=2000, random_state=1)
    model.fit(count_matrix)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_
    topic_vectors = [topic_dist for i, topic_dist in enumerate(topic_word)]

    with open(save_path, 'wb') as handle:
        pickle.dump([model, topic_vectors, vocab], handle)

    print "Topic vectors and vocab saved !!!"
Example 24
def exec_lda(mtx_lda, vocab_set, topics, words, iterations, path='./topic'):
    import numpy
    ft = io.open(path, 'w', encoding='utf8')
    model = lda.LDA(n_topics=topics, n_iter=iterations, random_state=1)
    model.fit(mtx_lda)
    topic_word = model.topic_word_
    n_top_words = words
    print topic_word
    for i, topic_dist in enumerate(topic_word):
        try:
            topic_words = numpy.array(vocab_set)[numpy.argsort(topic_dist)][:-(n_top_words + 1):-1]
        except IndexError as e:
            print (str(e))
        else:
            words = u''
            for word in topic_words:
                words += word
                words += ' '
            ft.write(u'{} {}\n'.format(i, words))
Example 25
def _test_LDA(l, path1, file='', data_samples=[], target=[]):
    n_topics = 10
    n_top_words = 10

    fileB = []
    fileB.append(file)
    #filepath = '/home/amrit/GITHUB/Pits_lda/dataset/'
    topics = []
    data = data_samples
    tar = target
    x = list(xrange(len(data_samples)))
    for j, file1 in enumerate(fileB):
        for i in range(10):
            #data_samples = readfile1(filepath + str(file1))

            # shuffling the list
            shuffle(x)
            data = [data[k] for k in x]
            tar = [tar[k] for k in x]

            tf_vectorizer = CountVectorizer(max_df=0.95,
                                            min_df=2,
                                            stop_words='english')
            tf = tf_vectorizer.fit_transform(data)

            lda1 = lda.LDA(n_topics=int(l[0]),
                           alpha=l[1],
                           eta=l[2],
                           n_iter=200)

            lda1.fit_transform(tf)
            tops = lda1.doc_topic_
            topic_word = lda1.topic_word_

            tf_feature_names = tf_vectorizer.get_feature_names()
            topics.extend(
                get_top_words(lda1,
                              path1,
                              tf_feature_names,
                              n_top_words,
                              i=i,
                              file1=file1))
    return topics, tops, topic_word, tf_feature_names, tar
Example 26
def LDAModel(train_count, vocab):
    # Topic modeling using LDA
    lda_model = lda.LDA(n_topics=10, n_iter=400)
    train_topics = lda_model.fit_transform(train_count)

    # Get a map between each user and the topic they most likely belong to
    _lda_keys = []
    for i in range(train_topics.shape[0]):
        _lda_keys.append(train_topics[i].argmax())

    n_top_words = 5
    topic_summaries = []
    topic_word = lda_model.topic_word_  # all topic words
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]  # get!
        topic_summaries.append(' '.join(topic_words))  # append!

    return (lda_model, train_topics, _lda_keys, topic_summaries)
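A hypothetical usage sketch; the random count matrix and the term_* vocabulary are made-up stand-ins for real user-term counts:

import numpy as np

rng = np.random.RandomState(0)
train_count = rng.randint(0, 4, size=(50, 30))   # 50 users x 30 terms
train_count[:, 0] += 1                           # ensure no all-zero rows
vocab = ['term_%d' % i for i in range(30)]

lda_model, train_topics, lda_keys, topic_summaries = LDAModel(train_count, vocab)
print(topic_summaries[0])                        # top terms of topic 0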
Example 27
def lda_out(doc_term_mat, vocab, directory, outfile_prefix, num_topics,
            n_top_words, date_range):
    model = lda.LDA(n_topics=num_topics, n_iter=1500, random_state=1)
    model.fit(doc_term_mat)
    topic_word = model.topic_word_
    tpc_wds_file = directory + outfile_prefix + 'tpc_wds' + date_range + '.mat'
    doc_tpc_file = directory + outfile_prefix + 'doc_tpc' + date_range + '.mat'

    matrix_dump(topic_word, tpc_wds_file)
    matrix_dump(model.doc_topic_, doc_tpc_file)

    with open(
            directory + outfile_prefix + str(num_topics) + date_range + '.txt',
            'w+') as f:
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(
                sorted(vocab))[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {0} : {1}\n'.format(
                i, ', '.join(topic_words).encode("utf-8")))
Example 28
def test_get_word_distinctiveness(dtm, n_topics):
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    doc_lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    p_t = model_stats.marginal_topic_distrib(model.doc_topic_, doc_lengths)

    w_distinct = model_stats.word_distinctiveness(model.topic_word_, p_t)

    assert w_distinct.shape == (dtm.shape[1],)
    assert all(v > -1e10 for v in w_distinct)
Example 29
 def fit_models(self, k_list, n_iter=500):
     """
     Fits one LDA model per k in k_list to X, using the <lda> module.
     
     k_list = [10, 20, 25, ..., 90]
     """
     self.k_list = k_list
     self.topics_n = sum(k_list)
     models_k = reduce(lambda x, y: x + y, [[k] * k for k in self.k_list])
     for i in k_list:
         for j in range(0, i):
             self.topic_labels.append(str(i) + "-" + str(j + 1))
     self.models_matrix = np.matrix([0] * len(self.features))
     for k in k_list:
         model = lda.LDA(n_topics=k, n_iter=n_iter, random_state=1)
         model.fit(self.X)
         self.models_list.append(model)
         self.models_matrix = np.vstack((self.models_matrix, model.nzw_))
     self.models_matrix = self.models_matrix[1:]
Example 30
    def test_lda_random_seed(self):
        dtm = self.dtm
        doc_topic = self.doc_topic
        n_iter = self.n_iter
        n_topics = self.n_topics
        random_seed = self.random_seed
        random_state = self.model.random_state

        # refit model with same random seed and verify results identical
        model_new = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_seed)
        rands_init = model_new._rands.copy()
        doc_topic_new = model_new.fit_transform(dtm)
        rands_fit = model_new._rands.copy()
        random_state_new = model_new.random_state
        np.testing.assert_array_equal(doc_topic_new, doc_topic)
        np.testing.assert_array_equal(random_state_new, random_state)

        # verify random variates are not changed
        np.testing.assert_array_equal(rands_init, rands_fit)