Example #1
def main():

    lda0 = models.LdaModel.load('./timewindow_in3/_1999-2000lda_model')
    lda1 = models.LdaModel.load('./timewindow_in3/_2000-2001-2002lda_model')
    lda2 = models.LdaModel.load('./timewindow_in3/_2002-2003-2004lda_model')
    lda3 = models.LdaModel.load('./timewindow_in3/_2004-2005-2006lda_model')
    lda4 = models.LdaModel.load('./timewindow_in3/_2006-2007-2008lda_model')
    lda5 = models.LdaModel.load('./timewindow_in3/_2008-2009-2010lda_model')
    lda6 = models.LdaModel.load('./timewindow_in3/_2010-2011-2012lda_model')
    lda7 = models.LdaModel.load('./timewindow_in3/_2012-2013-2014lda_model')
    lda8 = models.LdaModel.load('./timewindow_in3/_2014-2015-2016lda_model')
    lda9 = models.LdaModel.load('./timewindow_in3/_2016-2017lda_model')

    LDA_list = [lda0, lda1, lda2, lda3, lda4, lda5, lda6, lda7, lda8, lda9]

    corpus_exam0 = corpora.BleiCorpus("./timewindow_in3/corpus_1999-2000.blei")
    corpus_exam1 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2000-2001-2002.blei")
    corpus_exam2 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2002-2003-2004.blei")
    corpus_exam3 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2004-2005-2006.blei")
    corpus_exam4 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2006-2007-2008.blei")
    corpus_exam5 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2008-2009-2010.blei")
    corpus_exam6 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2010-2011-2012.blei")
    corpus_exam7 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2012-2013-2014.blei")
    corpus_exam8 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2014-2015-2016.blei")
    corpus_exam9 = corpora.BleiCorpus("./timewindow_in3/corpus_2016-2017.blei")

    corpus_list = [
        corpus_exam0, corpus_exam1, corpus_exam2, corpus_exam3, corpus_exam4,
        corpus_exam5, corpus_exam6, corpus_exam7, corpus_exam8, corpus_exam9
    ]

    # for (lda, corpus) in zip(LDA_list, corpus_list):
    #     # print num_doc_per_topic(corpus, lda)
    #     print lda
    #     print doc_topic_mat(corpus, lda)
    list_all = []
    for i in range(10):
        # print i
        list_t = doc_topic_mat(corpus_list[i], LDA_list[i])
        # print list_t
        list_all.append(list_t)
    list_np = np.array(list_all)

    # print(type(list_all), type(list_all[0]), type(list_all[0][0]))
    print("******************************************************")
Example #2
def train(reviewDict, k):
    '''Feed reviews to the LDA model using k topics'''

    # Build the dictionary once: it serves both as the id2word mapping and
    # for converting each review to bag-of-words.
    id2word = corpora.Dictionary(reviewDict[review]["review_nouns"]
                                 for review in reviewDict)
    # id2word.filter_extremes(keep_n=10000)
    # id2word.compactify()
    id2word.save('lda/dictionary.dict')

    corpus = [
        id2word.doc2bow(reviewDict[review]["review_nouns"])
        for review in reviewDict
    ]
    corpora.BleiCorpus.serialize('lda/corpus.lda-c', corpus)
    corpus = corpora.BleiCorpus('lda/corpus.lda-c')

    # Train and save a model named after the topic count (k=25 and k=50 were
    # the cases handled by the original branches; any other k previously left
    # lda undefined).
    lda = gensim.models.LdaModel(corpus, num_topics=k, id2word=id2word)
    lda.save('lda/lda_{}_topics.lda'.format(k))

    return lda
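
A minimal usage sketch for train(), assuming reviewDict maps review ids to dicts carrying a "review_nouns" token list (the structure the function indexes) and that the lda/ output directory already exists:

reviews = {
    "r1": {"review_nouns": ["pizza", "crust", "service"]},
    "r2": {"review_nouns": ["burger", "fries", "service"]},
}
lda = train(reviews, 25)  # writes lda/dictionary.dict, lda/corpus.lda-c, lda/lda_25_topics.lda
print(lda.show_topics(num_topics=2))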
Example #3
def main():
    global REVIEW_DICT
    REVIEW_DICT = reviewData

    # Check if the folder for the lda model exists; if it doesn't, create it
    if not os.path.exists('lda'):
        os.makedirs('lda')

    train(lemmatize(REVIEW_DICT), 25)

    # Get all 25 topics using K=25
    WEIGHT_TOPIC = []

    dictionary_path = "lda/dictionary.dict"
    corpus_path = "lda/corpus.lda-c"
    lda_model_path = "lda/lda_25_topics.lda"

    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel.load(lda_model_path)

    # Print topics
    TOPIC_DICT = dict(lda.show_topics(num_topics=25))

    for topicN, topicWeights in TOPIC_DICT.items():
        print('Topic ' + str(topicN) + ' : \n' + str(topicWeights) + '\n')
Example #4
    def load_corpus(self, override_corpus=True):
        # Map each supported corpus format to its gensim corpus class;
        # anything else falls back to the Matrix Market format.
        corpus_classes = {
            'svmlight': corpora.SvmLightCorpus,  # Joachims' SVMlight format
            'lda-c': corpora.BleiCorpus,         # Blei's LDA-C format
            'low': corpora.LowCorpus,            # GibbsLDA++ format
        }
        corpus_class = corpus_classes.get(self.corpus_format, corpora.MmCorpus)
        try:
            c = corpus_class(fname=(DOCUMENT_PATH + self.filename +
                                    '_corpus.' + self.corpus_format))
            if override_corpus is True:
                self.save_corpus(c)
        except Exception:
            # As in the original, fall back when the file cannot be loaded;
            # note this passes the bare class, not an instance, to save_corpus.
            c = corpus_class
            self.save_corpus(c)
        return c
Example #5
def main():

    lda1 = models.LdaModel.load('./period_static/_buddinglda_model')
    lda2 = models.LdaModel.load('./period_static/_growinglda_model')
    lda3 = models.LdaModel.load('./period_static/_maturelda_model')
    lda_list = [lda1, lda2, lda3]

    corpus0 = corpora.BleiCorpus("./period_static/corpus_budding.blei")
    corpus1 = corpora.BleiCorpus("./period_static/corpus_growing.blei")
    corpus2 = corpora.BleiCorpus("./period_static/corpus_mature.blei")
    corpus_list = [corpus0, corpus1, corpus2]

    cos_sim(lda_list)
    semantic_sim(lda_list)
    num_per_topic(lda_list, corpus_list)
    density(lda_list, corpus_list)
    print_topic(lda_list)
Example #6
def displayTopics():
    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaMulticore.load(lda_model_path)
    for i, topic in enumerate(lda.show_topics(lda_num_topics)):
        print('Topic #' + str(i) + ': ' + str(topic))
Example #7
    def run(lda_model_path, corpus_path, num_topics, id2word):
        corpus = corpora.BleiCorpus(corpus_path)
        lda = gensim.models.LdaModel(corpus,
                                     num_topics=num_topics,
                                     id2word=id2word)
        lda.save(lda_model_path)

        return lda
Example #8
def load():
    dictionary = corpora.Dictionary.load('./tmp/all_doucment.dict')
    corpus = corpora.BleiCorpus('./tmp/corpus.blei')

    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10,
                          per_word_topics=True)
    print(lda.id2word)
    print('----------------------------------------------------')
    # With per_word_topics=True, get_document_topics yields, per document,
    # (topic distribution, word-topic assignments, word-topic probabilities).
    doc_topics = lda.get_document_topics(corpus, per_word_topics=True)
    pprint(doc_topics[2])
Example #9
    def run(lda_model_path, corpus_path, num_topics, id2word):
        '''Train and save an LDA model.'''
        corpus = corpora.BleiCorpus(corpus_path)
        lda = gensim.models.LdaModel(corpus,
                                     num_topics=num_topics,
                                     id2word=id2word,
                                     iterations=200)
        lda.save(lda_model_path)
        return lda
Example #10
    def run(lda_model_path, corpus_path, num_topics, id2word):
        corpus = corpora.BleiCorpus(corpus_path)
        lda = gensim.models.LdaModel(corpus,
                                     num_topics=num_topics,
                                     id2word=id2word,
                                     passes=10,
                                     eval_every=10,
                                     iterations=500)
        lda.save(lda_model_path)
        return lda
Example #11
def vis_lda(load_path, output_html=None):
    print "Visualizing the LDA"
    if load_path is None:
        load_path = "LDA_data"
    if output_html is None:
        output_html = "LDA_vis.html"
    corpus = corpora.BleiCorpus(load_path + "_corp")
    id2word = corpora.Dictionary.load_from_text(load_path + "_dic")
    lda = models.ldamodel.LdaModel.load(load_path + "_lda")
    vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
    pyLDAvis.save_html(vis, output_html)
Example #12
def build_lda_model():
    # The corpus is the bag-of-words representation: the original feature
    # of the question data.
    corpus = corpora.BleiCorpus('./zhihu_dat/item.dat')

    # Build the LDA model: given a bag-of-words feature it returns a topic
    # feature, so the topic model reduces the dimensionality of a document's
    # features.
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10)

    # Save the model to disk for future use (given a document such as a
    # question, return the topic feature of that document).
    lda_model.save('./zhihu_dat/zhihu_10.lda')
    print('Building complete')
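
As the comments note, the point of saving the model is to map a bag-of-words feature to a topic feature later. A minimal sketch of that lookup, assuming the paths used above:

from gensim import corpora, models

lda_model = models.LdaModel.load('./zhihu_dat/zhihu_10.lda')
corpus = corpora.BleiCorpus('./zhihu_dat/item.dat')
first_doc = next(iter(corpus))  # bag-of-words feature: [(word_id, count), ...]
print(lda_model[first_doc])     # topic feature: [(topic_id, weight), ...]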
Example #13
def main():
    lda0 = models.LdaModel.load('./timewindow_in3/_1999-2000lda_model')
    lda1 = models.LdaModel.load('./timewindow_in3/_2000-2001-2002lda_model')
    lda2 = models.LdaModel.load('./timewindow_in3/_2002-2003-2004lda_model')
    lda3 = models.LdaModel.load('./timewindow_in3/_2004-2005-2006lda_model')
    lda4 = models.LdaModel.load('./timewindow_in3/_2006-2007-2008lda_model')
    lda5 = models.LdaModel.load('./timewindow_in3/_2008-2009-2010lda_model')
    lda6 = models.LdaModel.load('./timewindow_in3/_2010-2011-2012lda_model')
    lda7 = models.LdaModel.load('./timewindow_in3/_2012-2013-2014lda_model')
    lda8 = models.LdaModel.load('./timewindow_in3/_2014-2015-2016lda_model')
    lda9 = models.LdaModel.load('./timewindow_in3/_2016-2017lda_model')

    LDA_list = [lda0, lda1, lda2, lda3, lda4, lda5, lda6, lda7, lda8, lda9]

    corpus_exam0 = corpora.BleiCorpus("./timewindow_in3/corpus_1999-2000.blei")
    corpus_exam1 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2000-2001-2002.blei")
    corpus_exam2 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2002-2003-2004.blei")
    corpus_exam3 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2004-2005-2006.blei")
    corpus_exam4 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2006-2007-2008.blei")
    corpus_exam5 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2008-2009-2010.blei")
    corpus_exam6 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2010-2011-2012.blei")
    corpus_exam7 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2012-2013-2014.blei")
    corpus_exam8 = corpora.BleiCorpus(
        "./timewindow_in3/corpus_2014-2015-2016.blei")
    corpus_exam9 = corpora.BleiCorpus("./timewindow_in3/corpus_2016-2017.blei")

    corpus_list = [
        corpus_exam0, corpus_exam1, corpus_exam2, corpus_exam3, corpus_exam4,
        corpus_exam5, corpus_exam6, corpus_exam7, corpus_exam8, corpus_exam9
    ]

    for (lda, corpus) in zip(LDA_list, corpus_list):
        print(cal_density(corpus, lda))
Example #14
def load_lda_corpus():

    # Load the previously saved corpus.
    corpus_Lda = corpora.BleiCorpus("tmp/corpus_stories.lda-c")
    # Iterating it prints a series of arrays with 100 pairs, corresponding in
    # this case to the 100 topics (if a topic is not present at all in a
    # document, its pair is not shown; with N topics it would print N pairs,
    # but only for topics actually present in the documents). Each pair is
    # (topic_id, the document's distribution over that topic: the higher the
    # value, the more the document is about that topic).

    # for doc in corpus_Lda[:10]:  # takes the first 10, in insertion order
    #    print(doc)

    return corpus_Lda
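
A short usage sketch for the commented-out loop above; slicing a streamed gensim corpus is not always supported, so itertools.islice is the safe way to peek at the first ten documents:

from itertools import islice

corpus_lda = load_lda_corpus()
for doc in islice(corpus_lda, 10):
    print(doc)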
Example #15
def main():
    # Load the dataset and vocabulary,
    # then generate topics and distributions.
    print('Starting up!')
    num_topics = 10
    num_question = 50
    num_answer = 50985
    corpus_name = '../data/ldac/deriv/sq50.txt'
    voc_name = '../data/voc/deriv/all.txt'
    corpus = corpora.BleiCorpus(corpus_name, voc_name)
    print('Corpus processed!')
    #id2word = corpora.Dictionary.load('../data/voc_2.txt')
    lda = models.ldamodel.LdaModel(corpus,
                                   num_topics=num_topics,
                                   chunksize=2000,
                                   decay=0.5,
                                   offset=1.0,
                                   passes=1,
                                   update_every=0,
                                   eval_every=10,
                                   iterations=20000,
                                   gamma_threshold=0.001)
    print('LDA applied to corpus!')

    #print('=== Topic-Word Distributions ===')
    #topic_word_list = lda.show_topics()

    #for i in xrange(num_topics):
    #	print('Topic {} : {}'.format(i,' '.join(topic_word_list[i][0:])))

    print('=== Document-Topic Distributions ===')
    #Writing doc_topic distributions to a file to parse later
    #doc_topic = lda[corpus]
    dist_list = list(lda[corpus])
    answer_list, metric_list = compare_sq(dist_list, num_question, num_answer,
                                          num_topics)

    print('Answers compared!')
    f = open('{}{}'.format(corpus_name, '.answers'), 'w')

    print('Answers for each question.')
    for i, answers in enumerate(answer_list):
        f.write('Question {}: '.format(i))
        for j in range(50):
            f.write('{} '.format(answers[j]))
        f.write('\n')
    f.close()
    print('Results written !')
Example #16
def nbc():
    # Corpus chunk 0 is held out as the test set (a random chunk could be
    # drawn instead with random.randint(0, 25)).
    choose_1 = 0
    corpus1 = corpora.BleiCorpus('../corpus/corpus_{}.blei'.format(choose_1))
    test_X = matutils.corpus2csc(corpus1).transpose()  # test set
    # print(test_X.get_shape())
    label_list = read_label()
    test_y = label_list[(choose_1 * 20000):(choose_1 + 1) * 20000]  # test-set labels
    clf = MultinomialNB(alpha=0.01)
    for index in range(0, 25):
        corpus = corpora.BleiCorpus('../corpus/corpus_{}.blei'.format(index))
        csi_matrix = matutils.corpus2csc(corpus).transpose()
        if csi_matrix.get_shape() == (20000, 271884):
            print(csi_matrix.get_shape())
            clf.partial_fit(csi_matrix,
                            label_list[(index * 20000):(index + 1) * 20000],
                            classes=np.array([0, 1]))
            print("Round {}".format(index))
            pre = clf.predict(test_X)
            totalScore(pre, test_y)
Example #17
def run_lsi(dictionary_file, ldac_file, lsi_file, topics_file, num_topics,
            paper_ids):

    dictionary = corpora.Dictionary().load(dictionary_file)
    corpus_ldac = corpora.BleiCorpus(fname=ldac_file,
                                     fname_vocab=(ldac_file + '.vocab'))
    num_docs = len(corpus_ldac)
    # Weight the corpus with TFIDF before applying LSI.

    tfidf_mdl = models.TfidfModel(corpus_ldac)
    corpus_tfidf = tfidf_mdl[corpus_ldac]

    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=num_topics)
    corpus_lsi = lsi[corpus_tfidf]

    lsi_matrix = np.zeros((num_docs, num_topics))
    row_count = 0
    for doc in corpus_lsi:
        for each_tuple in doc:
            lsi_matrix[row_count, int(each_tuple[0])] = float(each_tuple[1])
        row_count += 1

    with codecs.open(lsi_file, mode='w', encoding='utf-8') as fw:
        for i in range(0, num_docs):
            fw.write(str(paper_ids[i]) + u'|{')
            s = ''
            for j in range(0, num_topics):
                s += str(lsi_matrix[i, j]) + u','
            fw.write(s.rstrip(u',') + u'}\n')


    # np.savetxt(lsi_file, lsi_matrix)

    print('Number of documents:', row_count)

    topics = lsi.show_topics(num_topics=-1,
                             num_words=50,
                             log=False,
                             formatted=False)
    with codecs.open(topics_file, mode='w', encoding='utf-8') as fw:
        fw.write(u'topic_id|topic_words\n')
        # show_topics(formatted=False) yields (topic_id, [(word, weight), ...]).
        for i in range(0, num_topics):
            topic_words = u",".join(w[0] for w in topics[i][1])
            fw.write(str(i + 1) + u"|{" + topic_words + u"}\n")
Example #18
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        dictionary_path = "models/dictionary.dict"
        corpus_path = "models/corpus.lda-c"
        lda_num_topics = 50
        lda_model_path = "models/lda_model_50_topics.lda"

        dictionary = corpora.Dictionary.load(dictionary_path)
        corpus = corpora.BleiCorpus(corpus_path)
        lda = LdaModel.load(lda_model_path)

        for i, topic in enumerate(lda.show_topics(num_topics=lda_num_topics)):
            print('#' + str(i) + ': ' + str(topic))
Example #19
def predTopics(review_text):
    separated_text = review_text.lower().split()

    # apply LDA model
    dictionary_path = "lda/dictionary.dict"
    corpus_path = "lda/corpus.lda-c"

    lda_model_path = "lda/lda_25_topics.lda"

    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel.load(lda_model_path)

    review_bow = dictionary.doc2bow(separated_text)

    return lda[review_bow]
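
A hedged usage sketch: the return value is the review's topic distribution as (topic_id, probability) pairs, so it can be printed directly:

for topic_id, prob in predTopics("great pasta but slow service"):
    print("topic {}: {:.3f}".format(topic_id, prob))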
Example #20
def run_lda(dictionary_file, ldac_file, theta_file, topics_file, num_topics,
            num_passes, paper_ids):

    dictionary = corpora.Dictionary().load(dictionary_file)
    corpus_ldac = corpora.BleiCorpus(fname=ldac_file,
                                     fname_vocab=(ldac_file + '.vocab'))
    num_docs = len(corpus_ldac)

    model = models.ldamodel.LdaModel(corpus_ldac,
                                     id2word=dictionary,
                                     num_topics=num_topics,
                                     passes=num_passes,
                                     update_every=0,
                                     alpha=1.0,
                                     eta=1.0,
                                     decay=0.0)

    # Creates the \theta matrix
    theta = model[corpus_ldac]
    theta_matrix = np.zeros((num_docs, num_topics))
    dcount = 0
    for theta_d in theta:
        for theta_dt in theta_d:
            theta_matrix[dcount, int(theta_dt[0])] = float(theta_dt[1])
        dcount += 1

    with codecs.open(theta_file, mode='w', encoding='utf-8') as fw:
        for i in range(0, num_docs):
            fw.write(str(paper_ids[i]) + u'|{')
            s = ''
            for j in range(0, num_topics):
                s += str(theta_matrix[i, j]) + u','
            fw.write(s.rstrip(u',') + u'}\n')

    # np.savetxt(theta_file, theta_matrix)

    print('Number of documents:', dcount)

    topics = model.show_topics(num_topics=-1, num_words=50, log=False,
                               formatted=False)
    with codecs.open(topics_file, mode='w', encoding='utf-8') as fw:
        fw.write(u'topic_id|topic_words\n')
        # show_topics(formatted=False) yields (topic_id, [(word, prob), ...]).
        for i in range(0, num_topics):
            topic_words = u",".join(w[0] for w in topics[i][1])
            fw.write(str(i + 1) + u"|{" + topic_words + u"}\n")
Example #21
def build_model(dat, vocab, num_topics=100, alpha=None):
    """
    loading items & training lda model
    """
    if not path.exists(dat) or not path.exists(vocab):
        print('Error: Expected items to be present at ./datasets/ap/')
    # Corpus is just the preloaded list of words
    # dat:term_num term_id:term_freq ... for each line
    # vocab:term for each line(line no is term_id implicit)
    corpus = corpora.BleiCorpus(
        dat, vocab)  # <class 'gensim.corpora.bleicorpus.BleiCorpus'>
    # For the AP data: 2246 document lines, vocabulary of 10473 terms.
    # print(corpus.id2word)  # {0: 'line', 1: 'new', 2: 'percent', ...}, same as vocab (word_id: word)
    model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            alpha=alpha,
                            id2word=corpus.id2word)
    # corpus_list = [c for c in corpus]
    # print('len(corpus_list[0]) =', len(corpus_list[0]), '\n', corpus_list[0])  # <=> ap.dat line 1
    return corpus, model
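
A usage sketch under the path assumptions in the function's own error message:

corpus, model = build_model('./datasets/ap/ap.dat', './datasets/ap/vocab.txt')
print(model.show_topic(0, topn=10))  # top 10 (word, weight) pairs of topic 0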
Example #22
def main():

    names = [
        '2006-2007-2008', '2008-2009-2010', '2010-2011-2012', '2012-2013-2014',
        '2014-2015-2016'
    ]
    path = "Output/"
    lda_list = []
    corpus_list = []
    for name in names:
        lda = models.LdaModel.load('Corpus3/lda_model_' + name)
        corpus = corpora.BleiCorpus('Corpus3/corpus_' + name + '.blei')
        lda_list.append(lda)
        corpus_list.append(corpus)

    num_per_topic(lda_list, corpus_list, path)
    density(lda_list, corpus_list, path)
    print_topic(lda_list, path)
    cos_sim(lda_list, path)
    semantic_sim(lda_list, path)
Example #23
    def load_corpus(self, directory=None):
        print("[%s] Load corpus ..." % self._name)
        name = self._name
        ext = self._model_type
        dir = "%s/" % (directory if directory else ".")

        self._wdict = corpora.Dictionary.load_from_text(dir + name + ".dict")

        if self._model_type == "lda":
            self._topic_model = models.LdaModel.load(dir + name + "." + ext)
        elif self._model_type == "lsi":
            self._topic_model = models.LsiModel.load(dir + name + "." + ext)
        elif self._model_type == "hdp":
            self._topic_model = models.HdpModel.load(dir + name + "." + ext)
        else:
            self._topic_model = NullModel.load(dir + name + "." + ext)

        self._corpus = corpora.BleiCorpus(dir + name + ".blei")
        self._index = similarities.MatrixSimilarity.load(dir + name + ".index")
        self._file_names = load_list(dir + name + ".file_names")
        self._topic_names = load_list(dir + name + ".topic_names")
Example #24
def runLDA(corpusfile, dcyfile, num_topics, ind=-1):
    '''
    Do classical LDA on word matrix M using alpha, beta
    Plot the results
    '''
    print("Running Vanilla LDA on current M")
    dcy = corpora.Dictionary.load(dcyfile)
    print(dcy)
    if ind > 0:
        tmp = dcy.token2id
        for key in tmp:
            if tmp[key] == int(ind):
                print('Word to insert: ' + key)
                break

    corpus = corpora.BleiCorpus(corpusfile)
    #tfidf = models.TfidfModel(corpus, normalize=True)
    #tfidf_corpus = tfidf[corpus]
    tfidf_corpus = corpus  #Remove this line to allow tfidf values
    lda = models.LdaModel(tfidf_corpus, id2word=dcy, num_topics=num_topics)
    print(lda.print_topics(num_topics, num_words=20))
    return 0
Example #25
def get_lda_models():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    input_file = "nostem_processed_normal_format_Homework2_data.csv"
    documents_list = []
    for line in open(input_file, "r"):
        documents_list.append(line)
    texts = [[word for word in document.lower().split(" ")]
             for document in documents_list]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('temp_twitter.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.BleiCorpus.serialize('temp_twitter_lda.mm', corpus)
    print "Generate the dictionary and lda corpus"

    if os.path.exists("temp_twitter.dict"):
        dictionary = corpora.Dictionary.load('temp_twitter.dict')
        corpus = corpora.BleiCorpus('temp_twitter_lda.mm')
        print("Used files generated from first tutorial")

    #tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
    #corpus_tfidf = tfidf[corpus]

    #get the lda model
    lda_3_topics = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                   id2word=dictionary,
                                                   num_topics=3)
    lda_4_topics = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                   id2word=dictionary,
                                                   num_topics=4)
    lda_5_topics = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                   id2word=dictionary,
                                                   num_topics=5)
    #lda.print_topics(num_topics=5, num_words=10)
    lda_3_topics.save("lda_3_topics_model")
    lda_4_topics.save("lda_4_topics_model")
    lda_5_topics.save("lda_5_topics_model")
Example #26
import numpy as np


# **Download data**
# 
# (dead link) http://www.cs.princeton.edu/~blei/lda-c/ap.tgz
# 
# http://www.cs.columbia.edu/~blei/lda-c/
# 
# Unzip the data and put them into your folder, e.g., /Users/datalab/bigdata/ap/

# In[23]:


# Load the data
corpus = corpora.BleiCorpus('/Users/datalab/bigdata/ap/ap.dat',
                            '/Users/datalab/bigdata/ap/vocab.txt')


# **Use the help command to understand the corpora.BleiCorpus class**
# 
# > help(corpora.BleiCorpus)
 class BleiCorpus(gensim.corpora.indexedcorpus.IndexedCorpus)
 |  Corpus in Blei's LDA-C format.
 |  
 |  The corpus is represented as two files: 
 |          one describing the documents, 
 |          and another describing the mapping between words and their ids.
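# For concreteness, the two files that help() describes look like this
# (illustrative values, not taken from the real ap.dat):
#
#   ap.dat, one document per line:  3 0:2 5:1 42:7
#     -> a document with 3 unique terms: term 0 twice, term 5 once, term 42 seven times
#   vocab.txt, one term per line; the line number (counting from 0) is the term id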
# In[24]:


# Use dir() to see which methods corpus provides
Example #27
def getdata():
    #Getting the sample corpus
    corpus = corpora.BleiCorpus('D:/Projects/ap/ap.dat',
                                'D:/Projects/ap/vocab.txt')
    return corpus
Example #28
from gensim import corpora
from collections import defaultdict
from pprint import pprint
from gensim.models import ldamodel
import numpy as np
import pandas as pd
import os
import json
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords

model = ldamodel.LdaModel.load("lda_output/reclamos_lda")
dictionary = corpora.Dictionary.load('corpus_output/corpus_dict.dict')
corpus = corpora.BleiCorpus('corpus_output/corpus.lda-c')
n_topic = 10

predict = model[corpus]


def get_doc_topic_dists(predict):
    # Use zeros, not empty: topics absent from a document are never written,
    # so uninitialized memory would otherwise leak into the matrix.
    doc_topic_dists = np.zeros([len(corpus), n_topic])
    index = 0
    for topics in predict:
        for topic in topics:
            doc_topic_dists[index][topic[0]] = topic[1]
        index += 1
    np.savetxt('doc_topic.csv', doc_topic_dists, delimiter=',')
    return doc_topic_dists
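
A short usage sketch; the result is a dense documents-by-topics matrix, which the function also writes to doc_topic.csv:

doc_topic_dists = get_doc_topic_dists(predict)
print(doc_topic_dists.shape)  # (len(corpus), n_topic)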
Example #29
    print("Please install it")
    raise

import matplotlib.pyplot as plt
import numpy as np
from os import path

NUM_TOPICS = 100

# Check that data exists
if not path.exists('./data/ap/ap.dat'):
    print('Error: Expected data to be present at data/ap/')
    print('Please cd into ./data & run ./download_ap.sh')

# Load the data
corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')

# Build the topic model
model = models.ldamodel.LdaModel(corpus,
                                 num_topics=NUM_TOPICS,
                                 id2word=corpus.id2word,
                                 alpha=None)

# Iterate over all the topics in the model, writing each topic's top words
# to topics.txt. Open the file once, outside the loop; reopening it in 'w'
# mode on every iteration (as the original did) truncates earlier topics.
with open('topics.txt', 'w') as output:
    for ti in range(model.num_topics):
        words = model.show_topic(ti, 64)
        tf = sum(f for f, w in words)
        output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf))
                               for f, w in words))
        output.write("\n\n\n")
__author__ = 'askofen'
from gensim import corpora, models, similarities
import numpy as np
from scipy.spatial import distance
import matplotlib.pyplot as plt

corpus = corpora.BleiCorpus('../../Data/04/data/ap.dat',
                            '../../Data/04/data/vocab.txt')

# alpha = 1 -> more topics per document
model = models.ldamodel.LdaModel(corpus,
                                 num_topics=100,
                                 id2word=corpus.id2word)

# length of vocabulary dictionary
print(len(model.id2word))

topics = [model[doc] for doc in corpus]
topicsProDocCount = [len(t) for t in topics]

plt.hist(topicsProDocCount, bins=15)
plt.title("Topics pro document histogram")

dense = np.zeros((len(topics), 100), float)

# make matrix with weight of each topic for each document
for ti, t in enumerate(topics):
    for tj, v in t:
        dense[ti, tj] = v

# distance between all the rows in the matrix