Beispiel #1
    def run(self,
            kappa=1.0,
            tau=64.0,
            K=15,
            T=150,
            alpha=1,
            gamma=1,
            eta=0.01,
            scale=1.0,
            var_converge=0.0001,
            outputdir=None,
            random_state=0,
            *args,
            **kwargs):

        self.model = HdpModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              kappa=kappa,
                              tau=tau,
                              K=K,
                              T=T,
                              alpha=alpha,
                              gamma=gamma,
                              eta=eta,
                              scale=scale,
                              var_converge=var_converge,
                              outputdir=outputdir,
                              random_state=random_state,
                              *args,
                              **kwargs)

        print("Done!\nCheckout hdp.model")
Beispiel #2
def getRelationDetailByHDP(sentence_list):
    # cluster the sentences to obtain the topic results
    corpus = []
    pairs_all, position_all = segmentor.segListWithNerTag(sentence_list)
    words_list = []
    for pairs in pairs_all:
        word_list = []
        for pair in pairs:
            if "v" in pair.flag or "n" in pair.flag:
                word_list.append(pair.word)
        words_list.append(word_list)
    # words_list = list(map(lambda pairs: map(lambda x: x.word, pairs), pairs_all))
    from gensim import corpora
    dictionary = corpora.Dictionary(words_list)
    for words in words_list:
        corpus.append(dictionary.doc2bow(words))
    from gensim.models import HdpModel
    hdp = HdpModel(corpus, dictionary)
    a = hdp.print_topics()
    words = {}
    for topic in a:
        word_details = str(topic[1]).split(" + ")
        for word_detail in word_details:
            word = str(word_detail[word_detail.index("*") + 1:])
            num = float(str(word_detail[:word_detail.index("*")]))
            if word not in words:
                words[word] = num
            else:
                words[word] += num
    words = sorted(words.items(), key=lambda d: d[1])
    return words  # then take the high-frequency verbs and nouns from the syntactic analysis
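The string parsing of the print_topics output above is brittle (newer gensim versions quote the words, e.g. 0.032*"word"). A minimal alternative sketch that aggregates per-word weights via show_topics(formatted=False), which yields (word, weight) pairs directly; it assumes an already-trained HdpModel instance named hdp as above.

from collections import defaultdict

def aggregate_topic_words(hdp, num_topics=20, num_words=20):
    # formatted=False returns [(topic_id, [(word, weight), ...]), ...]
    weights = defaultdict(float)
    for _, word_weights in hdp.show_topics(num_topics=num_topics,
                                           num_words=num_words,
                                           formatted=False):
        for word, weight in word_weights:
            weights[word] += weight
    # sort ascending by accumulated weight, matching the example above
    return sorted(weights.items(), key=lambda kv: kv[1])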
Beispiel #3
    def model_pcs(self, model_name, all_mashup_num, all_api_num):
        # indexing by 0..all_num actually gives the mapping by real ids!!
        # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if model_name == 'HDP':
            self.model = HdpModel(self.mashup_dow + self.api_dow, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(self.mashup_dow + self.api_dow)
            self.num_topics = len(self.dct)
        else:
            raise ValueError('wrong gensim_model name!')

        mashup_hdp_features = [
            self.model[mashup_info] for mashup_info in self.mashup_dow
        ]
        api_hdp_features = [self.model[api_info] for api_info in self.api_dow]

        self._mashup_hdp_features = np.zeros((all_mashup_num, self.num_topics))
        self._api_hdp_features = np.zeros((all_api_num, self.num_topics))
        for i in range(all_mashup_num):
            for index, value in mashup_hdp_features[i]:
                self._mashup_hdp_features[i][index] = value
        for i in range(all_api_num):
            for index, value in api_hdp_features[i]:
                self._api_hdp_features[i][index] = value
        return self._mashup_hdp_features, self._api_hdp_features
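The nested loops above densify the sparse (topic_id, value) lists by hand. As a hedged alternative sketch, gensim.matutils.corpus2dense (also used in Beispiel #19) does the same conversion in one call; self.model, self.mashup_dow and self.num_topics are assumed to be set exactly as in the method above.

        from gensim import matutils

        mashup_sparse = [self.model[bow] for bow in self.mashup_dow]
        # corpus2dense returns shape (num_terms, num_docs); transpose to (num_docs, num_topics)
        mashup_dense = matutils.corpus2dense(mashup_sparse, num_terms=self.num_topics).T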
Beispiel #4
def build_lda_models(course_corpus, course_dictionary, mapping, course_texts):
    # ==== Train Unsupervised LDA ====
    lda_model = LdaModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Unsupervised HDP-LDA ====
    hdp_model = HdpModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Author Topic Model ====
    author_to_doc = {}  # author topic LDA (authors are modules,lessons,items)
    for author_type in ["modules", "lessons", "items"]:
        entity_to_doc = mapping[author_type]
        for entity_name, entity_docs in entity_to_doc.items():
            author_to_doc["{}: {}".format(author_type[0].capitalize(),
                                          entity_name)] = entity_docs
    at_model = AuthorTopicModel(corpus=course_corpus,
                                id2word=course_dictionary,
                                author2doc=author_to_doc)

    # ==== Train Labeled LDA ====
    # explicitly supervised, labeled LDA
    llda_alpha = 0.01
    llda_beta = 0.001
    llda_iterations = 50
    llda_labels = []
    llda_corpus = []
    labelset = set()
    for course_text_id in range(0, len(course_texts)):
        doc_labels = []
        # get module label name
        for module_name, doc_vec in mapping["modules"].items():
            if course_text_id in doc_vec:
                doc_labels.append("M: {}".format(module_name))
                break

        # get lesson label name
        for lesson_name, doc_vec in mapping["lessons"].items():
            if course_text_id in doc_vec:
                doc_labels.append("L: {}".format(lesson_name))
                break

        for item_name, doc_vec in mapping["items"].items():
            if course_text_id in doc_vec:
                doc_labels.append("I: {}".format(item_name))
                break

        llda_labels.append(doc_labels)
        llda_corpus.append(course_texts[course_text_id])
        labelset = labelset.union(doc_labels)

    llda_model = LLDA(llda_alpha, llda_beta, K=len(llda_labels))
    llda_model.set_corpus(llda_corpus, llda_labels)
    llda_model.train(iteration=llda_iterations)

    # phi = llda.phi()
    # for k, label in enumerate(labelset):
    #     print ("\n-- label %d : %s" % (k + 1, label))
    #     for w in argsort(-phi[k + 1])[:10]:
    #         print("%s: %.4f" % (llda.vocas[w], phi[k + 1,w]))
    return lda_model, hdp_model, at_model, llda_model, llda_labels
Beispiel #5
    def run_hdp(self, modelId, **kwargs):
        print(kwargs)
        hdpModel = HdpModel(self.corpus, self.dict, **kwargs)  # assuming the bag-of-words corpus is stored as self.corpus
        hdpData = {modelId:
                   {'model':hdpModel,
                    'args':kwargs}}

        self.hdpModels.append(hdpData)
Beispiel #6
def test_hdp():
    """Trains a HDP model and tests the html outputs."""
    corpus, dictionary = get_corpus_dictionary()
    hdp = HdpModel(corpus, dictionary.id2token)

    data = gensim_models.prepare(hdp, corpus, dictionary)
    pyLDAvis.save_html(data, 'index_hdp.html')
    os.remove('index_hdp.html')
Beispiel #7
def train_hdp_model(corpus, dictionary, chunksize):
    print('HDP model')
    model = HdpModel(corpus=corpus, id2word=dictionary, chunksize=chunksize, random_state=config.SEED)
    # To get the topic words from the model
    topics = []
    for topic_id, topic in model.show_topics(num_topics=10, formatted=False):
        topic = [word for word, _ in topic]
        topics.append(topic)
    return model
Beispiel #8
    def topicsHDP(self, num_topics=-1, topn=20):
        # HdpModel(corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None)
        hdp = HdpModel(corpus=self.corpus, id2word=self.id2word)

        # show_topics(topics=20, topn=20, log=False, formatted=True)
        # Print the topN most probable words for topics number of topics. Set topics=-1 to print all topics.
        # Set formatted=True to return the topics as a list of strings, or False as lists of (weight, word) pairs.

        return hdp.show_topics(topics=num_topics, topn=topn, formatted=False)
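The commented signatures above reflect an older gensim API. In gensim 1.0 and later the keyword arguments were renamed, so the same method would look roughly like this sketch (topics becomes num_topics, topn becomes num_words; -1 still requests all topics):

    def topicsHDP(self, num_topics=-1, topn=20):
        hdp = HdpModel(corpus=self.corpus, id2word=self.id2word)
        # num_topics=-1 returns all topics; formatted=False gives (word, weight) pairs
        return hdp.show_topics(num_topics=num_topics, num_words=topn, formatted=False)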
Beispiel #9
def hierarchical_dirichlet_process(corpus, num_topics, id2word):
    ''' HIERARCHICAL DIRICHLET PROCESS
    # Advantage of HDP: fully unsupervised: can determine the ideal number of topics it needs through posterior inference
    '''
    print('Hierarchical Dirichlet Process')
    hdp_model = HdpModel(corpus=corpus, id2word=id2word)
    hdp_model.show_topics()
    hdp_topic = hdp_model.show_topics(formatted=False)
    return hdp_model
Beispiel #10
    def build_hdp(self):
        """Builds an HDP model of the corpus.
        """

        print("building HDP model...")
        start = time.time()
        self.hdp = HdpModel(corpus=self.get_bows(), id2word=self.dict)
        end = time.time()
        print("HDP finished! {:.2f} seconds".format(end - start))
Beispiel #11
def hdp(corpus, dictionary, docs, score=False):
    print('Training on {} documents ......'.format(len(corpus)))
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    if score:
        print('calculating coherence score for {} documents ......'.format(len(docs)))
        coherence_model = CoherenceModel(model=hdpmodel, texts=docs, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        print('\nCoherence Score: ', coherence_score)
        return hdpmodel,coherence_score
    return hdpmodel
Beispiel #12
def create_hdp(num_topic, dictionary):
    print("__________________________Create HDP_________________________")
    corpus, dic = generate_corpus(dictionary)
    hdpmodel = HdpModel(corpus=corpus, id2word=dic)
    topics = hdpmodel.print_topics(num_topics=num_topic, num_words=7)
    # see list of topics
    for topic in topics:
        print(topic)

    return hdpmodel
Beispiel #13
    def get_topics(self, corpus, vocabulary, num_words=10):

        hdpmodel = HdpModel(corpus=corpus, id2word=vocabulary)
        # The docs say num_topics=-1 returns all topics (ordered by significance); num_words is optional.
        # .print_topics(num_topics=20, num_words=10)
        # In some gensim versions print_topics(-1) returns an empty list, so show_topics is used here instead:
        topics = hdpmodel.show_topics(formatted=False,
                                      num_words=num_words,
                                      num_topics=-1)
        #print(hdpmodel.get_topics().shape)
        return topics
Beispiel #14
    def model_pcs(self,model_name,LDA_topic_num=None):
        # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.train_api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]

        else:
            raise ValueError('wrong gensim_model name!')

        # Process the texts with the model, then convert to a standard np array (a value on every topic)
        # print(self.mashup_dow)
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of each mashup and api
        # print(self.mashup_features)
        print('self.mashup_features, num:', len(self.mashup_features))
        zero_num1 = sum([1 if len(mashup_feature)==0 else 0 for mashup_feature in self.mashup_features])
        print('zero_num1',zero_num1)
        for i in range(len(self.mashup_features)):
            if len(self.mashup_features[i])==0:
                print(self.mashup_dow[i])

        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        # print('when model-pcs,len of mashup_features and api_features:{},{}'.format(len(mashup_features),len(api_features)))
        self._mashup_features=np.zeros((meta_data.mashup_num, self.num_topics))
        self._api_features = np.zeros((meta_data.api_num, self.num_topics))
        for i in range(meta_data.mashup_num):  # only some dimensions are populated; convert to a regular array
            for index,value in self.mashup_features[i]:
                self._mashup_features[i][index]=value
        for i in range(meta_data.api_num):
            for index,value in self.api_features[i]:
                self._api_features[i][index]=value
        return self._mashup_features, self._api_features
Beispiel #15
def train_topics(args):
    print(f"Arguments: {args}")

    nlp = spacy.load("en", disable=["parser", "ner"])

    files = args["text"]
    lines = extract_stories(files)

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [
                token.lemma_ for token in doc if token.pos_ in allowed_postags
                and not token.is_punct and not token.is_stop
            ]
            text_tokens.append(tokens)
        return text_tokens

    docs = tokenize(lines, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    print("Preprocessed Docs")

    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    docs = make_bigrams(docs)
    docs = make_trigrams(docs)

    print("Create Dictionary")
    # Create Dictionary
    corpus_dict = corpora.Dictionary(docs)
    # Create Corpus
    texts = docs
    # Term Document Frequency
    corpus = [corpus_dict.doc2bow(text) for text in texts]

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)

    print(hdp.print_topics(num_topics=50, num_words=20))

    hdp.save(args["target"])
Beispiel #16
    def stream_topic_model(self,
                           topic: Topic,
                           dictionary: corpora.Dictionary = None,
                           corpus: IndexedCorpus = None,
                           num_topics=20,
                           max_topics_per_doc=5):
        # load dictionary and corpus, if necessary
        if not dictionary:
            dictionary = self.load_dictionary()
            logger.warning(
                "the default dictionary was loaded from file. "
                "You should keep an instance in memory instead of calling this in a loop..."
            )
        if not corpus:
            corpus = JsonLinesCorpus(self.file_corpus)
            logger.warning(
                "the default corpus was loaded from file. You should provide a "
                "reduced corpus to increase performance (see corpus2corpus)")
        # build the model
        logger.info(
            "building a topic model with {} topics for {} documents in topic '{}'"
            .format(num_topics, len(corpus), topic.topic_id))
        t0 = time.time()
        if self.model == "lda":
            model = LdaMulticore(corpus,
                                 id2word=dictionary.id2token,
                                 num_topics=num_topics,
                                 passes=2,
                                 iterations=50,
                                 chunksize=2000,
                                 workers=self.n_threads)
        elif self.model == "hdp":
            # T = overall topic limit, K = max topics per document
            model = HdpModel(corpus,
                             id2word=dictionary.id2token,
                             T=num_topics,
                             K=max_topics_per_doc)
        else:
            raise ValueError("Unknown model identifier '{}'".format(
                self.model))
        t1 = time.time()

        # serialize
        logger.info(
            "building the model took {:.1f} s. Serializing model...".format(
                t1 - t0))
        output_path = self._get_model_path(topic)
        with util.open_by_ext(output_path, 'wb') as fp:
            pickle.dump(model, fp, protocol=4)
            logger.info(
                "model dump finished, took {:.1f} s".format(time.time() - t1))
Beispiel #17
    def hdpmodel(self, corpus_t, save=False, savename=None):
        """Train a Hierarchical Dirichlet Process model on the given corpus.

        :param corpus_t: bag-of-words corpus to train on
        :param save: whether to save the trained model to disk
        :param savename: file path to save the model under
        :return: the trained HdpModel
        """
        print('using Hierarchical Dirichlet Process model...')
        hdpmodel = HdpModel(corpus=corpus_t, id2word=self.word_dict)
        if save:
            print('saving the hdp model to file: {}'.format(savename))
            hdpmodel.save(savename)
        return hdpmodel
Beispiel #18
 def _new_model(self, X=None, y=None):
     return HdpModel(X,
                     max_chunks=self.max_chunks,
                     max_time=self.max_time,
                     chunksize=self.chunksize,
                     kappa=self.kappa,
                     tau=self.tau,
                     K=self.K,
                     T=self.T,
                     alpha=self.alpha,
                     gamma=self.gamma,
                     eta=self.eta,
                     scale=self.scale,
                     var_converge=self.var_converge,
                     outputdir=self.outputdir,
                     random_state=self.random_state)
Beispiel #19
def build_hdp_vec(docs, targets, dct=None, hdp=None):
    docs = [[str(o) for o in one] for one in docs]

    if dct is None:  # train set
        dct = Dictionary(docs)
        for one in docs:
            dct.add_documents([[str(o) for o in one]])

    corpus = [dct.doc2bow(o) for o in docs]
    if hdp is None:  # train
        hdp = HdpModel(corpus, dct)

    v = [hdp[o] for o in corpus]
    v_d = matutils.corpus2dense(v, num_terms=len(dct.token2id)).T

    return corpus, v_d, targets, dct, hdp
Beispiel #20
def train_and_save_gensim_model(model_type_str,
                                corpus,
                                dct,
                                file_name='model_300.model',
                                num_topics=None):
    if model_type_str == "lsi":
        model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dct)
    elif model_type_str == "lda":
        model = LdaModel(corpus=corpus,
                         alpha='auto',
                         num_topics=num_topics,
                         id2word=dct)
    elif model_type_str == "hdp":
        model = HdpModel(corpus=corpus, id2word=dct)
    model.save(file_name)
    return model
Beispiel #21
def get_hdp_model(doc_term_matrix, id2word, fname):
    if params['training']:
        hdp_model = HdpModel(
            corpus=doc_term_matrix,
            id2word=id2word,
            max_chunks=10000,
            chunksize=2000,
            kappa=0.6,
            tau=32.0,
            eta=0.05,
        )
        _save_model('hdp', hdp_model, fname=fname)
    else:
        hdp_model = _load_model('hdp', fname)

    return hdp_model
Beispiel #22
def run_model(corpus, dictionary, method='LDA', num_topics=8):
    if method == 'LDA':
        ldamodel = LdaModel(corpus=corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            iterations=400,
                            passes=20)
        return ldamodel
    elif method == 'LSI':
        lsimodel = LsiModel(corpus=corpus,
                            num_topics=num_topics,
                            id2word=dictionary)
        return lsimodel
    elif method == 'HDP':  # extension of LDA for when the number of topics is unknown
        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
        return hdpmodel
Beispiel #23
def topic_modeling(paratext, typel='hdp', numtopics=20, npasses=100):
    doc_clean = [clean(doc).split() for doc in paratext]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    #
    if (typel == 'hdp'):
        lda_model = HdpModel(corpus=doc_term_matrix, id2word=dictionary)
    elif (typel == 'lda'):
        lda_model = gensim.models.ldamodel.LdaModel(doc_term_matrix,
                                                    num_topics=numtopics,
                                                    id2word=dictionary,
                                                    passes=npasses,
                                                    minimum_probability=0.0)
    else:
        print('invalid option in topic_modeling()')
        exit(1)
    return lda_model, dictionary, doc_term_matrix
Beispiel #24
    def runModels(self, number_of_topics, corpus, dictionary, start, end):

        #do hdp model

        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

        hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10)
        hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(hdptopics)

        #add results to total kept in a list
        #   addToResults(result_dict)

        #output results
        self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

        # do lda model
        ldamodel = LdaModel(corpus=corpus,
                            num_topics=number_of_topics,
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

        ldamodel.save('lda' + str(number_of_topics) + '.model')
        ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(ldatopics)
        #   addToResults(result_dict)
        self.printResults(number_of_topics, ldatopics, 'lda', start, end)

        visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

        location = os.path.join(pn, 'topic_model_results')

        #visualize outputs in html
        pyLDAvis.save_html(
            visualisation,
            os.path.join(
                location, 'LDA_Visualization' + str(number_of_topics) + "_" +
                start + "_" + end + '.html'))
Beispiel #25
 def gensimTopicModelingAnalysis(self, n):
     files = glob.glob(
         "/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/*.txt")
     files = sorted(
         files,
         key=lambda x: int(
             x.split(
                 '/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/Cluster'
             )[1].split('_')[0]))
     with open("/Users/advaitbalaji/Desktop/ListofSortedClusters.txt",
               "w") as of:
         for f in files:
             of.writelines(f + "\n")
     texts, clusters = n.readMultipleFileLineWise(files)
     dictionary = Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
     print(hdpmodel.show_topics())
Beispiel #26
    def model_pcs(self, model_name, LDA_topic_num=None):
        # Model processing: returns the mashup and api features; different models can be applied to the same corpus in turn
        # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]
        else:
            raise ValueError('wrong gensim_model name!')

        # Use the model to turn the texts into sparse feature vectors, then convert to a standard np array (a value on every topic)
        # *** mashup_dow and api_dow cover all mashup/api texts by default, so the resulting feature lists can be indexed by the global ids ***
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of each mashup and api
        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        self.dense_mashup_features = np.zeros((data_repository.get_md().mashup_num, self.num_topics))
        self.dense_api_features = np.zeros((data_repository.get_md().api_num, self.num_topics))
        for i in range(data_repository.get_md().mashup_num):  # only some dimensions are populated; convert to a regular array
            for index, value in self.mashup_features[i]:
                self.dense_mashup_features[i][index] = value
        for i in range(data_repository.get_md().api_num):
            for index, value in self.api_features[i]:
                self.dense_api_features[i][index] = value
        return self.dense_mashup_features, self.dense_api_features
Beispiel #27
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())


    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]

    hdptopics = [[word for word, prob in topic] for topicid, topic in hdpmodel.show_topics(formatted=False)]

    ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary, window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence], ['LSI', 'HDP', 'LDA'])
Beispiel #28
    def createHDP(self, fileName='', modelName=''):
        '''
        fileName -> file name for the dictionary (.dict) and corpus (.mm) files
        modelName -> model name for the HDP model to save to disk
        '''
        if fileName == '':
            fileName = self.__fileName

        if modelName == '':
            modelName = self.__fileName

        dict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
        mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

        hdp = HdpModel(corpus=mm, id2word=dict)
        hdp.save(self.__destination + modelName + '.hdp')
        print(hdp)
        print('Created HDP model %s' % self.__fileName)
Beispiel #29
def extract_topic_model(corpus, dictionary, model, num_topics):
    """
    Extract topic model
    """
    if model == 'lsi':
        lsimodel = LsiModel(corpus=corpus,
                            num_topics=num_topics,
                            id2word=dictionary)
        return lsimodel

    elif model == 'lda':
        ldamodel = LdaModel(corpus=corpus, id2word=dictionary,
                            alpha='auto', eta='auto',
                            iterations=800, num_topics=num_topics,
                            passes=20, eval_every=None)
        return ldamodel

    elif model == 'hdp':
        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
        return hdpmodel
Beispiel #30
    def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True,
              model='lda'):
        """
        Train the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('preprocessing...')
        self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]

        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
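When the 'hdp' branch is used, the number of topics is only known after training. Recent gensim versions also expose HdpModel.suggested_lda_model(), which converts the trained HDP into an LdaModel with that inferred topic count for downstream code that expects an LDA interface. A minimal sketch that could be appended at the end of train(), assuming the trained self.model from above:

        if model == 'hdp':
            lda_equivalent = self.model.suggested_lda_model()  # LdaModel with the inferred number of topics
            print(lda_equivalent.show_topics(num_topics=10, num_words=7))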