Example #1
    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        Uses the parameters set in the constructor.
        This method can be used in two ways:
        * On an unfitted model in which case the model is initialized and trained on `X`.
        * On an already fitted model in which case the model is **updated** by `X`.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.hdp.HdpTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

        if self.gensim_model is None:
            self.gensim_model = models.HdpModel(
                id2word=self.id2word, max_chunks=self.max_chunks,
                max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
                K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
                var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
            )

        self.gensim_model.update(corpus=X)
        return self
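A minimal usage sketch for the two paths described in the docstring above, assuming gensim's sklearn_api wrappers (gensim 3.x); the dictionary and mini-batches (id2word, batch_1, batch_2) are illustrative placeholders, not part of the original example:

from gensim.corpora import Dictionary
from gensim.sklearn_api import HdpTransformer

# Hypothetical mini-batches: each document is a list of (token_id, count) pairs.
batch_1 = [[(0, 2), (3, 1)], [(1, 1), (2, 4)]]
batch_2 = [[(0, 1), (4, 2)]]
id2word = Dictionary([['a', 'b', 'c', 'd', 'e']])  # toy dictionary covering ids 0-4

transformer = HdpTransformer(id2word=id2word)
transformer.partial_fit(batch_1)  # unfitted model: initialized and trained on batch_1
transformer.partial_fit(batch_2)  # already fitted model: updated with batch_2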
Example #2
    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        Calls gensim.models.HdpModel
        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(X)
        else:
            corpus = X

        self.gensim_model = models.HdpModel(corpus=corpus,
                                            id2word=self.id2word,
                                            max_chunks=self.max_chunks,
                                            max_time=self.max_time,
                                            chunksize=self.chunksize,
                                            kappa=self.kappa,
                                            tau=self.tau,
                                            K=self.K,
                                            T=self.T,
                                            alpha=self.alpha,
                                            gamma=self.gamma,
                                            eta=self.eta,
                                            scale=self.scale,
                                            var_converge=self.var_converge,
                                            outputdir=self.outputdir,
                                            random_state=self.random_state)
        return self
Example #3
def main_new_dataset():

    newData = pd.read_csv(
        '../xsense_data/global_dataset_abs_speed_diff_yaw.txt', sep=';')
    newDataToWord = newData.loc[:, [
        'Acc_X', 'Acc_Y', 'Speed_X', 'Speed_Y', 'Diff_Yaw'
    ]]

    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    #wordDataset.to_csv('../xsense_data/word_global_dataset.txt',sep=';')

    docs = worder.create_text_corpus(wordDataset)

    texts = [[i for i in doc.lower().split()] for doc in docs]

    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/new_dataset/doc_dictionary.dict')
    # corpus = corpora.TextCorpus(docs)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/new_dataset/documents.mm',
                               corpus)
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(topics=20, topn=5))

    topicDocs = hdp[corpus]
    for x in topicDocs:
        print(x)
Example #4
def query_similarity(queries, corpus, method='tfidf', n_neighbors=2):
    dictionary, corpusdic = build_corpusdic(corpus)
    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        mdl = models.NormModel(corpusdic, norm='l2')

    else:
        raise ValueError("There is an invalid model method in the input!")
    index = similarities.MatrixSimilarity(mdl[corpusdic])
    indx_list = []
    sim_list = []
    for query in queries:
        vec_bow = dictionary.doc2bow(query.lower().split())
        vec_query = mdl[vec_bow]  # project the query into the selected model's space
        sims = index[vec_query]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        sims = sims[:n_neighbors]
        indx_, sim_ = np.array(sims).transpose()
        indx_list.append(indx_)
        sim_list.append(sim_)
    return indx_list, sim_list
Example #5
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.hdp.HdpTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        else:
            corpus = X

        self.gensim_model = models.HdpModel(
            corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
            max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
            K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
            var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
        )
        return self
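A short sketch of calling fit with a scipy sparse matrix (rows are documents, since documents_columns=False above); the toy counts and the id2word dictionary are assumptions for illustration, not from the original source:

import numpy as np
from scipy import sparse
from gensim.corpora import Dictionary
from gensim.sklearn_api import HdpTransformer

counts = np.array([[2, 0, 1], [0, 3, 1], [1, 1, 0]])  # 3 documents x 3 terms (toy counts)
X = sparse.csr_matrix(counts)
id2word = Dictionary([['alpha', 'beta', 'gamma']])    # toy dictionary for the 3 term ids

model = HdpTransformer(id2word=id2word).fit(X)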
Example #6
    def HDP(self, print_params=True, 
                  save_model=True,
                  save_dir='saved_models',
                  filename='',
                  **kwargs):
        '''
        Estimate a 'good' number of topics to set, based on the data
        '''

        hdp_model = models.HdpModel(self.bow, 
                                    id2word=self.gensim_dict,
                                    **kwargs)

        print('Inferring number of topics with Hierarchical Dirichlet Process...\n')

        if print_params:
            print('Parameters used in model:')
            print('TFIDF transformation: {}\n'.format(self.tfidf))

        if save_model:  
            if len(filename) == 0:
                filename = 'HDP_Params_TFIDF{}_'.format(self.tfidf)                                                      
            
            full_path = save_folder_file(save_dir, filename, ext='.model', optional_folder='HDP')
            hdp_model.save(full_path) 
            print('Saving HDP model to: \n{}\n'.format(full_path))  

        return hdp_model
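One rough way to turn the returned model into a topic-count estimate, sketched here as an assumption rather than the author's method; 'analyzer' stands for a hypothetical instance of the class above and 0.05 is an arbitrary weight cutoff:

hdp_model = analyzer.HDP(save_model=False)

used_topics = set()
for doc in analyzer.bow:                      # self.bow: the BOW corpus used above
    for topic_id, weight in hdp_model[doc]:   # per-document topic weights
        if weight > 0.05:
            used_topics.add(topic_id)
print('Estimated number of topics:', len(used_topics))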
Example #7
def create_model(corpus_path,
                 output_path,
                 num_topics=500,
                 tfidf=False,
                 lda=False,
                 lsi=False,
                 hdp=False):
    """Creates a model(s) specify by the parameters and save to output directory

    Parameters:
        corpus_path: the path to the corpus directory (os.path)
        output_path: the directory path where model(s) will be saved (os.path)
        tfidf=False: True if want a tfidf model created (boolean)
        lda=False: True if want a lda model created (boolean)
        lsi=False: True if want a lsi model created (boolean)
    """
    mc = MathCorpus(corpus_path)
    mc.save_dictionary(os.path.join(output_path, "corpus.dict"))
    corpora.MmCorpus.serialize(os.path.join(output_path, "corpus.mm"), mc)
    tfidf_model = models.TfidfModel(mc)
    if tfidf:
        tfidf_model.save(os.path.join(output_path, "model.tfidf"))
    if lda:
        lda_model = models.LdaModel(mc,
                                    id2word=mc.dictionary,
                                    num_topics=num_topics)
        lda_model.save(os.path.join(output_path, "model.lda"))
    if lsi:
        lsi_model = models.LsiModel(tfidf_model[mc],
                                    id2word=mc.dictionary,
                                    num_topics=num_topics)
        lsi_model.save(os.path.join(output_path, "model.lsi"))
    if hdp:
        hdp_model = models.HdpModel(mc, id2word=mc.dictionary)
        hdp_model.save(os.path.join(output_path, "model.hdp"))
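A hedged call sketch for the function above; the corpus and output paths are placeholders, not taken from the original source:

import os

create_model(corpus_path=os.path.join('data', 'math_corpus'),
             output_path=os.path.join('data', 'models'),
             num_topics=300,
             tfidf=True,
             hdp=True)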
Example #8
    def partial_fit(self, X):
        """
        Train model over X.
        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(X)

        if self.gensim_model is None:
            self.gensim_model = models.HdpModel(id2word=self.id2word,
                                                max_chunks=self.max_chunks,
                                                max_time=self.max_time,
                                                chunksize=self.chunksize,
                                                kappa=self.kappa,
                                                tau=self.tau,
                                                K=self.K,
                                                T=self.T,
                                                alpha=self.alpha,
                                                gamma=self.gamma,
                                                eta=self.eta,
                                                scale=self.scale,
                                                var_converge=self.var_converge,
                                                outputdir=self.outputdir,
                                                random_state=self.random_state)

        self.gensim_model.update(corpus=X)
        return self
Example #9
    def get_hdp(self):
        docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
        model_hdp = models.HdpModel(docs_corpus, id2word=self.docs_dict)
        docs_hdp = model_hdp[docs_corpus]
        docs_vecs = np.vstack(
            [sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
        return docs_vecs
Example #10
    def fit(self, training, training_info):
        # store training sets
        self.training = training
        self.training_info = training_info

        print("creating train tokens")
        train_tokens = training_info["tokens"].apply(
            lambda tokens: tokens.split(" ")).values.tolist()
        print("creating train dict")
        train_my_dict = dictionary.Dictionary(train_tokens)
        print("creating train corpus")
        train_corpus = [train_my_dict.doc2bow(token) for token in train_tokens]
        print("training Hdp model")
        if os.path.isfile('temp/model.hdp') and self.use_pretrained_model:
            self.hdp = models.HdpModel.load('temp/model.hdp')
        else:
            self.hdp = models.HdpModel(train_corpus, id2word=train_my_dict)
            self.hdp.save('temp/model.hdp')
        print("creating train Hdp matrix")
        self.hdp_train_matrix = np.array(
            [self.hdp[document] for document in train_corpus])

        self.address_books = create_address_books(training, training_info)
        self.mids_sender_recipient = create_dictionary_mids(
            training, training_info)
Example #11
def gensim_feature(corpus=None):

    # Sample data for the corpus parameter:
    corpus = [["我", "来到", "成都", "春熙路"],
              ["今天", "在", "宽窄巷子", "耍", "了", "一天"],
              ["成都", "整体", "来说", "还是", "挺", "安逸", "的"],
              ["成都", "的", "美食", "真", "巴适", "惨", "了"]]
    dictionary = corpora.Dictionary(corpus)  # build the corpus dictionary

    # # collect the ids of stopwords and of words that appear only once
    # stop_ids = [dictionary.token2id[stopword] for stopword in user_stop_word_list if stopword in dictionary.token2id]
    # once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    # dictionary.filter_tokens(stop_ids + once_ids)  # remove stopwords and words that appear only once
    # dictionary.compactify()  # close the id gaps left after removing words
    # dictionary.save('mycorpus.dict')  # save the dictionary for later reuse

    # document-frequency statistics
    dfs = dictionary.dfs  # document frequency for each token id
    for key_id, c in dfs.items():
        print(dictionary[key_id], c)

    # convert each document to doc_bow
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]

    # generate tfidf features
    tfidf_model = models.TfidfModel(dictionary=dictionary)  # build the tfidf model
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert each doc_bow into its tfidf vector

    # generate lsi features (latent semantic indexing)
    lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)  # build the lsi model
    # build the lsi corpus
    lsi_corpus = [lsi_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to lsi vectors

    # generate lda features (topic model)
    lda_model = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)  # build the lda model
    # build the lda corpus
    lda_corpus = [lda_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to lda vectors

    # Random Projections (RP): reduces dimensionality, light on CPU and memory
    rp_model = models.RpModel(tfidf_corpus, num_topics=500)
    rp_corpus = [rp_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # project the tfidf vectors with random projections

    # Hierarchical Dirichlet Process (HDP), a non-parametric Bayesian method
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert to HDP vectors

    # document vectors and word vectors (Doc2Vec and Word2Vec)
    tld_list = []
    for ind, line_list in enumerate(corpus):
        tld_list.append(TaggedDocument(line_list, tags=[str(ind)]))
    d2v_model = Doc2Vec(tld_list, min_count=2, window=3, size=100, sample=1e-3, negative=5,iter=15)
    # Doc2Vec training also trains Word2Vec, so both models are available and can be saved:
    # model.save(save_model_d2v_file_path)
    # model.save_word2vec_format(save_model_w2v_file_path, binary=True)

    # turn the documents into a matrix of document vectors
    docvecs = d2v_model.docvecs
    docvecs_matrix = np.asarray(docvecs)
    print(docvecs_matrix.shape)
Example #12
def get_model(dictionary, corpus):
    """
    Return a trained topic-modelling model (Hierarchical Dirichlet Process).
    It requires gensim objects (dictionary and corpus).
    :param dictionary: gensim object
    :param corpus: gensim object
    :return: model object
    """
    return models.HdpModel(corpus, id2word=dictionary)
Example #13
    def train(self):
        """
        Train the HDP model on the stored corpus and dictionary.
        :return: trained model
        """
        hdp_model = models.HdpModel(self.corpus, id2word=self.dictionary)
        return hdp_model
Example #14
    def init_HDP(self, tf_idf='No'):
        if tf_idf == 'Yes':
            corpus, BOW_user_queries = self.init_tfidf()
        else:
            corpus, BOW_user_queries = self.get_corpus()
        HDP = models.HdpModel(corpus, id2word=self.dictionary)
        #print(HDP.show_topics())
        corpus_HDP = HDP[corpus]
        HDP_user_queries = HDP[BOW_user_queries]
        return corpus_HDP, HDP_user_queries
Example #15
def create_model(settings, model_type, bow_corpus, dictionary):
    print(f"Training {model_type} model. This may take several minutes depending on the size of the corpus.")
    model = None
    if model_type == 'LDA':
        model = models.LdaModel(bow_corpus, num_topics=settings['numberTopics'], id2word=dictionary, minimum_probability=settings['minimumProbability'])
    elif model_type == 'HDP':
        model = models.HdpModel(bow_corpus, dictionary)
    else:
        print('Invalid model')
        return
    save_model(settings['datasetName'], model, model_type)
    return model
Example #16
def main():

    newData = pd.read_csv('../xsense_data/global_dataset.txt', sep=';')

    ###############################LONG WORD TRY ###############################
    ############################### 15 SIGNALS   ###############################
    ## Choose features to represent as words
    ## All signals, excluding altitude
    ## dataPartOne = newData.loc[:, 'Acc_X':'Pitch']
    ## dataPartTwo = newData.loc[:, 'Speed_X':'Speed_Z']

    ## newDataToWord = pd.concat([dataPartOne, dataPartTwo], axis=1)
    ###############################REDUCED WORD TRY ###############################
    ############################### 5 SIGNALS       ###############################
    newDataToWord = newData.loc[:,
                                ['Acc_X', 'Acc_Y', 'Acc_Z', 'Speed_X', 'Roll']]

    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)

    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')

    docs = worder.create_text_corpus(wordDataset)

    #docs = ['aaabacdb abababdb addbaedb daecabdb badbccdb',
    #		'aeaaacdb abebabdb acdbaedc dbecadda addbbccb',
    #		'aeaaacdb abebabdb acdbaedc dbecadda addbbccb']

    texts = [[i for i in doc.lower().split()] for doc in docs]

    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/doc_dictionary.dict')
    # corpus = corpora.TextCorpus(docs)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/documents.mm', corpus)
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(topics=20, topn=5))

    topicDocs = hdp[corpus]
    for x in topicDocs:
        print(x)

    alpha, beta = hdp.hdp_to_lda()
    print(alpha)
    lda_model = models.LdaModel(id2word=hdp.id2word,
                                num_topics=len(alpha),
                                alpha=alpha,
                                eta=hdp.m_eta)
    lda_model.expElogbeta = np.array(beta, dtype=np.float32)
    print(lda_model.show_topic(1))
Example #17
    def HDP(self, **config):
        gamma = config['gamma']
        kappa = config['kappa']
        tau = config['tau']
        K = config['K']
        T = config['T']
        eta = config['eta']
        self.model = models.HdpModel(self._dictionary.corpus,
                                     id2word=self._dictionary,
                                     gamma=gamma,
                                     kappa=kappa,
                                     tau=tau,
                                     K=K,
                                     T=T,
                                     eta=eta)
Example #18
    def transform(self, model='lda', ntopics=1, num_passes=1):
        #tfidf = models.TfidfModel(corpus=self.bag_of_word)
        #corpus_tfidf = tfidf[self.bag_of_word]
        ##return corpus_tfidf

        if model == 'lda':
            return models.LdaModel(self.bag_of_word,
                                   num_topics=ntopics,
                                   id2word=self.__dictionary,
                                   passes=num_passes,
                                   chunksize=10000,
                                   update_every=0,
                                   distributed=True)
        elif model == 'hdp':
            return models.HdpModel(self.bag_of_word, id2word=self.__dictionary)
Example #19
def calculate_topic_distribution():
    manage_nyt_dataset.topicsecription.remove({})
    #dictionary = corpora.Dictionary.load('tmp/dictionary.dict')
    #corp = corpora.BleiCorpus('tmp/corpus_nyt.hdp-c')
    corpus, dictionary = create_dictionary()
    hdpmodel = models.HdpModel(corpus, id2word=dictionary)

    hdpmodel_corp = hdpmodel[corpus]
    create_topic_document_distribution(hdpmodel_corp)

    for k in hdpmodel.show_topics(topn=10, topics=-1, formatted=False):
        manage_nyt_dataset.topicsecription.insert_one({
            'topic': k[0],
            'tuple_terms': k[1]
        })
    return
Example #20
def buildCorpus():
    from gensim import corpora, models, similarities
    import logging
    from getDocSparseVector import getDocumentCorpus, cleanAndTokenize
    import pickle

    directory = "/Users/Larry/Code/EpistemicAssistant/sampleWordDocs/"
    #Imports a set of comparison documents and tokenizes them
    #Should not need to rebuild the corpus at each request...
    documents = getDocumentCorpus(directory)  #Get document objects
    texts = []
    for doc in documents:
        texts.append(doc.tokenizedText)

    documentDictionary = corpora.Dictionary(texts)
    corpus = [documentDictionary.doc2bow(text) for text in texts]
    #Computes the HDP/nonparametric topic models
    if 'hdp' in locals():
        print('HDP already built. Using existing model')
    else:
        hdp = models.HdpModel(corpus, id2word=documentDictionary)

    pickle.dump(
        corpus,
        open(
            "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/corpus.p",
            "wb"))  #Save corpus

    pickle.dump(
        documentDictionary,
        open(
            "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/documentDictionary.p",
            "wb"))  #Save documentDictionary

    pickle.dump(
        hdp,
        open(
            "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/hdp.p",
            "wb"))  #Save Hdp

    pickle.dump(
        documents,
        open(
            "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/documents.p",
            "wb"))  #Save documents
Example #21
def LDA_LSI_hda_code(texts):
    dictionary = corpora.Dictionary(texts)
    print (dictionary)
    V = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    corpus_tfidf = corpus

    print ('TF-IDF:')
    for c in corpus_tfidf:
        print (c)

    print ('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=20, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]
    pprint(topic_result)
    print ('LSI Topics:')
    pprint(lsi.print_topics(num_topics=20, num_words=10))
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])   # similarities.Similarity()
    print ('Similarity:')
    pprint(list(similarity))

    print ('\nLDA Model:')
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha='auto', eta='auto', minimum_probability=0.001, passes=10)
    doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
    print ('Document-Topic:\n')
    pprint(doc_topic)
    for doc_topic in lda.get_document_topics(corpus_tfidf):
        print (doc_topic)
    for topic_id in range(num_topics):
        print ('Topic', topic_id)
        # pprint(lda.get_topic_terms(topicid=topic_id))
        pprint(lda.show_topic(topic_id))
    similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
    print ('Similarity:')
    pprint(list(similarity))

    hda = models.HdpModel(corpus_tfidf, id2word=dictionary)
    topic_result = [a for a in hda[corpus_tfidf]]
    print ('\n\nUSE WITH CARE--\nHDA Model:')
    pprint(topic_result)
    print ('HDA Topics:')
    print (hda.print_topics(num_topics=20, num_words=10))
Example #22
def startNLP(modelType):
    #This builds the corpus and the model, etc. It is also possible to use these things prebuilt
    from gensim import corpora, models
    import logging
    from getDocSparseVector import getDocumentCorpus
    #Declare globals
    #global documents, corpus, documentDictionary, hdp

    #reload(getDocSparseVector)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    #Use hierarchical Dirichlet process topic modeling from gensim to compute the relevance between documents

    directory = "/Users/Larry/Code/EpistemicAssistant/sampleWordDocs/"
    #Imports a set of comparison documents and tokenizes them
    #Should not need to rebuild the corpus at each request...
    documents = getDocumentCorpus(directory)  #Get document objects
    texts = []
    for doc in documents:
        texts.append(doc.tokenizedText)

    documentDictionary = corpora.Dictionary(texts)
    corpus = [documentDictionary.doc2bow(text) for text in texts]

    #Computes the HDP/nonparametric topic models
    if modelType == 'hdp':
        currentModel = models.HdpModel(corpus, id2word=documentDictionary)
    elif modelType == 'tfidf':
        #hdp = models.HdpModel(corpus, id2word=documentDictionary)
        currentModel = models.TfidfModel(corpus, id2word=documentDictionary)
    elif modelType == 'lda':
        currentModel = models.LdaModel(
            corpus, id2word=documentDictionary,
            num_topics=200)  #Should try to figure out a good number of topics
    else:
        raise ValueError(modelType + ' not yet supported')

    return {
        'documents': documents,
        'corpus': corpus,
        "documentDictionary": documentDictionary,
        "currentModel": currentModel
    }
Example #23
def gensim_Corpus(corpus=None):
    dictionary = corpora.Dictionary(corpus)
    # 1. convert doc_bow into tfidf vectors
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]
    print('doc_bow converted to the corresponding tfidf vectors:\n', tfidf_corpus)

    # 2. Hierarchical Dirichlet Process (HDP), a non-parametric Bayesian method
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert to HDP vectors
    print('HDP :\n', hdp_corpus)

    # 3. save the HDP model to disk
    savepath = r'../dataSet/files/hdp_model.pkl'
    hdp_file = open(savepath, 'wb')
    pkl.dump(hdp_model, hdp_file)
    hdp_file.close()
    print('--- HDP model generated ---')
Example #24
def topics(documents,
           dictionary,
           strategy='lda',
           num_topics=3,
           iterations=50,
           passes=1,
           **kwargs):
    """
    Strategies and best practices are:
    "lsi" - latent semantic indexing. Documents = tfidf_corpus. Num is 200-500 topics.
    "lda" - latent dirichlet analyisis. Documents = corpus. Num is expert driven.
    "rp" - Random projections. Documents = tfidf_corpus, Num is 100-10000
    "hdp" - Hierarchical Dirichlet Process = corpus. Num is not used.
    """
    if strategy == "lsi":
        model = models.LsiModel(documents,
                                id2word=dictionary,
                                num_topics=num_topics,
                                iterations=iterations,
                                passes=passes,
                                **kwargs)

    if strategy == "lda":
        model = models.LdaModel(documents,
                                id2word=dictionary,
                                num_topics=num_topics,
                                iterations=iterations,
                                passes=passes,
                                **kwargs)

    if strategy == "rp":
        model = models.RpModel(documents,
                               num_topics=num_topics,
                               iterations=iterations,
                               passes=passes,
                               **kwargs)

    if strategy == "hdp":
        model = models.HdpModel(documents, id2word=dictionary, **kwargs)
    results = model[documents]
    return model, results
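A minimal sketch of calling topics() with the HDP strategy; the toy texts and variable names are illustrative assumptions, not from the original source:

from gensim import corpora

texts = [['topic', 'modeling', 'with', 'gensim'],
         ['hierarchical', 'dirichlet', 'process'],
         ['gensim', 'dirichlet', 'topics']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

hdp_model, doc_topics = topics(corpus, dictionary, strategy='hdp')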
Example #25
    def _create_models(self):
        print("Create models ...")

        if self._model_type == "lda":
            self._topic_model = models.LdaModel(self._corpus,
                                                id2word=self._wdict,
                                                num_topics=self._num_topics)
        elif self._model_type == "lsi":
            self._topic_model = models.LsiModel(self._corpus,
                                                id2word=self._wdict,
                                                num_topics=self._num_topics)
        elif self._model_type == "hdp":
            self._topic_model = models.HdpModel(self._corpus,
                                                id2word=self._wdict)
        elif self._model_type == "none":
            self._topic_model = NullModel(self._corpus, id2word=self._wdict)
        else:
            raise SyntaxError("Invalid model_type '%s'" % self._model_type)

        self._index = similarities.MatrixSimilarity(
            self._topic_model[self._corpus])
Example #26
def train_hdp_model(corpus, num_topics, id2word):
    print('Training HDP model...')

    hdp_output = open('models/hdp.txt', 'w')

    hdp = models.HdpModel(corpus, id2word=id2word)
    topic_result = [a for a in hdp[corpus]]

    print('HDP Model:', file=hdp_output)
    pprint(topic_result, stream=hdp_output)

    print('\nHDP Topics:', file=hdp_output)
    print(hdp.print_topics(num_topics=num_topics, num_words=5),
          file=hdp_output)

    print('Visualizing HDP similarity...')
    similarity = list(similarities.MatrixSimilarity(hdp[corpus]))
    print('\nSimilarity:', file=hdp_output)
    pprint(similarity, stream=hdp_output)
    draw_graph(similarity, 0.99, 'visualization/hdp_similarity.png')
    return similarity
Example #27
def build_model(dataset, num_topics=100, is_hdp=True):
    print("generating dictionary and corpus...")
    dic = corpora.Dictionary(dataset)
    dic.filter_extremes(no_below=2)  # remove low-frequency tokens
    corpus = [dic.doc2bow(text) for text in dataset]
    print("constructing LDA model...")
    if is_hdp:
        hdp = models.HdpModel(corpus, id2word=dic)
        (alpha, beta) = hdp.hdp_to_lda()
        model = models.LdaModel(id2word=hdp.id2word,
                                num_topics=len(alpha),
                                alpha=alpha,
                                eta=hdp.m_eta)
        model.expElogbeta = np.array(beta, dtype=np.float32)
        num_topics = len(alpha)
    else:
        model = models.LdaMulticore(corpus, id2word=dic, num_topics=num_topics)
    print("saving model...")
    dic.save_as_text("topic_model/dic.txt")
    corpora.MmCorpus.serialize("topic_model/corpus.mm", corpus)
    model.save('topic_model/model.lda')

    return model, num_topics
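A short call sketch for build_model, assuming 'dataset' is a list of tokenized documents and that the topic_model/ directory already exists; both assumptions are mine, not the original author's:

dataset = [['gensim', 'topic', 'model'],      # toy tokenized documents
           ['topic', 'model', 'dirichlet'],
           ['gensim', 'dirichlet', 'process']]
model, num_topics = build_model(dataset, is_hdp=True)
print('HDP suggested {} topics'.format(num_topics))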
Example #28
    def get_hdp(self):
        hdp = models.HdpModel(self.gs_corpus, self.gs_dict)
        hdp_topics = hdp.get_topics()
        hdp_df = pd.DataFrame(hdp_topics)
        hdp_dfn = pd.DataFrame(hdp_df.unstack())
        hdp_dfn.reset_index(inplace=True)
        hdp_dfn.columns = ['token_id', 'topic_id', 'token_freq']
        self.db.put_table(hdp_dfn, 'hdp', if_exists='replace')

        # todo: Go the next step and extract topics whose word freqs are above a threshold
        thresh = 0.0005
        # Sometimes it's easier to use SQL than to figure out how to do
        # something like this in Pandas
        sql = """
        SELECT topic_id, GROUP_CONCAT(token_str, ' ') AS top_words
        FROM ( SELECT topic_id, token_id FROM hdp WHERE token_freq > {} ORDER BY topic_id, token_freq DESC )
        JOIN token USING (token_id)
        GROUP BY topic_id
        """.format(thresh)
        hdp_topics = pd.read_sql_query(sql, self.db.conn)
        self.db.put_table(hdp_topics, 'hdp_topics')

        thresh = 0.005  # Note this is different from what's in config.ini
Example #29
    def identify_topics(self, labels, texts, verbose=False):
        if verbose:
            print('\tStart identifying topics ...')
        s_time = datetime.now()

        self.label_codes = np.unique(labels)
        for idx, l_code in enumerate(self.label_codes):
            if l_code != -1:
                all_tweets_of_cluster = " ".join(texts[labels == l_code])
                self.all_tweets_of_clusters.append(all_tweets_of_cluster)
                self.cleaned.append(all_tweets_of_cluster.split(' '))

        dictionary = corpora.Dictionary(self.cleaned)
        self.corpus = [
            dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned
        ]
        self._model = models.HdpModel(self.corpus, dictionary)
        self.num_topics = self._model.get_topics().shape[0]
        if self._model is not None:
            for i, topic in self._model.show_topics(formatted=True,
                                                    num_topics=self.num_topics,
                                                    num_words=10):
                self.topics.append(topic)

            for i, topic in self._model.show_topics(formatted=False,
                                                    num_topics=self.num_topics,
                                                    num_words=10):
                self.topics_not_formatted.append(topic)

        if len(self.topics) < 5:
            print()
        dur = datetime.now() - s_time
        if verbose:
            print('\tIdentifying topics was finished ({} seconds).'.format(
                dur.seconds))
        pass
Example #30
def get_hdp(*args, **kwargs):
    return models.HdpModel(*args, **kwargs)