Example 1
def query_similarity(queries, corpus, method='tfidf', n_neighbors=2):
    dictionary, corpusdic = build_corpusdic(corpus)
    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        mdl = models.NormModel(corpusdic, norm='l2')
    else:
        raise ValueError("Invalid model method: %r" % method)
    index = similarities.MatrixSimilarity(mdl[corpusdic])
    indx_list = []
    sim_list = []
    for query in queries:
        vec_bow = dictionary.doc2bow(query.lower().split())
        vec_query = mdl[vec_bow]  # map the query into the chosen model's space
        sims = index[vec_query]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        sims = sims[:n_neighbors]
        indx_, sim_ = np.array(sims).transpose()
        indx_list.append(indx_)
        sim_list.append(sim_)
    return indx_list, sim_list
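A minimal usage sketch; ``build_corpusdic`` is not shown above, so a stand-in is assumed here that returns a gensim Dictionary plus the bag-of-words corpus:

import numpy as np
from gensim import corpora, models, similarities

def build_corpusdic(corpus):
    # assumed helper: tokenize and build the Dictionary / BOW corpus expected above
    texts = [doc.lower().split() for doc in corpus]
    dictionary = corpora.Dictionary(texts)
    return dictionary, [dictionary.doc2bow(text) for text in texts]

docs = ["human machine interface", "graph of trees", "machine interface survey"]
indx_list, sim_list = query_similarity(["machine interface"], docs, method='tfidf')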
Example 2
 def get_rp(self):
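     # assumes: import numpy as np; from gensim.matutils import sparse2full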
     docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
     model_rp = models.RpModel(docs_corpus, id2word=self.docs_dict)
     docs_rp = model_rp[docs_corpus]
     docs_vecs = np.vstack(
         [sparse2full(c, len(self.docs_dict)) for c in docs_rp])
     return docs_vecs
Example 3
    def create_gensim_model(self, corpus):
        """
        Create a gensim model

        Parameters
        ----------

        corpus : an object satisfying the gensim TextCorpus interface

        Returns
        -------
        
        gensim corpus model
        """
        dictionary = corpus.get_dictionary()
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        if self.model_type == 'gensim_lsi':
            logger.info("Building gensim lsi model")
            model = models.LsiModel(corpus_tfidf,
                                    id2word=dictionary,
                                    num_topics=self.vec_size)
        elif self.model_type == 'gensim_lda':
            logger.info("Building gensim lda model")
            model = models.LdaModel(corpus_tfidf,
                                    id2word=dictionary,
                                    num_topics=self.vec_size,
                                    passes=10)
        else:
            logger.info("Building gensim random projection model")
            model = models.RpModel(corpus_tfidf,
                                   id2word=dictionary,
                                   num_topics=self.vec_size)
        return model[corpus_tfidf]
Example 4
    def create_documents_view(self, corpus, ir_mode):
        dictionary, pdocs = self.create_dictionary(corpus)
        bow = self.docs2bows(corpus, dictionary, pdocs)
        loaded_corpus = corpora.MmCorpus('vsm_docs.mm')  # Recover the corpus

        if ir_mode == 1:
            model = [[(w[0], 1 + np.log2(w[1])) for w in v]
                     for v in bow]  # TF model
        elif ir_mode == 2:
            model = models.TfidfModel(loaded_corpus)  # TF IDF model
        elif ir_mode == 3:
            model = models.LdaModel(loaded_corpus)  # LDA model
        elif ir_mode == 4:
            model = models.LdaMulticore(loaded_corpus)  # LDA Multicore model
        elif ir_mode == 5:
            model = models.LsiModel(loaded_corpus)  # LSI model
        elif ir_mode == 6:
            model = models.RpModel(loaded_corpus)  # RP model
        elif ir_mode == 7:
            model = models.LogEntropyModel(loaded_corpus)  # LogEntropy model
        else:
            raise ValueError("ir_mode must be an integer from 1 to 7")

        # Note: mode 1 returns transformed vectors, while modes 2-7 return the model itself
        return model, dictionary
Example 5
 def fit(self, X, y=None):
     """
     Fit the model according to the given training data.
     Calls gensim.models.RpModel
     """
     self.gensim_model = models.RpModel(corpus=X, id2word=self.id2word, num_topics=self.num_topics)
     return self
Example 6
def gensim_feature(corpus=None):

    # sample value of the corpus parameter:
    corpus = [["我", "来到", "成都", "春熙路"],
              ["今天", "在", "宽窄巷子", "耍", "了", "一天"],
              ["成都", "整体", "来说", "还是", "挺", "安逸", "的"],
              ["成都", "的", "美食", "真", "巴适", "惨", "了"]]
    dictionary = corpora.Dictionary(corpus)  # build the dictionary over the corpus

    # # Collect the ids of stopwords and of words that occur only once
    # stop_ids = [dictionary.token2id[stopword] for stopword in user_stop_word_list if stopword in dictionary.token2id]
    # once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    # dictionary.filter_tokens(stop_ids + once_ids)  # remove stopwords and words occurring only once
    # dictionary.compactify()  # close the id gaps left by the removed words
    # dictionary.save('mycorpus.dict')  # save the dictionary for later reuse

    # document-frequency statistics
    dfs = dictionary.dfs  # mapping: token id -> document frequency
    for key_id, c in dfs.items():
        print(dictionary[key_id], c)

    # convert each document to bag-of-words
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]

    # TF-IDF features
    tfidf_model = models.TfidfModel(dictionary=dictionary)  # build the tfidf model
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert each doc_bow into its tfidf vector

    # LSI features (Latent Semantic Indexing)
    lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)  # build the LSI model
    # build the LSI corpus
    lsi_corpus = [lsi_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to LSI vectors

    # LDA features (topic model)
    lda_model = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)  # build the LDA model
    # build the LDA corpus
    lda_corpus = [lda_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to LDA vectors

    # Random Projections (RP): reduces dimensionality; both CPU- and memory-friendly
    rp_model = models.RpModel(tfidf_corpus, num_topics=500)
    rp_corpus = [rp_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # project the tfidf vectors

    # Hierarchical Dirichlet Process (HDP), a non-parametric Bayesian method
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert to HDP vectors

    # Document and word vectors (Doc2Vec and Word2Vec)
    tld_list = []
    for ind, line_list in enumerate(corpus):
        tld_list.append(TaggedDocument(line_list, tags=[str(ind)]))
    # older gensim API: `size`/`iter`; newer versions use `vector_size`/`epochs`
    d2v_model = Doc2Vec(tld_list, min_count=2, window=3, size=100, sample=1e-3, negative=5, iter=15)
    # Doc2Vec training also trains Word2Vec internally, so both models can be saved:
    # model.save(save_model_d2v_file_path)
    # model.save_word2vec_format(save_model_w2v_file_path, binary=True)

    # Stack the document vectors into a matrix
    docvecs = d2v_model.docvecs
    docvecs_matrix = np.asarray(docvecs)
    print(docvecs_matrix.shape)
Example 7
def rp():
    '''
    Random Projections (RP) aim to reduce vector-space dimensionality.
    It is a very efficient approach that approximates the TF-IDF distances
    between documents by throwing in a little randomness.
    The recommended target dimensionality is in the hundreds/thousands,
    depending on your dataset.
    '''
    corpus_tfidf = tfidf()
    rp_model = models.RpModel(corpus_tfidf, num_topics=2)
    corpus_rp = rp_model[corpus_tfidf]
    pprint(list(corpus_rp))
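rp() relies on a tfidf() helper that is not shown; a minimal stand-in, assuming the usual gensim tutorial pipeline where it returns the TF-IDF-transformed corpus:

from pprint import pprint
from gensim import corpora, models

def tfidf():
    # assumed helper: build a tiny BOW corpus and wrap it in a TF-IDF model
    texts = [["human", "interface", "computer"],
             ["graph", "trees"],
             ["graph", "minors", "trees"]]
    dictionary = corpora.Dictionary(texts)
    bow = [dictionary.doc2bow(text) for text in texts]
    return models.TfidfModel(bow)[bow]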
Example 8
 def init_rand_proj(self, tf_idf='No', num_topics=10):
     if tf_idf == 'Yes':
         corpus, BOW_user_queries = self.init_tfidf()
     else:
         corpus, BOW_user_queries = self.get_corpus()
     rand_proj = models.RpModel(corpus,
                                id2word=self.dictionary,
                                num_topics=num_topics)
     corpus_rand_proj = rand_proj[corpus]
     rand_proj_user_queries = rand_proj[BOW_user_queries]
     return corpus_rand_proj, rand_proj_user_queries
Example 9
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                     min_df=2, stop_words='english',
                                     use_idf=True)

    text = []
    with open(sample_file, 'r', encoding='utf-8') as f:
        text = f.readlines()

    t0 = time()
    print("Extracting features from the training dataset using a sparse vectorizer")
    X = vectorizer.fit_transform(text)
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    
    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word

    t0 = time()
    print("Applying topic modeling, using Random Projections")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    rp = models.RpModel(corpus, num_topics=K_clusters, id2word=id2words)
    print("done in %fs" % (time() - t0))
        
    output_text = []
    # NB: this assumes the model exposes show_topics; gensim's RpModel
    # (unlike LsiModel/LdaModel) may not provide it, in which case an
    # LSI or LDA model should be substituted here.
    for i, item in enumerate(rp.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
        for term, weight in item[1]:
            output_text.append(",".join(["Topic " + str(i), term, str(weight)]))

    print ("writing topics to file:", outputfile)
    with open ( outputfile, 'w' ) as f:
        f.write('\n'.join(output_text))
Example 10
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Input corpus in BOW format.

        Returns
        -------
        :class:`~gensim.sklearn_api.rpmodel.RpTransformer`
            The trained model.

        """
        self.gensim_model = models.RpModel(corpus=X,
                                           id2word=self.id2word,
                                           num_topics=self.num_topics)
        return self
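A usage sketch for this transformer, assuming the surrounding class is gensim's sklearn-API wrapper (gensim.sklearn_api.RpTransformer, available in gensim 3.x):

from gensim import corpora
from gensim.sklearn_api import RpTransformer

texts = [["human", "interface"], ["graph", "trees"], ["graph", "minors", "trees"]]
id2word = corpora.Dictionary(texts)
bow = [id2word.doc2bow(text) for text in texts]

transformer = RpTransformer(id2word=id2word, num_topics=2)
vecs = transformer.fit(bow).transform(bow)  # dense array, one 2-d vector per document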
Example 11
def gensim_Corpus(corpus=None):
    dictionary = corpora.Dictionary(corpus)
    # 1. convert each doc_bow to a tfidf vector
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]
    tfidf_model = models.TfidfModel(dictionary=dictionary)  # build the tfidf model
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]
    print('tfidf vector for each doc_bow:\n', tfidf_corpus)

    # 2. build the RP corpus
    rp_model = models.RpModel(tfidf_corpus, num_topics=20)
    rp_corpus = [rp_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # project the tfidf vectors
    print('RP:\n', rp_corpus)

    # 3. persist the RP model to disk
    savepath = r'../dataSet/files/rp_model.pkl'
    with open(savepath, 'wb') as rp_file:
        pkl.dump(rp_model, rp_file)
    print('--- RP model saved ---')
Example 12
def topics(documents,
           dictionary,
           strategy='lda',
           num_topics=3,
           iterations=50,
           passes=1,
           **kwargs):
    """
    Strategies and best practices are:
    "lsi" - latent semantic indexing. Documents = tfidf_corpus. Num is 200-500 topics.
    "lda" - latent dirichlet analyisis. Documents = corpus. Num is expert driven.
    "rp" - Random projections. Documents = tfidf_corpus, Num is 100-10000
    "hdp" - Hierarchical Dirichlet Process = corpus. Num is not used.
    """
    if strategy == "lsi":
        model = models.LsiModel(documents,
                                id2word=dictionary,
                                num_topics=num_topics,
                                iterations=iterations,
                                passes=passes,
                                **kwargs)

    if strategy == "lda":
        model = models.LdaModel(documents,
                                id2word=dictionary,
                                num_topics=num_topics,
                                iterations=iterations,
                                passes=passes,
                                **kwargs)

    if strategy == "rp":
        model = models.RpModel(documents,
                               num_topics=num_topics,
                               iterations=iterations,
                               passes=passes,
                               **kwargs)

    if strategy == "hdp":
        model = models.HdpModel(documents, id2word=dictionary, **kwargs)
    results = model[documents]
    return model, results
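A usage sketch following the docstring's conventions (a TF-IDF corpus for "rp"):

from gensim import corpora, models

texts = [["human", "interface"], ["graph", "trees"], ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf_corpus = models.TfidfModel(corpus)[corpus]

model, results = topics(tfidf_corpus, dictionary, strategy='rp', num_topics=2)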
Example 13
model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)


# In[112]:

model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model

model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]


# In[114]:

model = models.RpModel(corpus_tfidf, num_topics=500)
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
model = models.HdpModel(corpus, id2word=dictionary)


# ### Similarity Queries

# In[118]:

from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm') # comes from the first tutorial, "From strings to vectors"
print(corpus)

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
Example 14
    def get_topic_model(corpus,
                        trans_method,
                        dictionary,
                        ntopics,
                        save_trans_model=False,
                        save_trans_name=None):
        """Obtain a topic model based on one of the transformation method

        :param corpus: the corpus, a list of bag of words
        :param trans_method: the method for transformation
        :param dictionaory: the gensim dictionary
        :param ntopics: number of topics
        :param save_trans_model: boolean if saving transformed
        :param save_trans_name: string, the file name of the
        transformed vector
        :returns: the topic/transformation model
        :rtype:

        """
        if trans_method == 'TFIDF':
            tfidf = models.TfidfModel(corpus)  # initialize a tfidf model
            if save_trans_model:
                if save_trans_name is not None:
                    tfidf.save(save_trans_name)
                else:
                    sys.exit('get_topic_model: no file name specified '
                             'for topic model!')
            return tfidf
        elif trans_method == 'LSI':
            tfidf = models.TfidfModel(corpus)  # initialize a tfidf model
            corpus_tfidf = tfidf[corpus]
            lsi = models.LsiModel(corpus_tfidf,
                                  id2word=dictionary,
                                  num_topics=ntopics)
            # corpus_lsi = lsi[corpus_tfidf]
            if save_trans_model:
                if save_trans_name is not None:
                    lsi.save(save_trans_name)
                else:
                    sys.exit('get_topic_model: no file name specified '
                             'for topic model!')
            return lsi
        elif trans_method == 'LDA':
            lda = models.LdaModel(corpus,
                                  id2word=dictionary,
                                  num_topics=ntopics)
            if save_trans_model:
                if save_trans_name is not None:
                    lda.save(save_trans_name)
                else:
                    sys.exit('get_topic_model: no file name specified '
                             'for topic model!')
            return lda
        elif trans_method == 'HDP':
            hdp = models.HdpModel(corpus, id2word=dictionary)
            if save_trans_model:
                if save_trans_name is not None:
                    hdp.save(save_trans_name)
                else:
                    sys.exit('get_topic_model: no file name specified '
                             'for topic model!')
            return hdp
        elif trans_method == 'RP':
            tfidf = models.TfidfModel(corpus)  # initialize a tfidf model
            corpus_tfidf = tfidf[corpus]
            rp = models.RpModel(corpus_tfidf, num_topics=ntopics)
            if save_trans_model:
                if save_trans_name is not None:
                    rp.save(save_trans_name)
                else:
                    sys.exit('get_topic_model: no file name specified '
                             'for topic model!')
            return rp
        else:
            sys.exit('get_topic_model: topic method {0} is not valid!'.format(
                trans_method))
Example 15
def text_sim(processed_text_lst, fre_limit, mode):

    # Step 1: build the data format gensim needs
    stoplist = []  # a stopword list can be loaded here
    texts = [[word for word in document.split() if word not in stoplist]
             for document in processed_text_lst]
    num_text = len(texts)
    # print("number of documents:", num_text)

    # Step 2: count word frequencies and filter rare words
    frequency = defaultdict(int)
    # iterate over the tokenized texts and count each token's frequency
    for text in texts:
        for token in text:
            frequency[token] += 1
    # keep only tokens with frequency above fre_limit (e.g. > 1)
    texts = [[token for token in text if frequency[token] > fre_limit]
             for text in texts]

    # Step 3: build the dictionary (mapping between words and ids)
    dictionary = corpora.Dictionary(texts)
    # e.g. Dictionary(12 unique tokens: ['time', 'computer', 'graph', 'minors', 'trees']...)
    # dictionary.token2id maps each word to its id

    # Step 4: build the corpus, converting each document to a BOW vector
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Step 5: initialize the model
    if mode == 'tfidf':
        # a tfidf model converts bag-of-words integer counts
        # into real-valued TF-IDF weights
        tfidf = models.TfidfModel(corpus)
        # convert the whole corpus to the TF-IDF representation
        corpus_convert = tfidf[corpus]
        index = similarities.MatrixSimilarity(corpus_convert,
                                              num_features=len(dictionary))
        return corpus_convert, index

    elif mode == 'lsi':
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
        corpus_convert = lsi[corpus]
        index = similarities.MatrixSimilarity(corpus_convert)
        return corpus_convert, index

    elif mode == 'rp':
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        rp = models.RpModel(corpus_tfidf, num_topics=100)
        corpus_convert = rp[corpus_tfidf]
        index = similarities.MatrixSimilarity(corpus_convert)
        return corpus_convert, index

    elif mode == 'lda':
        lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
        corpus_convert = lda[corpus]
        index = similarities.MatrixSimilarity(corpus_convert)
        return corpus_convert, index

    elif mode == 'hdp':
        hdp = models.HdpModel(corpus, id2word=dictionary)
        corpus_convert = hdp[corpus]
        index = similarities.MatrixSimilarity(corpus_convert)
        return corpus_convert, index
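A usage sketch (the function assumes ``defaultdict`` from collections plus gensim's corpora, models and similarities are imported):

docs = ["graph of trees", "graph minors trees", "human machine interface"]
corpus_rp, index = text_sim(docs, fre_limit=0, mode='rp')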
Example 16
    def text_represiontation(self, dict_save_path, text_lst=None, rep_type='tfidf'):

        # build the dictionary
        dictionary = self.create_vocab(text_lst=text_lst, dict_save_path=dict_save_path)
        vocab = dictionary.token2id
        print("vocabulary size:", len(vocab))
        # feature selection
        vectorizer, feature_matrix = self.build_feature_matrix(documents=text_lst, feature_type='frequency')
        # feature_matrix is a scipy.sparse.csr_matrix, e.g. of shape (176, 2789)
        # convert the matrix to a gensim corpus
        corpus = gensim.matutils.Sparse2Corpus(feature_matrix.T)
        # text representation
        if rep_type == "tfidf":
            # a tfidf model converts bag-of-words integer counts
            # into real-valued TF-IDF weights
            tfidf = models.TfidfModel(corpus)
            # convert the whole corpus to the TF-IDF representation
            corpus_convert = tfidf[corpus]
            numpy_matrix = gensim.matutils.corpus2csc(corpus_convert, num_terms=len(dictionary.token2id))
            print(numpy_matrix.T.shape)
        elif rep_type == 'lsi':
            num_topics = 100
            lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
            corpus_convert = lsi[corpus]
            print("++++ LSI representation of the documents ++++")
            numpy_matrix = gensim.matutils.corpus2csc(corpus_convert, num_terms=num_topics)
            print(numpy_matrix.T.shape)
        elif rep_type == 'rp':
            num_topics = 100
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            rp = models.RpModel(corpus_tfidf, num_topics=num_topics)
            corpus_convert = rp[corpus_tfidf]
            print("++++ RP representation of the documents ++++")
            numpy_matrix = gensim.matutils.corpus2csc(corpus_convert, num_terms=num_topics)
            print(numpy_matrix.T.shape)
        elif rep_type == 'lda':
            num_topics = 100
            lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
            corpus_convert = lda[corpus]
            print("++++ LDA representation of the documents ++++")
            print(len(corpus_convert))
            numpy_matrix = gensim.matutils.corpus2csc(corpus_convert, num_terms=num_topics)
            print(numpy_matrix.T.shape)
        elif rep_type == 'hdp':
            num_topics = 100
            hdp = models.HdpModel(corpus, id2word=dictionary)
            corpus_convert = hdp[corpus]
            print("++++ HDP representation of the documents ++++")
            # note: HdpModel's default topic count is 150, so num_terms=100 may truncate
            numpy_matrix = gensim.matutils.corpus2csc(corpus_convert, num_terms=num_topics)
            print(numpy_matrix.T.shape)
        else:
            raise Exception("Wrong feature type entered. Possible values: 'tfidf', 'lsi', 'rp', 'lda', 'hdp'")
        return numpy_matrix.T
Example 17
#model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
#lsi_vec = model[tfidf_vec]
'''
See the gensim.models.lsimodel documentation for details on how to make LSI gradually "forget" old observations in
infinite streams. If you want to get dirty, there are also parameters you can tweak that affect speed vs. memory
footprint vs. numerical precision of the LSI algorithm.

gensim uses a novel online incremental streamed distributed training algorithm (quite a mouthful!), which I published
in [5]. gensim also executes a stochastic multi-pass algorithm from Halko et al. [4] internally, to accelerate
in-core parts of the computation. See also Experiments on the English Wikipedia for further speed-ups by distributing
the computation across a cluster of computers.

Random Projections (RP) aim to reduce vector space dimensionality. This is a very efficient
(both memory- and CPU-friendly) approach to approximating TfIdf distances between documents, by throwing in a little
randomness. The recommended target dimensionality is again in the hundreds/thousands, depending on your dataset.
'''
model = models.RpModel(corpus, num_topics=500)
'''
Latent Dirichlet Allocation, LDA is yet another transformation from bag-of-words counts into a topic space of lower
dimensionality. LDA is a probabilistic extension of LSA (also called multinomial PCA), so LDA's topics can be
interpreted as probability distributions over words. These distributions are, just like with LSA, inferred
automatically from a training corpus. Documents are in turn interpreted as a (soft) mixture of these topics
(again, just like with LSA).
'''
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
'''
gensim uses a fast implementation of online LDA parameter estimation based on [2], modified to run in distributed mode
on a cluster of computers.
Hierarchical Dirichlet Process, HDP is a non-parametric Bayesian method (note the missing number of requested topics):
'''
model = models.HdpModel(corpus, id2word=dictionary)
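The LSI speed/memory/precision knobs mentioned above are plain constructor parameters; a sketch with gensim's defaults spelled out (the values are illustrative, not recommendations):

model = models.LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=300,
    chunksize=20000,    # documents per training chunk: larger is faster but uses more memory
    decay=1.0,          # values < 1.0 make the model gradually "forget" old observations
    onepass=True,       # False switches to the multi-pass stochastic algorithm
    power_iters=2,      # extra power iterations: slower, numerically more accurate
    extra_samples=100,  # oversampling used by the stochastic algorithm
)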
Example 18
    def GetTops(self):

        documents = []
        results = []
        tokens = []
        filtered_tokens = []
        bag_of_words = []
        frequency = []

        ## setup stopwords
        #  NLTK's default stopwords
        default_stopwords = set(nltk.corpus.stopwords.words('english'))
        #  We're adding some on our own
        custom_stopwords = set((u'``', u'and', u'but'))  #these are just some temp words
        unusual_stopwords = set(('"\\"\\"\\"', '"a', '\"\"\""'))
        stop1 = set({'wherein','base','said','therewith','one','two','first','second','third'})
        stop2 = set({'includes','like','inolves','identifies','forming','main','combined','portion','especially'})
        stop3 = set({'central'})
        unique_stopwords =  stop1 | stop2 | stop3

        all_stopwords = default_stopwords | custom_stopwords | unusual_stopwords | unique_stopwords

        num_response = len(self.jsonData)

        self.text = regex.sub(r'[^\w]', ' ', self.text)
        self.text = self.text.lower()

        for i in range(0, num_response):
            documents.append(regex.sub(r'[^\w]', ' ',self.jsonData[i]['abstract'].replace('"', '').lower()))

        ##
        ## this section is for Gensim functions
        ## a lot of the code is from Gensim tutorial
        # remove common words and tokenize

        texts = [[word for word in document.lower().split() if word not in all_stopwords]
                   for document in documents]

        #remove words that appear only once
        from collections import defaultdict

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(corpus)
        Rpmodel = models.RpModel(corpus, num_topics=500)

        # LDA models created
        Ldamodel = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=1, update_every=1, chunksize=100,
                                    passes=10, gamma_threshold=0.001)
        tab = Ldamodel.show_topics(num_words=5, formatted=False)

        model_length = len(tab)


        for i in range(0, model_length):
                x = resultmodels()
                for (key, val) in tab:
                    for (key, value) in val:
                        self.LDA_topic_model.append(i)
                        self.LDA_topic_words.append(key)
                        self.LDA_topic_scores.append(value)
                        x.words.append(key)
                        x.score.append(value)
                self.LDA_models.append(x)

        # HDP model created
        Hdpmodel = models.HdpModel(corpus, id2word=dictionary)
        tab = Hdpmodel.show_topics(num_topics=1, num_words=5, formatted=False)

        model_length = len(tab)

        for i in range(0, model_length):
                x = resultmodels()
                for (key, val) in tab:
                    for (key, value) in val:
                        self.HDP_topic_model.append(i)
                        self.HDP_topic_words.append(key)
                        self.HDP_topic_scores.append(value)
                        x.words.append(key)
                        x.score.append(value)
                self.HDP_models.append(x)
Example 19
def RP(corpus_tfidf):

    RP_model = models.RpModel(corpus_tfidf, num_topics=2)

    return RP_model
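Applying the returned model to the same TF-IDF corpus then yields the 2-dimensional projections, e.g.:

corpus_rp = RP(corpus_tfidf)[corpus_tfidf]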
Example 20
def build_doc_similarity_table(corpus,
                               method='tfidf',
                               n_neighboors=3,
                               return_similarity=True,
                               batch_size=5000,
                               doc_dtype=np.int64,
                               score_dtype=np.float16):
    """
    Batch processor wrapper for ``find_similar_docs`` to
    find `n_neighboors` similar docs to all the docs
    *Note 1*: increasing ``batch_size`` can increase memory usage, but can be
    faster
    """

    # This loop could be parallelized across multiple CPUs

    dictionary, corpusdic = build_corpusdic(corpus)

    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        mdl = models.NormModel(corpusdic, norm='l2')
    else:
        raise ValueError("Invalid model method: %r" % method)

    # Determine the similarities between the documents
    index = similarities.MatrixSimilarity(mdl[corpusdic])
    vecs = mdl[corpusdic]
    doc_matrix = index[vecs]

    doc_count = doc_matrix.shape[0]

    similarity_table = np.zeros((doc_count, n_neighboors)).astype(doc_dtype)

    similarity_score = None

    if return_similarity:
        similarity_score = np.zeros(
            (doc_count, n_neighboors)).astype(score_dtype)

    start_ = 0
    while (start_ < doc_count):

        end_ = start_ + batch_size

        if (end_ > doc_count):
            end_ = doc_count

        query_index = np.arange(start_, end_)
        query_matrix = doc_matrix[query_index]

        (similarity_table[query_index], similarities_) = \
            find_similar_docs(doc_matrix, query_matrix,
                               n_neighboors=n_neighboors,
                               method='nearest', return_similarity=return_similarity)

        if return_similarity:
            similarity_score[query_index] = similarities_

        start_ = start_ + batch_size

    return (similarity_table, similarity_score)
Example 21
def gen_data(train, test, no_below, no_above, num_topics=300):
    mydictionary = My_dictionary(train,
                                 test,
                                 no_below=no_below,
                                 no_above=no_above)
    train_corpus = mydictionary.train_corpus
    test_corpus = mydictionary.test_corpus
    dictionary = mydictionary.dictionary

    save_load(mode='save',
              files={
                  'train_corpus': train_corpus,
                  'test_corpus': test_corpus,
                  'dictionary': dictionary
              })
    #print(len(corpus),corpus[:10])

    #tfidf
    print('tfidf...')
    tfidf = models.TfidfModel(train_corpus)  # step 1: initialize a model
    train_corpus_tfidf = tfidf[train_corpus]
    test_corpus_tfidf = tfidf[test_corpus]  # apply the transformation to the whole corpus
    train_tfidf_array = corpustoarray(train_corpus_tfidf)
    test_tfidf_array = corpustoarray(test_corpus_tfidf)

    pd.DataFrame(train_tfidf_array).to_csv('file\\train_tfidf.csv')
    pd.DataFrame(test_tfidf_array).to_csv('file\\test_tfidf.csv')

    # lsi: a num_topics of 200-500 is the "gold standard"
    print('lsi...')
    lsi = models.LsiModel(train_corpus_tfidf,
                          id2word=dictionary,
                          num_topics=num_topics)  # initialize an LSI model
    # double wrapping over the original corpus: bow -> tfidf -> fold-in-lsi
    train_corpus_lsi = lsi[train_corpus_tfidf]
    test_corpus_lsi = lsi[test_corpus_tfidf]  # transform the tfidf corpus, not the raw bow
    train_lsi_array = corpustoarray(train_corpus_lsi)
    test_lsi_array = corpustoarray(test_corpus_lsi)
    pd.DataFrame(train_lsi_array).to_csv('file\\train_lsi.csv')
    pd.DataFrame(test_lsi_array).to_csv('file\\test_lsi.csv')

    #lsi=models.LsiModel.load('model\\model.lsi')

    #RP
    print('rp...')
    rp = models.RpModel(train_corpus_tfidf,
                        id2word=dictionary,
                        num_topics=num_topics)
    train_corpus_rp = rp[train_corpus_tfidf]
    test_corpus_rp = rp[test_corpus_tfidf]  # transform the tfidf corpus, not the raw bow
    train_rp_array = corpustoarray(train_corpus_rp)
    test_rp_array = corpustoarray(test_corpus_rp)
    pd.DataFrame(train_rp_array).to_csv('file\\train_rp.csv')
    pd.DataFrame(test_rp_array).to_csv('file\\test_rp.csv')
    '''
    # LDA (2003): first proposed by Blei, Ng, and Jordan in 2003 (this block has a bug)
    lda = models.LdaSeqModel(corpus, id2word=dictionary, num_topics=300)
    corpus_lda = lda[corpus]
    '''
    # HDP (2011), Wang, Paisley, Blei: http://proceedings.mlr.press/v15/wang11a/wang11a.pdf
    print('hdp...')
    hdp = models.HdpModel(train_corpus, id2word=dictionary)
    train_corpus_hdp = hdp[train_corpus]
    test_corpus_hdp = hdp[test_corpus]
    train_hdp_array = corpustoarray(train_corpus_hdp)
    test_hdp_array = corpustoarray(test_corpus_hdp)
    pd.DataFrame(train_hdp_array).to_csv('file\\train_hdp.csv')
    pd.DataFrame(test_hdp_array).to_csv('file\\test_hdp.csv')

    #Log Entropy Model   2015
    print('log...')
    log = models.LogEntropyModel(train_corpus)
    train_corpus_log = log[train_corpus]
    test_corpus_log = log[test_corpus]
    train_log_array = corpustoarray(train_corpus_log)
    test_log_array = corpustoarray(test_corpus_log)
    pd.DataFrame(train_log_array).to_csv('file\\train_log.csv')
    pd.DataFrame(test_log_array).to_csv('file\\test_log.csv')
Example 22
 def randomProj(self, modelPath, indexPath):
     rp = models.RpModel(self.corpus)
     rp.save(modelPath)
     index = similarities.MatrixSimilarity(rp[self.corpus])  # transform the corpus to RP space and index it
     index.save(indexPath)
Example 23
class MyCorpus(object):
    def __iter__(self):
        for line in open('../Save/scopus_list_txt.txt'):
            #print(line)
            yield dictionary.doc2bow(line.lower().split())

corpus = [dictionary.doc2bow(text) for text in scopus_list_txt]

corpora.MmCorpus.serialize('../Save/scopus_corpus.mm', corpus)

Scopus_corpus = corpora.MmCorpus('../Save/scopus_corpus.mm')
tfidf = models.TfidfModel(Scopus_corpus)
corpus_tfidf = tfidf[Scopus_corpus]

RP = models.RpModel(corpus=Scopus_corpus, num_topics=100, id2word=dictionary)
print(RP[dictionary.doc2bow(['valu'])])  # queries must be bag-of-words vectors, not raw strings

RP.save('../Save/modelRP.rp_model')


#hdp = gensim.models.hdpmodel.HdpModel(Scopus_corpus, id2word = dictionary)
#hdp.optimal_ordering()
#hdp.save('../Save/modelHDP.hdp')
#x = hdp.show_topics(1000, 10)
#print(x)


Example 24
def build_rp_model(bag_of_ids_vals_generator, dictionary, number_of_topics):
    """Reads from generator of lists of pairs (word_id: value) and builds
    RP Projections model for given number_of_topics."""
    return models.RpModel(bag_of_ids_vals_generator,
                          id2word=dictionary,
                          num_topics=number_of_topics)
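A usage sketch; since an id2word mapping is supplied, RpModel only needs the vocabulary size, so a one-shot generator of BOW documents is acceptable here:

from gensim import corpora

texts = [["graph", "trees"], ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(texts)
bows = (dictionary.doc2bow(text) for text in texts)  # a generator works
rp = build_rp_model(bows, dictionary, number_of_topics=2)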
print "loading data..."
print time.strftime("%H:%M:%S", time.localtime())

c = SvmLightCorpus( input_file )

print "starting tf-idf..."
print

tfidf = models.TfidfModel( c )
c_tfidf = tfidf[c]

print "running RP..."
print time.strftime("%H:%M:%S", time.localtime())
print

rp = models.RpModel( c_tfidf, num_topics = num_topics )

print "converting corpus to RP..."
print time.strftime("%H:%M:%S", time.localtime())

c_rp = rp[c_tfidf] 

print "saving..."
print

SvmLightCorpus.serialize( output_file, c_rp  )

print "done."
print time.strftime("%H:%M:%S", time.localtime())
Example 26
 def RP(self, **config):
     num_topics = config['num_topics']
     self.model = models.RpModel(self._dictionary.corpus,
                                 num_topics=num_topics)
Example 27
def build_rp_model(corpus, dictionary, tfidf_save_path, rp_save_path):
    tfidfmodel = load_tfidf_model(tfidf_save_path)
    corpus_tfidf = tfidfmodel[corpus]
    rp = models.RpModel(corpus_tfidf, num_topics=500)
    rp.save(rp_save_path)
Example 28
 def RPModel(self, tf_model, num_topics):
     rp = models.RpModel(tf_model, num_topics=num_topics)
     return rp[tf_model]