def siphon_best_match_from_set(sentence, set):
    corpora_documents = []
    for item_text in set:
        item_seg = list(jieba_nlp.generate_jieba_cut(item_text))
        corpora_documents.append(item_seg)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf,
        num_features=600)
    test_cut_raw_1 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 2
    test_corpus_tfidf_1 = tfidf_model[test_corpus_1]
    tfidf_simi = similarity[test_corpus_tfidf_1]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-LSI-index',
        corpus_lsi,
        num_features=400,
        num_best=2)
    test_cut_raw_3 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_3 = dictionary.doc2bow(test_cut_raw_3)
    test_corpus_tfidf_3 = tfidf_model[test_corpus_3]
    test_corpus_lsi_3 = lsi[test_corpus_tfidf_3]
    # lsi.add_documents(test_corpus_lsi_3)  # update the LSI model with the new document
    lsi_simi = similarity_lsi[test_corpus_lsi_3]
    return {'tfidf': tfidf_simi, 'lsi_simi': lsi_simi}
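A minimal usage sketch for the function above (hedged: it assumes jieba_nlp, BASE_DIR and the gensim imports are configured exactly as in this module; the query and candidate sentences are made up):

candidates = ['北京暴雨红色预警', '上海地铁恢复运营', '全国铁路今日调图']
matches = siphon_best_match_from_set('北京发布暴雨预警', candidates)
print(matches['tfidf'])     # e.g. [(0, 0.71), (2, 0.05)] -- (doc_index, cosine) pairs from the TF-IDF index
print(matches['lsi_simi'])  # top-2 matches from the LSI index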
 def CalSim(self, test_document, Type, best_num):
     '''Calculate similarities between the test document and all news (articles/documents).
     # Arguments:
         test_document: a raw document (string) to compare against the indexed corpus.
         Type: which similarity model to use ('Similarity-tfidf-index' or 'Similarity-LSI-index').
         best_num: the 'num_best' parameter passed to gensim's Similarity.
     '''
     if Type == 'Similarity-tfidf-index':
         tfidf = models.TfidfModel(self._BowVecOfEachDoc)
         tfidfVec = tfidf[self._BowVecOfEachDoc]
         self._num_features = len(self._dictionary.token2id.keys())
         self._similarity = similarities.Similarity(Type, tfidfVec, \
             num_features=self._num_features,num_best=best_num)
         test_cut_raw = list(jieba.cut(test_document))
         test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
         self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
     elif Type == 'Similarity-LSI-index':
         lsi_model = models.LsiModel(self._BowVecOfEachDoc)
         corpus_lsi = lsi_model[self._BowVecOfEachDoc]
         self._num_features = len(self._dictionary.token2id.keys())
         self._similarity = similarities.Similarity(Type, corpus_lsi, \
             num_features=self._num_features,num_best=best_num)
         test_cut_raw = list(jieba.cut(test_document))
         test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
         self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
     self.Print_CalSim()
     IdLst = []
     SimRltLst = []
     SimTxLst = []
     for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
         IdLst.append(Id)
         SimRltLst.append(Sim)
         SimTxLst.append(self._raw_documents[Id])
     return IdLst, SimTxLst, SimRltLst
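A hedged sketch of calling this method, assuming the owning class has already filled self._dictionary, self._BowVecOfEachDoc and self._raw_documents (the instance name `model` and the query text are illustrative):

ids, texts, scores = model.CalSim('昨日全市普降大雨', Type='Similarity-tfidf-index', best_num=3)
for doc_id, text, score in zip(ids, texts, scores):
    print(doc_id, round(float(score), 4), text[:30])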
Example #3
def main():

    print("Building dictionary...")
    dictionary = corpora.Dictionary(doc[1] for doc in
            TrecReader(config.filenames))
    print("Generating LSI model...")
    corpus = TrecCorpus(config.filenames, dictionary)
    lsi = models.LsiModel(corpus, id2word=dictionary,
            num_topics=config.num_topics)
    print("Building index...")
    index_tempfile = get_tmpfile(config.tmpfile)
    index = similarities.Similarity(index_tempfile, lsi[corpus],
            num_features=lsi.num_topics)
    topics = list(get_topics(config.topicfile, config.topic_type))
    print("Evaluating LSI model...")
    evaluate(corpus, topics, dictionary, index, lsi, config.lsi_file,
            config.lsi_run, config.processor, config.settings_file)
    print("Generating LDA model...")
    lda = models.LdaModel(corpus, id2word=dictionary,
            num_topics=config.num_topics)
    print("Building index...")
    index_tempfile = get_tmpfile(config.tmpfile)
    index = similarities.Similarity(index_tempfile, lda[corpus],
            num_features=lda.num_topics)
    print("Evaluating LDA model...")
    evaluate(corpus, topics, dictionary, index, lda, config.lda_file,
            config.lda_run, config.processor, config.settings_file)

    if parmenides_processor:
        cleanup()
def create_index(corpus_path,
                 output_path,
                 model_path,
                 lda=False,
                 lsi=False,
                 tfidf=False,
                 hdp=False):
    """Creates an index specified by the parameters & saves to output directory

    Parameters:
        corpus_path: the path to the corpus directory (os.path)
        output_path: the directory path where index(s) will be saved (os.path)
                     Note indexes each need their own folder
        model_path: the directory path with the models to be used (os.path)
                    The model path should have a corpus.dict and corpus.mm too
                    Use create_models.py
        name: the name of the index (str)
        lda: if True will create an index based on the lda model (boolean)
        lsi: if True will create an index based on the lsi model (boolean)
        tfidf: if True will create an index based on the tfidf model (boolean)
        hdp: if True will create an index based on hdp model (boolean)
    """
    dictionary = corpora.Dictionary.load(os.path.join(model_path,
                                                      "corpus.dict"))
    mc = corpora.MmCorpus(os.path.join(model_path, "corpus.mm"))
    # depending on the model the number of features changes
    tfidf_model = models.TfidfModel.load(os.path.join(model_path,
                                                      "model.tfidf"))
    if tfidf:
        op = os.path.join(output_path, "tfidf")
        index = similarities.Similarity(op,
                                        tfidf_model[mc],
                                        num_features=len(dictionary))
        index.save(os.path.join(output_path, "index.tfidf"))
    if lda:
        model = models.LdaModel.load(os.path.join(model_path, "model.lda"))
        op = os.path.join(output_path, "lda")
        index = similarities.Similarity(op,
                                        model[mc],
                                        num_features=model.num_topics)
        index.save(os.path.join(output_path, "index.lda"))
    if lsi:
        model = models.LsiModel.load(os.path.join(model_path, "model.lsi"))
        op = os.path.join(output_path, "lsi")
        index = similarities.Similarity(op,
                                        model[tfidf_model[mc]],
                                        num_features=model.num_topics)
        index.save(os.path.join(output_path, "index.lsi"))
    if hdp:
        model = models.HdpModel.load(os.path.join(model_path, "model.hdp"))
        op = os.path.join(output_path, "hdp")
        index = similarities.Similarity(op,
                                        model[mc],
                                        num_features=model.m_T)
        index.save(os.path.join(output_path, "index.hdp"))
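A possible invocation (hedged: the paths are placeholders, and the model directory is assumed to have been produced by create_models.py as the docstring describes):

create_index(corpus_path=os.path.join('data', 'corpus'),
             output_path=os.path.join('data', 'indexes'),
             model_path=os.path.join('data', 'models'),
             tfidf=True,
             lsi=True)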
def gensim_tfidf_simi(sentence, features=400, best=2):
    """
    文本相似度匹配,存在未数据未对齐异常
    :param sentence:
    :param features:
    :param best:
    :return:
    """
    rows = decode_rows_pickle()
    corpora_documents = []
    for row in rows:
        item = list(jieba_nlp.generate_jieba_cut(row))
        corpora_documents.append(item)
    dictionary = corpora.Dictionary(corpora_documents)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf,
        num_features=features,
        num_best=best)
    cut_raw = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus = dictionary.doc2bow(cut_raw)
    test_corpus_tfidf = tfidf_model[test_corpus]
    return similarity[test_corpus_tfidf]
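A small usage sketch, assuming decode_rows_pickle(), jieba_nlp and BASE_DIR behave as the function above expects; the query string is illustrative:

top_matches = gensim_tfidf_simi('北京雾霾红色预警', features=400, best=2)
for doc_idx, score in top_matches:
    print(doc_idx, score)  # index of the stored row and its cosine similarity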
Example #6
def create_similarity_index(_type,_path):
    
    print " creating similarity index %s"%_type
    print " loading corpus from %s"%_path
    corpus=corpora.MmCorpus(_path+"/"+_type+".mm")
    print "",corpus

    # get the number of features from the dictionary length
    # if (_type == "txt"):
    #     dic_path=_path+"/txt.dict"
    # else:
    dic_path=_path+"/"+_type+".dict"

    _num_features=len(corpora.Dictionary.load(dic_path))
    print " getting similarity computed for %d"%_num_features
    file_path=_path+"/"+_type+".index"

    # build the index
    index = similarities.Similarity(file_path, corpus, num_features=_num_features) 
    index.save(file_path)

    print " similarities saved as %s"%file_path
    print 

    return index
Example #7
def CorpusCluster(indexpath, corpuspath, dirpath, threshold, corpusday):

    global corekeys
    global indexlist
    indexlist = ReadColumn(indexpath, 0)  # Read Index of Seeds
    print "indexlist == "
    print indexlist

    # Read Corpus when needed !
    class ReadCorpus(object):
        def __iter__(self):
            for line in open(corpuspath):
                yield line.split()

    corp = ReadCorpus()  # Read corpus as Corp
    dictionary = corpora.Dictionary(corp)  # Create dictionary
    corpus = [dictionary.doc2bow(text) for text in corp]  # get bag-of-words vectors
    tfidf = models.TfidfModel(corpus)  # Create TF-IDF Model
    corpus_tfidf = tfidf[corpus]
    # Calculate Similarities
    index = similarities.Similarity(dirpath + '/index',
                                    corpus,
                                    num_features=len(dictionary))
    i = 0
    dictcluster = {}
    for text in corpus_tfidf:
        dict_tmp = {}
        sims_tmp = list(index[text])
        #print sims_tmp
        # Use Threshold carefully !
        dict_tmp[int(indexlist[i])] = [
            int(indexlist[k]) for k, x in enumerate(sims_tmp)
            if x >= (max(sims_tmp) - threshold)
        ]
        #print dict_tmp
        if i == 0:
            dictcluster = dict_tmp  # initialize the cluster dictionary
        else:
            dictcluster = DictUnin(dictcluster,
                                   dict_tmp)  # update the cluster dictionary
        i += 1
#print dictcluster
    print "Dict_Cluster"
    print dictcluster
    corekeys = dictcluster.keys()
    initialindex = Txt2Dict(indexpath)
    print "initialindex"
    print initialindex
    updateindex = UpdateCluter(dictcluster, initialindex)
    print "updateindex"
    print updateindex
    corekeys = updateindex.keys()

    #print "updateindex +++++ "
    #print updateindex

    Dict2Txt2(updateindex, indexpath)  # Output seedindex into txt

    WriteSeedCorpus(corpuspath, dirpath + '/seedcorpus_' + rundate +
                    '.txt')  # Output seedcorpus into txt
Example #8
def disambiguate_by_text_sim(validate_data, corr=0.3):
    res_dict = {}
    print('number of distinct author names:', len(validate_data))
    for i, author in enumerate(validate_data.keys()):
        author_papers = validate_data[author]
        if len(author_papers) == 0:
            res_dict[author] = []
        else:
            # print(i, author, len(author_papers))
            paper_words = get_papar_words(author_papers)
            dictionary = corpora.Dictionary(paper_words)
            bow_corpus = [dictionary.doc2bow(wl)
                          for wl in paper_words]  # vectorize the corpus
            tfidf = models.TfidfModel(bow_corpus)  # build a tfidf model on the BoW corpus

            index = similarities.Similarity('E:\\gensim_test',
                                            tfidf[bow_corpus], len(dictionary))
            sims = index[tfidf[bow_corpus]]  # compute the similarity matrix
            i_cluster = graph_sim_matrix(sims, corr)
            author_cluster = [[
                author_papers[index]['id'] for index in l_inside
            ] for l_inside in i_cluster]
            #            res_realx={}
            #            res_realx[author]=res_real[author]
            #            print(author,'pairwise-f1',pairwise_f1(res_realx,{author:author_cluster}))
            print(i, author, 'papers:', len(author_papers),
                  'authors after disambiguation:', len(author_cluster))
            res_dict[author] = author_cluster
    return res_dict
 def init_similarity(self):
     #l = (self.lenth()//3) if (self.lenth()//3)<2 else 2
     #print("\n> l=%s"%l)
     self.similarity = similarities.Similarity('Similarity-tfidf-index',
                                               self.corpus_tfidf,
                                               num_features=900)
     self.similarity.num_best = 5
Example #10
    def similarity(self):
        corpora_documents = []
        stopwords = {}.fromkeys(
            [line.rstrip() for line in open('chineseStopWords.txt')])
        # text preprocessing
        for item_text in self.documents:
            item_seg = list(jieba.cut(item_text))  # word segmentation
            words = []
            for seg in item_seg:
                if seg not in stopwords:
                    words.append(seg)  # drop stopwords
            corpora_documents.append(words)
        # build the dictionary and vectorized corpus
        dictionary = corpora.Dictionary(corpora_documents)
        # the next line produces a sparse (BoW) vector for every document in the corpus
        corpus = [dictionary.doc2bow(text) for text in corpora_documents]
        # corpus is an iterator of BoW vectors; the line below computes the IDF value of every feature that occurs in it
        tiidf_model = models.TfidfModel(corpus)
        corpus_tfidf = tiidf_model[corpus]

        self.sim = similarities.Similarity('Similarity-tfidf-index',
                                           corpus_tfidf,
                                           num_features=600)
        self.sim.num_best = self.top_num  # e.g. if 3, return the 3 most similar results
        sentence_cut_temp = list(jieba.cut(self.sentence))
        sentence_cut = []
        for word in sentence_cut_temp:
            if word not in stopwords:
                sentence_cut.append(word)
        sentence_cut_corpus = dictionary.doc2bow(sentence_cut)
        self.sentence_sim = tiidf_model[sentence_cut_corpus]
        self.resultShow()
Example #11
    def fit(self,corpus):
        """
        Fit a document similarity model

        Parameters
        ----------

        corpus : object
           a corpus object that follows DefaultJsonCorpus

        Returns
        -------
        
        trained DocumentSimilarity object
        """
        if self.model_type == 'sklearn_nmf':
            model = self.create_sklearn_model(corpus)
        else:
            model = self.create_gensim_model(corpus)

        self.index = similarities.Similarity(self.work_folder+"/gensim_index",model,self.vec_size)
        self.index_annoy = annoy.AnnoyIndex(self.vec_size, metric='angular')
        for i, vec in enumerate(model):
            self.index_annoy.add_item(i, list(gensim.matutils.sparse2full(vec, self.vec_size).astype(float)))
        self.index_annoy.build(self.annoy_trees)
        self.seq2meta = {}
        self.id2meta = {}
        for j in corpus.get_meta():
            self.seq2meta[j['corpus_seq_id']] = j
            self.id2meta[j['id']] = j
        return self
def generateSimilarityIndex(corpus, num_topics=100):
    ##############################################################
    # Create TFIDF and LSI Models on the corpus
    ##############################################################
    tfidfModel = models.TfidfModel(corpus)
    corpus_tfidf = tfidfModel[corpus]

    # Reduce to num_topics dimensions
    lsiModel = models.LsiModel(
        corpus_tfidf, id2word=dictionary,
        num_topics=num_topics)  # initialize an LSI transformation
    # lsi_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # alternative: an LDA transformation
    # lsi_model = models.HdpModel(corpus_tfidf, id2word=dictionary)  # alternative: an HDP transformation
    corpus_lsi = lsiModel[
        corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

    # Print the topics generated by the lsi model
    lsiModel.print_topics()

    # Build an index; querying it returns the similarity of the query
    # vector against all of these documents in LSI space

    # In memory computation - better for small datasets that fit in memory
    # index = similarities.MatrixSimilarity(corpus_lsi)  # transform corpus to LSI space and index it

    # Out-of-core computation - better for big datasets that don't fit in memory
    index_temp = get_tmpfile(
        "lsimodel")  # create a temporary file named lsimodel to save things
    # Note that you need to give num_topics again here as num_features
    index = similarities.Similarity(
        index_temp, corpus_lsi,
        num_features=num_topics)  # transform corpus to LSI space and index it

    return [index, tfidfModel, lsiModel]
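A tentative query sketch; note that generateSimilarityIndex reads a module-level dictionary, so this assumes it was built from the same documents as corpus:

index, tfidfModel, lsiModel = generateSimilarityIndex(corpus, num_topics=100)
query_bow = dictionary.doc2bow('new document text'.lower().split())
query_lsi = lsiModel[tfidfModel[query_bow]]  # bow -> tfidf -> lsi, the same pipeline used for the index
print(sorted(enumerate(index[query_lsi]), key=lambda x: -x[1])[:5])  # top-5 (doc_id, similarity)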
Example #13
    def fit(self, db):
        '''
        INPUT: connection to database with recipes, restaurants data
        OUTPUT: fit model, index

        Creates a dictionary and model for recommender system. Given database
        connection, find all recipe ingredient lists, vectorize, build corpus
        and dictionary, fit model and create index.
        '''
        documents = self._prepare_documents(db)
        texts = self._clean_text(documents)

        if self.model.__init__.im_class == models.tfidfmodel.TfidfModel:
            # Vectorize and store recipe text
            self.dictionary = corpora.Dictionary(texts)
            # convert to BOW
            self.corpus = [self.dictionary.doc2bow(text) for text in texts]

            for i in self.dictionary.iterkeys():
                self.dictionary_len += 1

            self.model = self.model(self.corpus)
            # prepare for similarity queries
            self.index = similarities.SparseMatrixSimilarity(
                self.model[self.corpus], num_features=self.dictionary_len)

        else:  # word2vec
            self.model = models.Word2Vec.load('/mnt/word2vec/words', mmap='r')
            doc_vectors = self._create_doc_vectors(texts)
            self.index = similarities.Similarity('/mnt/word2vec/index',
                                                 doc_vectors,
                                                 num_features=300)
Example #14
    def __calcSim(self, pic=0.02, process=0):
        # time0 = time.time()
        self.tfidf_model = models.TfidfModel(self.new_corpus)
        self.corpus_tfidf = self.tfidf_model[self.new_corpus]
        sim_t = similarities.Similarity('Similarity-tfidf-index', self.corpus_tfidf,
                                        num_features=1000000, num_best=200)
        i = 0
        maxAve = 0
        maxIndex = 0
        # print(["process:", process, len(self.source_seg), len(sim_t), self.title, self.trade, self.classes])
        for n in sim_t:
            try:
                # ns = [(str(kk[0]), kk[1]) for kk in n if kk[1] > pic]
                cl = [kk[1] for kk in n if kk[1] > pic]
                ave = 0
                if len(cl) > 0:
                    ave = stat.mean(cl)
                # print("sim_t -------: item %s, average %s, elapsed %s" % (str(i), ave, str(time.time() - time0)))
                # minimum keyword-count requirement
                if self.source_seg[i] and "dseg" in self.source_seg[i].keys():
                    key_num = len(self.source_seg[i]["dseg"])
                else:
                    key_num = 0
                # track the maximum average similarity and its index
                if key_num > 25 and ave > maxAve:
                    maxAve = ave
                    maxIndex = i
                i += 1
            except:
                print(["segment error:", i, self.source_seg[i]["_id"]])
                continue
        self.max_ave = maxAve
        self.max_index = maxIndex
        return None
def tcutword(data,stopword):
    corpora_documents = []
    for i in data.index:
        text = data.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
        text = re.sub('\([a-zA-Z]+://[^\s]*\)', '', text)  # strip URLs
        text = re.sub('\d+\.*\d*','',text)
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", '',text)        
        #cuting = jieba.cut(text)
        #cuting = ' '.join(cuting)
        temp = list(jieba.cut(text,HMM=True))
        
        #temp=thu1.cut(text,text=True).split()
        word_list = temp
        '''
        word_list = []  
        for word in temp:
            if word not in stopword:  
                word_list.append(word)  
        #text = ' '.join(temp)
        '''
        corpora_documents.append(word_list)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(ttext) for ttext in corpora_documents]
    similarity = similarities.Similarity('-Similarity-index', corpus, num_features=99999999)
    return dictionary,similarity
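A sketch of querying the dictionary and index returned above, assuming `data` and `stopword` are the same objects passed to tcutword; the query text is made up:

dictionary, similarity = tcutword(data, stopword)
query_bow = dictionary.doc2bow(list(jieba.cut('今天天气不错', HMM=True)))
similarity.num_best = 5
print(similarity[query_bow])  # top-5 (row_position, score) pairs over the raw BoW index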
Example #16
def sim_eval(text, label_text):
    # label_text: corpus labels used when the similarity model was built
    print('Loading pre-trained similarity models...')
    dictionary = corpora.Dictionary.load(
        '../resource/model/similarity/dict.txt')
    tfidf_model = joblib.load('../resource/model/similarity/tfidf_model.mm')
    lsi = models.LsiModel.load('../resource/model/similarity/lsi.mm')
    corpus_tfidf = joblib.load('../resource/model/similarity/corpus_tfidf.mm')

    test_raw = []
    test_raw.append(list(jieba.cut(text)))
    # print(test_raw)
    test_corpus = [dictionary.doc2bow(item) for item in test_raw]
    test_corpus_tfidf = tfidf_model[test_corpus]
    test_corpus_lsi = lsi[test_corpus_tfidf]  # compute the LSI vector
    # print(test_corpus_lsi)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('Similarity-LSI-index',
                                             corpus_lsi,
                                             num_features=200,
                                             num_best=2)
    query = similarity_lsi[test_corpus_lsi][0]

    if len(query) < 1:
        query = [(0, 0), (0, 0)]
    if label_text[query[0][0]] == label_text[query[1][0]]:
        sim_estimate = label_text[query[0][0]]
    else:
        sim_estimate = 'null'
    similarity_score = str(query[0][1])[0:5]

    print('Similarity evaluation finished...')

    return sim_estimate, similarity_score
Example #17
    def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
        self.corpus = []

        for bug in bugzilla.get_bugs():

            textual_features = self.text_preprocess(self.get_text(bug))
            self.corpus.append([bug["id"], textual_features])

        # Assigning unique integer ids to all words
        self.dictionary = Dictionary(text for bug_id, text in self.corpus)

        # Conversion to BoW
        corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

        # Initialize and apply the tfidf transformation model on the same corpus; the resulting corpus has the same dimensions
        tfidf = models.TfidfModel(corpus_final)
        corpus_tfidf = tfidf[corpus_final]

        # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
        self.lsi = models.LsiModel(
            corpus_tfidf, id2word=self.dictionary, num_topics=300
        )
        corpus_lsi = self.lsi[corpus_tfidf]

        # Indexing the corpus
        self.index = similarities.Similarity(
            output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
        )
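The constructor above only builds the LSI index; a query helper along these lines could sit beside it. This is a hedged sketch: the method name is invented here, and it assumes the constructor also kept the TfidfModel around (e.g. self.tfidf = tfidf), which the code above does not do:

    def get_similar_bugs(self, bug, top_n=10):
        # Fold the query bug through the same bow -> tfidf -> lsi pipeline used for the index.
        query_bow = self.dictionary.doc2bow(self.text_preprocess(self.get_text(bug)))
        query_lsi = self.lsi[self.tfidf[query_bow]]
        self.index.num_best = top_n
        # self.corpus stores [bug_id, text] pairs, so map index positions back to bug ids.
        return [(self.corpus[pos][0], score) for pos, score in self.index[query_lsi]]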
Example #18
def get_sim(col_id):
    compare_all = []
    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")
    for i in range(1, 323):
        compare = query_db(
            """Select * from countries where id = {}""".format(i))
        compare_all.append(compare[col_id])
    gen_doc_new = [[
        stemmer.stem(word) for word in document.lower().split()
        if (word not in stop_words)
    ] for document in compare_all]
    dict = gensim.corpora.Dictionary(gen_doc_new)
    corpus = [dict.doc2bow(i) for i in gen_doc_new]
    tf_idf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tf_idf[corpus]
    ## calculation of the best topic size
    #numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=65764)
    #print(numpy_matrix)
    #print("XXXXXXXXXXXXXXXXXXXXX")
    #s = np.linalg.svd(numpy_matrix, full_matrices=False, compute_uv=False)
    #print(s)
    #print("XXXXXXXXXXXXXXXXXXXXX")
    #plt.figure(figsize=(10, 5))
    #plt.hist(s[0], bins=100)
    #plt.xlabel('Singular values', fontsize=12)
    #plt.show()
    lsa = models.LsiModel(corpus_tfidf, id2word=dict, num_topics=95)
    index = similarities.Similarity(path,
                                    lsa[corpus_tfidf],
                                    num_features=len(dict))
    return index, tf_idf, dict
Example #19
def graph_model(author_paper_list, text_sim=0.15, co_num=4):
    # paper ids for this author name
    paper_id = [paper['id'] for paper in author_paper_list]
    # build the graph
    graph = nx.Graph()
    graph.add_nodes_from(paper_id)
    # link papers that share enough co-authors
    for index1, p1 in enumerate(author_paper_list):
        if index1 == len(author_paper_list) - 1: break
        for index2, p2 in enumerate(author_paper_list[index1 + 1:]):
            num_co_au = num_coauthor_paper(p1, p2)
            if num_co_au >= co_num:
                graph.add_edge(p1['id'], p2['id'])
    # link papers with similar text/topics
    if len(author_paper_list) == 0:
        pass
    else:
        paper_words = get_papar_words(author_paper_list)
        dictionary = corpora.Dictionary(paper_words)
        bow_corpus = [dictionary.doc2bow(wl) for wl in paper_words]  # vectorize the corpus
        tfidf = models.TfidfModel(bow_corpus)  # build a tfidf model on the BoW corpus
        index = similarities.Similarity('E:\\gensim_test', tfidf[bow_corpus],
                                        len(dictionary))
        sim_matrix = index[tfidf[bow_corpus]]  # compute the similarity matrix
        # connect papers whose pairwise similarity exceeds the threshold (same cluster)
        for i in range(0, sim_matrix.shape[0]):
            if i == sim_matrix.shape[0] - 1: break
            for j in range(0, sim_matrix.shape[1]):
                if j <= i: continue
                if sim_matrix[i][j] > text_sim:
                    graph.add_edge(paper_id[i], paper_id[j])
    # compute connected components
    conn_comp = list(nx.connected_components(graph))
    conn_comp = [list(c) for c in conn_comp]
    return conn_comp
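A small usage sketch, assuming author_paper_list is a list of paper dicts with 'id' keys as get_papar_words and num_coauthor_paper expect above:

clusters = graph_model(author_paper_list, text_sim=0.15, co_num=4)
print(len(clusters), 'clusters:', [len(c) for c in clusters])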
Example #20
def main():
    try:
        corpus = corpora.MmCorpus(file_tfidf + '.corpus')
        dictionary = corpora.Dictionary.load(file_tfidf + '.dict')
        modelTfidf = models.TfidfModel.load(file_tfidf + '.modelTfidf')
    except Exception:
        print 'TFIDF not found. please run buildindex.py first'

    try:
        index = cPickle.load(open(file_tfidf + '.index', 'rb'))
    except Exception:
        print 'using new index'
        modelTfidfCorpus = modelTfidf[corpus]
        index = similarities.Similarity(
            file_tfidf + '.modelTfidfindex',
            modelTfidfCorpus,
            num_features=modelTfidfCorpus.corpus.num_terms)
        index.num_best = None
        cPickle.dump(index, open(file_tfidf + '.index', 'wb'))

    # evaluate for each query
    scores = []
    for i in range(len(labels_choices)):
        print 'for label ', i
        query_vector = modelTfidf[dictionary.doc2bow([
            '<phrase>%s</phrase>' % w.lower() for w in seed_concepts_list[i]
        ])]
        scores.append(index[query_vector])

    scores = np.array(scores).T
    with open(categorization_file, 'w') as f:
        for i in range(scores.shape[0]):
            f.write('%s\n' % (' '.join(['%s' % d for d in scores[i]])))
Example #21
def calcCorpusTFIDFSimilarity(new_ids,
                              all_ids,
                              corpus,
                              new_corpus,
                              num_feature=400):
    # build the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    crp_tfidf = tfidf[corpus]
    new_tfidf = tfidf[new_corpus]

    logger.info('corpus tfidf length: %i' % len(crp_tfidf))

    # create index
    index = similarities.Similarity(fp_index,
                                    crp_tfidf,
                                    num_features=num_feature,
                                    num_best=NUM_BEST_SIM_DOC)

    # similarity
    docs_sims = index[new_tfidf]
    index.save(fp_index)

    for idx, doc_sim in enumerate(docs_sims):
        cur_doc_id = new_ids[idx]
        logger.info(
            u'calc text similarity | cur_doc_id: %i | sorted_sims(0:5): %s' %
            (cur_doc_id, doc_sim[0:3]))
Example #22
def bow():
    #        print("row", row[i])
    texts = [[word for word in jieba.cut(document, cut_all=True)]
             for document in data]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    print(corpus)
    # TF-IDF features
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # build a similarity index on top of the TF-IDF corpus for similarity search
    similarity = similarities.Similarity('Similarity-tfidf-index',
                                         corpus_tfidf,
                                         num_features=10000)

    #    new_sensence = "昨日金融期則是下跌17.4點,指數為1,051點,跌幅1.63%。股價皆重挫3~4%,回到年線的位置。工商,2015/01/07 00:00:00,外資變臉 電子期重摔 昨日下跌8.7點,跌幅達2.35%,為各期指中最弱,1"
    #   test_corpus_1 = dictionary.doc2bow(jieba.cut(new_sensence, cut_all=True))
    #    vec_tfidf = tfidf[test_corpus_1]
    #    print(vec_tfidf)
    #    print(similarity[test_corpus_1])  # returns the most similar documents as (index_of_document, similarity) tuples
    for item in corpus_tfidf:
        print(item)
    tfidf.save("data.tfidf")
    tfidf.save("data_tfidf.txt")
    tfidf.save("data_tfidf.csv")
    tfidf = models.TfidfModel.load("data.tfidf")
    print(tfidf)
Example #23
    def calc_similarity(self, prefix: str, text: str):
        """计算相似度
        返回索引和余弦值

        Arguments:
            prefix {str} -- 模型前缀
            text {str} -- 文本数据
            value {float} -- 设定的阈值,返回大于这个值的数据
        """
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
        corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
        tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the tfidf model
        corpus_tfidf = tfidf_model[corpus]

        lsi = models.LsiModel(corpus_tfidf)
        corpus_lsi = lsi[corpus_tfidf]
        similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                                 corpus_lsi,
                                                 num_features=400,
                                                 num_best=3)
        cut_raw = self.segment(text)  # 1. segment the text
        corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a BoW vector
        corpus_tfidf = tfidf_model[corpus]  # 3. compute the tfidf vector
        corpus_lsi = lsi[corpus_tfidf]  # 4. compute the LSI vector
        sims = similarity_lsi[corpus_lsi]

        with open('./data/idx_dic.dic', 'r') as f:
            dt = f.read()
            idx_dic = eval(dt)

        result = []
        if sims is not None:
            result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]

        return result
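A possible call, assuming the ./models/{prefix}_* files and ./data/idx_dic.dic exist as loaded above and the instance defines segment() and keep_val; the instance name, prefix and query text are placeholders:

hits = engine.calc_similarity(prefix='faq', text='如何重置登录密码')
print(hits)  # entries from idx_dic whose LSI similarity exceeds self.keep_val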
Example #24
def classify(k=None):
    corpora_documents = []
    f = codecs.open("result.txt", 'r', encoding="utf-8").readlines()
    f2 = codecs.open("label_level1.txt", 'r', encoding="utf-8").readlines()
    for li in f[:13000]:
        li = li.split()
        corpora_documents.append(li)

    # build the dictionary and vectorized corpus
    dictionary = corpora.Dictionary(corpora_documents)
    #dictionary.save('dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    index1 = similarities.Similarity('-Similarity-index',
                                     corpus_tfidf,
                                     num_features=25000)
    #index1.save('documents.index')

    #test_data=[]
    count = 0
    for i, li in enumerate(f[13000:]):
        li = li.split()
        test_corpus_1 = dictionary.doc2bow(li)
        #k=11
        index1.num_best = k
        index = index1[tfidf[test_corpus_1]]
        print(index)
        predictions = {}
        for m, n in index:
            for key in f2[m].split():
                if key not in predictions.keys():
                    predictions[key] = 1
                else:
                    predictions[key] += 1
        true_label = f2[13000 + i].split()
        prediction = []
        for key in predictions.keys():
            if predictions[key] >= k / 2:
                prediction.append(key)
        true_label.sort()
        prediction.sort()
        print("true label:", true_label)
        if len(prediction) == 0:
            dict_sorted = sorted(predictions.items(),
                                 key=lambda x: x[1],
                                 reverse=True)
            prediction.append(dict_sorted[0][0])
            if true_label == prediction:
                count += 1
                print(1)
        elif true_label == prediction:
            count += 1
            print(1)
        else:
            print(0)
        print("predict", prediction)
        print(predictions)
        print("#####################################")
    print("count:", count)
Example #25
    def make_index(self, seqs):
        print "building index for sequences"
        #import pdb;pdb.set_trace()
        if self.use_lsi:
            if self.use_tfidf:
                seqs = (self.lsi_model[self.tfidf_model[self.lexicon.doc2bow(
                    tokenize(seq))]] for seq in seqs)
            else:
                seqs = (self.lsi_model[self.lexicon.doc2bow(tokenize(seq))]
                        for seq in seqs)
            num_features = self.lsi_model.num_topics
        else:
            if self.use_tfidf:
                seqs = (self.tfidf_model[self.lexicon.doc2bow(tokenize(seq))]
                        for seq in seqs)
            else:
                seqs = (self.lexicon.doc2bow(tokenize(seq)) for seq in seqs)
            num_features = len(self.lexicon.keys())

        self.index = similarities.Similarity(output_prefix=self.index_filepath,
                                             corpus=None,
                                             num_features=num_features)
        self.index.save(self.index_filepath)
        self.index.add_documents(seqs)
        self.index.save(self.index_filepath)
        print "saved index to", self.index_filepath
Example #26
    def __init__(self):
        self.corpus = []

        for bug in bugzilla.get_bugs():
            textual_features = "{} {}".format(bug["summary"],
                                              bug["comments"][0]["text"])
            textual_features = text_preprocess(textual_features)
            self.corpus.append([bug["id"], textual_features])

        # Assigning unique integer ids to all words
        self.dictionary = Dictionary(text for bug_id, text in self.corpus)

        # Conversion to BoW
        corpus_final = [
            self.dictionary.doc2bow(text) for bug_id, text in self.corpus
        ]

        # Initialize and apply the tfidf transformation model on the same corpus; the resulting corpus has the same dimensions
        tfidf = models.TfidfModel(corpus_final)
        corpus_tfidf = tfidf[corpus_final]

        # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
        self.lsi = models.LsiModel(corpus_tfidf,
                                   id2word=self.dictionary,
                                   num_topics=300)
        corpus_lsi = self.lsi[corpus_tfidf]

        # Indexing the corpus
        self.index = similarities.Similarity(output_prefix="simdata.shdat",
                                             corpus=corpus_lsi,
                                             num_features=300)
Example #27
def d2b():
    # read the files
    raw_documents=[]
    a=os.listdir("C:\Users\mxf\Desktop\docsim-master\\traindata")
    a.sort(key= lambda x:int(x[:-4]))
    # print a
    for name in a:
        f = open(os.path.join("C:\Users\mxf\Desktop\docsim-master\\traindata", name), 'r')
        # raw = str(os.path.join(root, name))+" "
        raw=""
        raw += f.read()
        # raw holds the document content
        raw_documents.append(raw)

    # load the stopword list
    stop = [line.strip().decode('utf-8') for line in open('stopwordd2b.txt').readlines() ]

    # build the corpus documents
    corpora_documents = []
    for item_text in raw_documents:
        item_str=[]
        item = pseg.cut(item_text)  # segment with jieba (POS tagging)
        for i in list(item):
            item_str.append(i.word)
        item_str=a_sub_b(item_str,list(stop))
        corpora_documents.append(item_str)

    # build the dictionary and vectorized corpus
    dictionary = corpora.Dictionary(corpora_documents)  # collect the vocabulary and map each word to an integer id
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]  # each document becomes a sparse vector; [(0,1),(1,1)] means the words with ids 0 and 1 each occur once
    similarity = similarities.Similarity('-Similarity-index11', corpus, num_features=len(dictionary))

    f=open('test_data.txt','r')
    fa=f.readlines()
    dt=dict()
    for li in fa:
        print li
        test_data_1=li.split('\n')[0].split(',')
        ind=test_data_1[0]
        test_cut = pseg.cut(test_data_1[1])
        test_cut_raw_1=[]
        for i in list(test_cut):
            test_cut_raw_1.append(i.word)
        test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)

        # return the top 101 records so the TOP-100 of both methods can be compared
        similarity.num_best = 101
        print('################################')
        for i in similarity[test_corpus_1]:
            sim=""
            for j in corpora_documents[i[0]]:
                sim+=j
            ind2=i[0]+1
            print ind,i[0]+1,i[1]     # i[0]+1 is the 1-based index of the matched sentence; i[1] is its score, e.g. 2784,2784,1.0
            if int(ind) == ind2:
                print "same"
            else:
                data_pro.addtodict2(dt, int(ind), int(ind2), i[1])
    return dt
 def similarity_vec_lsi(self):
     lsi = models.LsiModel(self._doc_tf_idf_vec)
     doc_lsi = lsi[self._doc_tf_idf_vec]
     similarity_vec_lsi = similarities.Similarity("Similarity-LSI-index",
                                                  corpus=doc_lsi,
                                                  num_features=len(
                                                      self._dictionary))
     return lsi, similarity_vec_lsi
Example #29
def ge_process(raw_documents):

    corpora_documents = []
    # word segmentation
    for item_text in raw_documents:
        item_seg = list(jieba.cut(item_text))
        corpora_documents.append(item_seg)
    # build the dictionary
    dictionary = corpora.Dictionary(corpora_documents)

    # term-frequency counts in sparse form; here effectively a 16x384 matrix (16 documents, 384 words)
    # dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
    #   1. drop tokens that appear in fewer than no_below documents
    #   2. drop tokens that appear in more than no_above (a fraction) of the documents
    #   3. of what remains, keep only the keep_n most frequent tokens
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # compute TF-IDF
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    # compute similarities
    similarity = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf, num_features=600)
    test_data_1 = '北京雾霾红色预警'
    test_cut_raw_1 = list(jieba.cut(test_data_1))  # ['北京', '雾', '霾', '红色', '预警']
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)  # e.g. [(51, 1), (59, 1)]; the exact dictionary ids may vary
    # number of most-similar samples to return
    similarity.num_best = 5
    test_corpus_tfidf_1 = tfidf_model[test_corpus_1]  # compute the query's TF-IDF vector with the trained model,
                                                      # e.g. [(51, 0.7071067811865475), (59, 0.7071067811865475)]
    print(similarity[test_corpus_tfidf_1])  # [(2, 0.3595932722091675)]

    # similarity via latent semantic indexing, starting from tf-idf (BoW vectors could also be used directly)
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('Similarity-LSI-index', corpus_lsi, num_features=400, num_best=2)
    test_data_3 = '长沙街头发生砍人事件致6人死亡'
    test_cut_raw_3 = list(jieba.cut(test_data_3))
    test_corpus_3 = dictionary.doc2bow(test_cut_raw_3)
    test_corpus_tfidf_3 = tfidf_model[test_corpus_3]
    test_corpus_lsi_3 = lsi[test_corpus_tfidf_3]
    print(similarity_lsi[test_corpus_lsi_3])

# LDA-based topic model
# term-frequency matrix of shape (395, 4258): 395 documents, 4258 words
X = lda.datasets.load_reuters()

def lda_process(X)
Example #30
    def get_similarity(self, prepped_text):
        # take a prepped text, convert to LSI space
        vec_bow = self.dictionary.doc2bow(prepped_text)
        vec_lsi = self.model[vec_bow]
        # build the similarity index (output_prefix=None uses a temp file) and return the top matches
        index = similarities.Similarity(None, self.corpus, num_features=10, num_best=6)
        sims = index[vec_lsi]
        return sims