def create_documents_view(self, corpus, ir_mode):
        """Build the document-side model selected by ``ir_mode``.

        Supported modes:
            1 -- log-scaled term frequency computed in place from the
                 bag-of-words vectors (second item of each pair becomes
                 ``1 + log2(value)``);
            2 -- ``models.TfidfModel``;
            3 -- ``models.LdaModel``;
            4 -- ``models.LdaMulticore``;
            5 -- ``models.LsiModel``;
            6 -- ``models.RpModel``;
            7 -- ``models.LogEntropyModel``.

        Modes 2-7 are trained on the corpus read back from ``vsm_docs.mm``.

        Args:
            corpus: documents forwarded to ``self.create_dictionary`` and
                ``self.docs2bows``.
            ir_mode: integer mode selector, see the list above.

        Returns:
            A ``(model, dictionary)`` tuple.  For mode 1 ``model`` is a list
            (one entry per document) of re-weighted pairs; for the other
            modes it is the trained gensim model object.

        Raises:
            ValueError: if ``ir_mode`` is not one of the supported modes.
        """
        supported_modes = (1, 2, 3, 4, 5, 6, 7)
        # Fail fast: previously an unknown mode fell through every branch and
        # crashed with UnboundLocalError at the return statement, after all
        # the preprocessing work had already been done.
        if ir_mode not in supported_modes:
            raise ValueError(
                "unsupported ir_mode %r; expected one of %r"
                % (ir_mode, supported_modes))

        dictionary, pdocs = self.create_dictionary(corpus)
        bow = self.docs2bows(corpus, dictionary, pdocs)
        # Recover the corpus from disk.
        # NOTE(review): presumably 'vsm_docs.mm' is written by docs2bows --
        # confirm against that method.
        loaded_corpus = corpora.MmCorpus('vsm_docs.mm')

        if ir_mode == 1:
            # TF model: keep the first item of each pair, log-scale the second.
            model = [[(w[0], 1 + np.log2(w[1])) for w in v]
                     for v in bow]
        else:
            model_classes = {
                2: models.TfidfModel,        # TF-IDF model
                3: models.LdaModel,          # LDA model
                4: models.LdaMulticore,      # LDA Multicore model
                5: models.LsiModel,          # LSI model
                6: models.RpModel,           # RP model
                7: models.LogEntropyModel,   # Log-entropy model
            }
            model = model_classes[ir_mode](loaded_corpus)

        return model, dictionary
Esempio n. 2
0
    def train(self, arg_fname, is_pre=True, method='lsi', **params):
        """Train a TF-IDF model plus the semantic model named by ``method``.

        The TF-IDF model is always trained and stored on ``self.tfidf``.
        Depending on ``method`` one further model is trained, a
        ``MatrixSimilarity`` index is built over the transformed corpus and
        the transformed corpus itself is stored on ``self.para``.

        Args:
            arg_fname: stored on ``self.fname`` before ``_generate_conf`` runs.
            is_pre: when true, documents, dictionary and corpus come from
                ``self._preprocess()``; otherwise they are loaded from the
                paths in ``self.conf`` (``fname_docs``, ``fname_dict``,
                ``fname_corpus``).
            method: one of ``'lsi'``, ``'lda_tfidf'``, ``'lda'``,
                ``'logentropy'``.
            **params: extra keyword arguments forwarded to the LSI / LDA
                model constructor.

        Raises:
            NotImplementedError: if ``method`` is not one of the names above.
        """
        self.fname = arg_fname
        self.method = method
        self._generate_conf()
        if is_pre:
            self.docs, self.dictionary, corpus = self._preprocess()
        else:
            # NOTE(security): pickle executes arbitrary code on load; only
            # point 'fname_docs' at files this application wrote itself.
            # Binary mode is required for pickle data and the context
            # manager makes sure the handle is closed.
            with open(self.conf['fname_docs'], 'rb') as docs_file:
                self.docs = pickle.load(docs_file)
            self.dictionary = corpora.Dictionary.load(self.conf['fname_dict'])
            corpus = corpora.MmCorpus(self.conf['fname_corpus'])

        # ``params`` is collected via ``**`` and is therefore always a dict,
        # so no ``None`` guard is needed.

        logger.info("training TF-IDF model")
        self.tfidf = models.TfidfModel(corpus, id2word=self.dictionary)
        corpus_tfidf = self.tfidf[corpus]

        if method == 'lsi':
            logger.info("training LSI model")
            self.lsi = models.LsiModel(corpus_tfidf,
                                       id2word=self.dictionary,
                                       **params)
            self.lsi.print_topics(-1)
            self.para = self.lsi[corpus_tfidf]
            self.lsi_similarity_index = similarities.MatrixSimilarity(
                self.para)
        elif method == 'lda_tfidf':
            logger.info("training LDA model")
            # try 6 workers here instead of original 8
            self.lda_tfidf = models.LdaMulticore(corpus_tfidf,
                                                 id2word=self.dictionary,
                                                 workers=6,
                                                 **params)
            self.lda_tfidf.print_topics(-1)
            # Use the model trained just above; the previous code indexed
            # with ``self.lda``, which is only set by the 'lda' branch.
            self.para = self.lda_tfidf[corpus_tfidf]
            self.lda_tfidf_similarity_index = similarities.MatrixSimilarity(
                self.para)
        elif method == 'lda':
            logger.info("training LDA model")
            # try 6 workers here instead of original 8
            self.lda = models.LdaMulticore(corpus,
                                           id2word=self.dictionary,
                                           workers=6,
                                           **params)
            self.lda.print_topics(-1)
            self.para = self.lda[corpus]
            self.lda_similarity_index = similarities.MatrixSimilarity(
                self.para)
        elif method == 'logentropy':
            logger.info("training a log-entropy model")
            self.logent = models.LogEntropyModel(corpus,
                                                 id2word=self.dictionary)
            self.para = self.logent[corpus]
            self.logent_similarity_index = similarities.MatrixSimilarity(
                self.para)
        else:
            msg = "unknown semantic method %s" % method
            logger.error(msg)
            raise NotImplementedError(msg)
Esempio n. 3
0
    def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        # NOTE(review): this rebinds the module-level ``bg_corpus`` and
        # ``corpus`` to their bag-of-words form, so the test is only
        # repeatable if the globals are reloaded before each run -- confirm
        # against the fixture setup elsewhere in this file.
        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi.
        # Materialize once: the wrapped corpus is lazy, so iterating it inside
        # a nested loop would redo the whole transformation for every row.
        corpus_lsi = list(lsi[log_ent[corpus]])

        # compute pairwise similarities for the strict upper triangle only
        # (pairs with i < j); these are the only entries compared below.
        upper = np.triu_indices(len(corpus), 1)
        flat = np.array([
            matutils.cossim(corpus_lsi[i], corpus_lsi[j])
            for i, j in zip(*upper)
        ])

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s", cor)
        # assertGreater reports the actual coefficient on failure.
        self.assertGreater(cor, 0.6)
Esempio n. 4
0
def query_similarity(queries, corpus, method='tfidf', n_neighbors=2):
    """Find the documents of ``corpus`` most similar to each query.

    A model chosen by ``method`` is trained on the bag-of-words corpus
    returned by ``build_corpusdic(corpus)``, a ``MatrixSimilarity`` index is
    built over the transformed corpus, and every query is lower-cased,
    whitespace-tokenized, transformed and scored against that index.

    Args:
        queries: iterable of query strings.
        corpus: documents passed to ``build_corpusdic``.
        method: one of ``'lsi'``, ``'tfidf'``, ``'rp'``, ``'hdp'``, ``'lda'``,
            ``'lem'`` (log-entropy) or ``'norm'``.
        n_neighbors: number of best matches kept per query.

    Returns:
        ``(indx_list, sim_list)``: two lists with one numpy array per query,
        holding the document indices and the similarity scores of the best
        matches in descending score order.  Indices and scores are split out
        of one shared array, so both arrays have the same dtype.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    dictionary, corpusdic = build_corpusdic(corpus)
    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        # Train on the bag-of-words corpus like every other branch; the raw
        # ``corpus`` is not in the vector format the model is applied to.
        mdl = models.NormModel(corpusdic, norm='l2')
    else:
        raise ValueError("There is an invalid model method in the input!")

    index = similarities.MatrixSimilarity(mdl[corpusdic])
    indx_list = []
    sim_list = []
    for query in queries:
        vec_bow = dictionary.doc2bow(query.lower().split())
        query_vec = mdl[vec_bow]  # convert the query to the model's space
        sims = index[query_vec]
        ranked = sorted(enumerate(sims), key=lambda item: -item[1])
        top = ranked[:n_neighbors]
        # reshape(-1, 2) keeps the two-row unpacking valid when ``top`` is
        # empty (empty corpus or n_neighbors == 0).
        indx_, sim_ = np.array(top).reshape(-1, 2).transpose()
        indx_list.append(indx_)
        sim_list.append(sim_)
    return indx_list, sim_list
Esempio n. 5
0
def build_doc_similarity_table(corpus,
                               method='tfidf',
                               n_neighboors=3,
                               return_similarity=True,
                               batch_size=5000,
                               doc_dtype=np.int64,
                               score_dtype=np.float16):
    """
    Batch-processing wrapper around ``find_similar_docs`` that finds the
    ``n_neighboors`` most similar documents for every document in ``corpus``.

    A model chosen by ``method`` is trained on the bag-of-words corpus
    returned by ``build_corpusdic(corpus)``; the full document-by-document
    similarity matrix is computed with ``MatrixSimilarity`` and then handed
    to ``find_similar_docs`` in row batches of ``batch_size``.

    *Note 1*: increasing ``batch_size`` can increase memory usage, but can be
    faster.

    Args:
        corpus: documents passed to ``build_corpusdic``.
        method: one of ``'lsi'``, ``'tfidf'``, ``'rp'``, ``'hdp'``, ``'lda'``,
            ``'lem'`` (log-entropy) or ``'norm'``.
        n_neighboors: number of similar documents recorded per document.
        return_similarity: when true, also fill and return the score table;
            the flag is forwarded to ``find_similar_docs``.
        batch_size: number of documents queried per batch; must be positive.
        doc_dtype: dtype of the returned index table.
        score_dtype: dtype of the returned score table.

    Returns:
        ``(similarity_table, similarity_score)``; both have shape
        ``(doc_count, n_neighboors)``.  ``similarity_score`` is ``None`` when
        ``return_similarity`` is false.

    Raises:
        ValueError: if ``method`` is unknown or ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A non-positive step would never advance the batching loop.
        raise ValueError("batch_size must be a positive integer")

    # TODO: this structure could be parallelized across multiple CPUs.

    dictionary, corpusdic = build_corpusdic(corpus)

    if method == 'lsi':
        model = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        model = models.TfidfModel(corpusdic)
    elif method == 'rp':
        model = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        model = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        model = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        model = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        # Train on the bag-of-words corpus like every other branch; the raw
        # ``corpus`` is not in the vector format the model is applied to.
        model = models.NormModel(corpusdic, norm='l2')
    else:
        raise ValueError("There is an invalid model method in the input!")

    # Determine the similarities between all pairs of documents.
    transformed = model[corpusdic]
    index = similarities.MatrixSimilarity(transformed)
    doc_matrix = index[transformed]

    doc_count = doc_matrix.shape[0]

    similarity_table = np.zeros((doc_count, n_neighboors)).astype(doc_dtype)

    similarity_score = None
    if return_similarity:
        similarity_score = np.zeros(
            (doc_count, n_neighboors)).astype(score_dtype)

    for start_ in range(0, doc_count, batch_size):
        # The last batch may be shorter than batch_size.
        end_ = min(start_ + batch_size, doc_count)

        query_index = np.arange(start_, end_)
        query_matrix = doc_matrix[query_index]

        (similarity_table[query_index], similarities_) = \
            find_similar_docs(doc_matrix, query_matrix,
                              n_neighboors=n_neighboors,
                              method='nearest',
                              return_similarity=return_similarity)

        if return_similarity:
            similarity_score[query_index] = similarities_

    return (similarity_table, similarity_score)
Esempio n. 6
0
def gen_data(train, test, no_below, no_above, num_topics=300):
    """Generate train/test feature matrices with several gensim models.

    Builds a dictionary and bag-of-words corpora through ``My_dictionary``,
    saves them with ``save_load``, then for TF-IDF, LSI, RP, HDP and
    log-entropy models: fits the model on the training split, transforms both
    splits, converts them with ``corpustoarray`` and writes each matrix to a
    CSV file.

    NOTE: the output paths use a literal backslash separator
    (``'file\\\\...'``), i.e. they address a ``file`` directory on Windows only.

    Args:
        train: training documents, forwarded to ``My_dictionary``.
        test: test documents, forwarded to ``My_dictionary``.
        no_below: forwarded to ``My_dictionary``.
        no_above: forwarded to ``My_dictionary``.
        num_topics: target dimensionality for the LSI and RP models.
    """
    mydictionary = My_dictionary(train,
                                 test,
                                 no_below=no_below,
                                 no_above=no_above)
    train_corpus = mydictionary.train_corpus
    test_corpus = mydictionary.test_corpus
    dictionary = mydictionary.dictionary

    save_load(mode='save',
              files={
                  'train_corpus': train_corpus,
                  'test_corpus': test_corpus,
                  'dictionary': dictionary
              })

    # TF-IDF
    print('tfidf...')
    tfidf = models.TfidfModel(train_corpus)  # step 1 -- initialize a model
    train_corpus_tfidf = tfidf[train_corpus]
    # apply the transformation to the whole corpus
    test_corpus_tfidf = tfidf[test_corpus]
    train_tfidf_array = corpustoarray(train_corpus_tfidf)
    test_tfidf_array = corpustoarray(test_corpus_tfidf)

    pd.DataFrame(train_tfidf_array).to_csv('file\\train_tfidf.csv')
    pd.DataFrame(test_tfidf_array).to_csv('file\\test_tfidf.csv')

    # LSI -- a num_topics of 200-500 is regarded as the "gold standard"
    print('lsi...')
    lsi = models.LsiModel(train_corpus_tfidf,
                          id2word=dictionary,
                          num_topics=num_topics)  # initialize an LSI model
    # double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    train_corpus_lsi = lsi[train_corpus_tfidf]
    # The model was fitted on TF-IDF vectors, so the test split must go
    # through the same bow->tfidf step (previously raw bow was used here,
    # giving test features on a different scale than the training ones).
    test_corpus_lsi = lsi[test_corpus_tfidf]
    train_lsi_array = corpustoarray(train_corpus_lsi)
    test_lsi_array = corpustoarray(test_corpus_lsi)
    pd.DataFrame(train_lsi_array).to_csv('file\\train_lsi.csv')
    pd.DataFrame(test_lsi_array).to_csv('file\\test_lsi.csv')

    # RP
    print('rp...')
    rp = models.RpModel(train_corpus_tfidf,
                        id2word=dictionary,
                        num_topics=num_topics)
    train_corpus_rp = rp[train_corpus_tfidf]
    # Same as for LSI: project the TF-IDF-weighted test split, matching the
    # input the model was fitted on.
    test_corpus_rp = rp[test_corpus_tfidf]
    train_rp_array = corpustoarray(train_corpus_rp)
    test_rp_array = corpustoarray(test_corpus_rp)
    pd.DataFrame(train_rp_array).to_csv('file\\train_rp.csv')
    pd.DataFrame(test_rp_array).to_csv('file\\test_rp.csv')

    # LDA (first proposed by Blei, Ng and Jordan in 2003) was attempted here
    # via models.LdaSeqModel(corpus, id2word=dictionary, num_topics=300) but
    # is disabled because it was reported as buggy.

    # HDP    2011   Wang, Paisley, Blei:  http://proceedings.mlr.press/v15/wang11a/wang11a.pdf
    print('hdp...')
    hdp = models.HdpModel(train_corpus, id2word=dictionary)
    train_corpus_hdp = hdp[train_corpus]
    test_corpus_hdp = hdp[test_corpus]
    train_hdp_array = corpustoarray(train_corpus_hdp)
    test_hdp_array = corpustoarray(test_corpus_hdp)
    pd.DataFrame(train_hdp_array).to_csv('file\\train_hdp.csv')
    pd.DataFrame(test_hdp_array).to_csv('file\\test_hdp.csv')

    # Log Entropy Model   2015
    print('log...')
    log = models.LogEntropyModel(train_corpus)
    train_corpus_log = log[train_corpus]
    test_corpus_log = log[test_corpus]
    train_log_array = corpustoarray(train_corpus_log)
    test_log_array = corpustoarray(test_corpus_log)
    pd.DataFrame(train_log_array).to_csv('file\\train_log.csv')
    pd.DataFrame(test_log_array).to_csv('file\\test_log.csv')