Example #1
    def testPersistence(self):
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(testfile())
        model2 = tfidfmodel.TfidfModel.load(testfile())
        self.assertTrue(model.idfs == model2.idfs)
        tstvec = []
        self.assertTrue(numpy.allclose(
            model[tstvec], model2[tstvec]))  # try projecting an empty vector
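This test leans on the suite's `self.corpus` fixture and its `testfile()` temp-path helper. A self-contained sketch of the same save/load round trip, with a toy corpus and a temp file standing in for those fixtures (both are assumptions, not part of the test):

import os
import tempfile

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["human", "computer", "interface"], ["survey", "user", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

model = TfidfModel(corpus, normalize=True)
path = os.path.join(tempfile.gettempdir(), "tfidf_roundtrip.model")
model.save(path)
model2 = TfidfModel.load(path)

assert model.idfs == model2.idfs           # the idf table survives the round trip
assert np.allclose(model[[]], model2[[]])  # projecting an empty vector yields []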
Example #2
    def testPersistenceCompressed(self):
        fname = testfile() + '.gz'
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
        self.assertTrue(model.idfs == model2.idfs)
        tstvec = []
        self.assertTrue(numpy.allclose(
            model[tstvec], model2[tstvec]))  # try projecting an empty vector
Example #3
    def test_init(self):
        # create the transformation model by analyzing a corpus
        # uses the global `common_corpus`!
        model1 = tfidfmodel.TfidfModel(common_corpus)
        dfs = common_dictionary.dfs

        # make sure the dfs<->idfs transformation works
        self.assertEqual(model1.dfs, dfs)
        self.assertEqual(
            model1.idfs,
            tfidfmodel.precompute_idfs(model1.wglobal, dfs,
                                       len(common_corpus)))

        # create the transformation model by directly supplying a term->docfreq
        # mapping via the global `common_dictionary`.
        model2 = tfidfmodel.TfidfModel(dictionary=common_dictionary)
        self.assertEqual(model1.idfs, model2.idfs)
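The dfs<->idfs check above works because, unless `wglobal` is overridden, gensim derives each idf from the document frequency with a base-2 logarithm (df2idf(df, D) = log2(D / df) in recent versions). A standalone sketch of that relationship, with a toy corpus as an assumption:

import math

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["human", "computer", "interface"], ["survey", "user", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

model = TfidfModel(corpus)
total_docs = len(corpus)
for termid, df in model.dfs.items():
    # default wglobal: idf = log2(total_docs / df)
    assert abs(model.idfs[termid] - math.log(total_docs / df, 2)) < 1e-12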
Example #4
    def testTransform(self):
        # create the transformation model
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]

        expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
        self.assertTrue(numpy.allclose(transformed, expected))
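The expected weights are simply 1/sqrt(3) ≈ 0.57735: the first document contains three distinct terms, each occurring once and (in this fixture) each with the same idf, so cosine normalization divides every entry by the vector's L2 norm. A one-line check of that arithmetic:

import math
# three equal weights w, cosine-normalized: w / sqrt(3 * w**2) == 1 / sqrt(3)
print(1 / math.sqrt(3))  # 0.5773502691896258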
Example #5
    def analyse(self, texts):
        dictionary = corpora.Dictionary(texts)

        corpus = [dictionary.doc2bow(text) for text in texts]

        tfidf_model = tfidfmodel.TfidfModel(corpus, normalize=True)
        tfidf_corpus = tfidf_model[corpus]

        lsi_model = models.LsiModel(tfidf_corpus,
                                    id2word=dictionary, num_topics=400)

        return Analysed(dictionary, tfidf_corpus, lsi_model)
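One design note: `analyse` returns the transformed corpus but not the tf-idf model itself, so a new query cannot be folded into the LSI space exactly as the training documents were. A standalone sketch of the same pipeline that keeps the tf-idf model around for querying (the toy corpus and query are assumptions):

from gensim import corpora, models, similarities

texts = [["human", "computer", "interface"], ["survey", "user", "computer"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf_model = models.TfidfModel(corpus, normalize=True)
tfidf_corpus = tfidf_model[corpus]
lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=2)

# fold a query through the same tfidf -> lsi chain, then rank documents by similarity
index = similarities.MatrixSimilarity(lsi_model[tfidf_corpus], num_features=2)
query = lsi_model[tfidf_model[dictionary.doc2bow(["computer", "survey"])]]
print(sorted(enumerate(index[query]), key=lambda item: -item[1]))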
Example #6
    def tfidfmodel(self, bowlist=None, save=False, savename=None):
        """
        tf-idf is the product of two factors. tf (term frequency) is how often
        a word occurs in a sentence: a word repeated within a sentence gets a
        larger tf. idf (inverse document frequency) is the log of the inverse
        of the fraction of documents in the corpus that contain the word: a
        word that appears throughout the corpus gets a small idf. A word that
        occurs often in one sentence but rarely in the corpus as a whole must
        matter to that sentence, so its tf-idf will be large.

        Build a tf-idf model.
        :param bowlist: the list of bag-of-words vectors
        :param save: whether to save the model
        :param savename: the file name for the saved model
        :return: the model itself
        """
        print('using Tfidfmodel')
        tfidf = tfidfmodel.TfidfModel(bowlist)
        if save:
            print('Saving TF-IDF model to file: {}'.format(savename))
            tfidf.save(savename)
        return tfidf
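To make the docstring's reasoning concrete, here is a small standalone illustration (the toy documents are an assumption): a word repeated within one document but absent elsewhere dominates that document's vector, while a word present in every document gets idf log2(3/3) = 0 and is dropped from the output entirely.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [
    ["cat", "cat", "sat"],  # "cat" is frequent here and rare elsewhere...
    ["dog", "sat"],         # ...while "sat" appears in every document
    ["bird", "sat"],
]
dictionary = Dictionary(texts)
bowlist = [dictionary.doc2bow(text) for text in texts]

tfidf = TfidfModel(bowlist)
print(tfidf[bowlist[0]])  # only "cat" survives; "sat" has idf 0 and is filtered out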
Example #7
def vectorize(ori_data, train_file_path, test_file_path):
    train_data = ori_data['train']
    corpus = []
    for label, data in train_data:
        corpus.append(data)
    dic = dictionary.Dictionary(corpus)
    corpus = [dic.doc2bow(doc) for doc in corpus]
    tfidf = tfidfmodel.TfidfModel(corpus)
    # training data
    fp = open(train_file_path, 'w')
    for label, data in train_data:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
    fp = open(test_file_path, 'w')
    for label, data in ori_data['test']:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
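Note that the `sparse()` formatter called above is not defined in this snippet. A plausible stand-in, purely hypothetical and named only to match the call sites, would render the (id, weight) pairs as libsvm-style id:weight fields:

def sparse(vec):
    # vec is a gensim sparse vector: a list of (feature_id, weight) pairs.
    # Emit libsvm-style "id:weight" fields, sorted by feature id.
    return ' '.join('%d:%f' % (fid, weight) for fid, weight in sorted(vec))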
Example #8
    def _update_keywords(self):
        """
        For each topic in the corpus, use TF-IDF to generate a token-to-weight
        mapping of the `self.num_keywords` most important keywords.
        """
        corpus_bow = [
            self.dictionary.doc2bow(data['body'])
            for data in self.data.values()
        ]
        tfidf = tfidfmodel.TfidfModel(corpus_bow, smartirs=self.tfidf_scheme)

        for tid, data in self.data.items():
            weights = tfidf[self.dictionary.doc2bow(data['body'])]
            weights.sort(key=lambda x: x[1], reverse=True)
            # generate token-to-weight mapping instead of id-to-weight mapping
            data['keywords'] = {
                self.dictionary[wid]: weight
                for wid, weight in weights[:self.num_keywords]
            }
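`smartirs` selects the weighting scheme via a SMART notation string; what `self.tfidf_scheme` holds is not shown here. A condensed standalone sketch of the same keyword-extraction pattern, hard-coding 'nfc' (one of the schemes exercised in the tests below) as an assumption:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = {"t1": ["cats", "purr", "cats"], "t2": ["dogs", "bark"]}
dictionary = Dictionary(docs.values())
tfidf = TfidfModel([dictionary.doc2bow(body) for body in docs.values()],
                   smartirs='nfc')  # assumed scheme; the class above reads it from config

keywords = {}
for tid, body in docs.items():
    weights = sorted(tfidf[dictionary.doc2bow(body)], key=lambda x: x[1], reverse=True)
    # token-to-weight mapping for the top 2 keywords of each topic
    keywords[tid] = {dictionary[wid]: weight for wid, weight in weights[:2]}
print(keywords)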
Example #9
    def test_wlocal_wglobal(self):
        def wlocal(tf):
            assert isinstance(tf, np.ndarray)
            return iter(tf + 1)

        def wglobal(df, total_docs):
            return 1

        docs = [corpus[1], corpus[2]]
        model = tfidfmodel.TfidfModel(corpus,
                                      wlocal=wlocal,
                                      wglobal=wglobal,
                                      normalize=False)
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [
            [(termid, weight + 1) for termid, weight in docs[0]],
            [(termid, weight + 1) for termid, weight in docs[1]],
        ]

        self.assertTrue(
            np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0])))
        self.assertTrue(
            np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1])))
Example #10
    def testPersistence(self):
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(testfile())
        model2 = tfidfmodel.TfidfModel.load(testfile())
        self.assertTrue(model.idfs == model2.idfs)
Example #11
def tfidf_calc(doc_corpus, num_features):
    tfidf_model = tfidfmodel.TfidfModel(doc_corpus)
    tfidf = tfidf_model[doc_corpus]
    index = docsim.Similarity('', tfidf, num_features=num_features)
    return tfidf, index[tfidf]
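A usage note for this helper: `docsim.Similarity` is gensim's shard-based index, and the empty output prefix passed here appears to drop the shard files relative to the current working directory, which is worth knowing before calling it. Toy usage (the corpus is an assumption):

from gensim.corpora import Dictionary

texts = [["human", "computer", "interface"], ["survey", "user", "computer"]]
dictionary = Dictionary(texts)
doc_corpus = [dictionary.doc2bow(text) for text in texts]

tfidf, sims = tfidf_calc(doc_corpus, num_features=len(dictionary))
print(sims)  # sims[i][j] is the cosine similarity between documents i and j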
Example #12
def sim_update(results):
    """
    Update Models.
    :param results:
    :return:
    """

    shutil.rmtree(lsitemp, ignore_errors=True)
    mkdir(lsitemp)

    t_total_begin = time.time()

    # print("Checking repeat ...")
    # results_temp = check_repet_new(results)
    results_temp = results
    # print("Check repeat complete!")
    print("Prefix mapping ...")
    results = prefix_map(results_temp)
    print("Prefix map complete!")
    del results_temp

    print("Building LSI model ...")

    # Extended Dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    # Load Models
    corpus_raw = corpora.MmCorpus(lsipath + 'viva.mm')
    lsi = lsimodel.LsiModel.load(lsipath +
                                 'viva.lsi')  # map the corpus from the .mm file into LSI space

    mkdir(news_post_add)

    # Preprocessing text. Get corpus_add.
    for postfile in results:
        deltags = stripTags(postfile['text'])
        text_del = delstopwords("".join(deltags.split()))
        # text_vec = jieba.lcut(text_del)
        with open(news_post_add + postfile['name'], 'w') as fp:
            fp.write(text_del)

    files = os.listdir(news_post_add)
    for i in files:
        shutil.copy(news_post_add + i, docpath)

    from dict_stream_train import getDictionary
    dict2 = getDictionary(lsipath=lsitemp, docpath=news_post_add)
    dict2 = corpora.Dictionary.load(lsitemp + 'viva.dict')

    from corpus_stream_train import getCorpus
    corpus2 = getCorpus(lsipath=lsitemp, docpath=news_post_add)
    corpus2 = corpora.MmCorpus(lsitemp + 'viva.mm')

    dict2_to_dict1 = dictionary.merge_with(dict2)
    # dict2_to_dict1.save(lsipath + 'viva2.dict')
    # dict2_to_dict1 = corpora.Dictionary.load(lsipath + 'viva2.dict')

    merged_corpus = itertools.chain(corpus_raw, dict2_to_dict1[corpus2])
    corpora.MmCorpus.serialize(lsipath + 'viva.mm', [i for i in merged_corpus])
    merged_corpus = corpora.MmCorpus(lsipath + 'viva.mm')

    # Get TF-IDF vectors of documents
    tfidf = tfidfmodel.TfidfModel(merged_corpus)
    print("Building tfidf model ...")
    corpus_tfidf = tfidf[merged_corpus]
    print("Building corpus_tfidf model ...")
    # Updated LSI Model

    # lsi.add_documents(corpus_tfidf, chunksize=chunksize, decay=DECAY_FACTOR)
    # # lsi.add_documents(corpus_tfidf, chunksize=chunksize)
    #
    # print("Builded lsi add documents to model ...")
    # # Updated Corpus
    # if not os.path.exists(lsipath):
    #     os.mkdir(lsipath)
    # # corpus = corpora.MmCorpus.serialize(lsipath + 'viva.mm', itertools.chain(corpus_raw, corpus2))

    lsi = lsimodel.LsiModel(corpus_tfidf,
                            id2word=dictionary,
                            num_topics=NUM_TOPIC,
                            chunksize=chunksize,
                            power_iters=2,
                            onepass=True)  # all other parameters left at their defaults

    lsi.save(lsipath + 'viva.lsi')
    lsi = models.lsimodel.LsiModel.load(lsipath + 'viva.lsi')
    index = similarities.docsim.Similarity(lsipath + 'viva.index',
                                           lsi[merged_corpus],
                                           num_features=NUM_TOPIC)
    # Save Models

    index.save(lsipath + 'viva.index')
    print("LSI model saved!")

    # Print elapsed time
    t2 = time.time()
    print("Total elapsed time is:", t2 - t_total_begin, "s")
Example #13
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()
    
    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig('gensim_%s' % language, resultDir = gensim_build.RESULT_DIR, acceptLangs = [language])

    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))
    
    corpus = MmCorpus(config.resultFile('bow.mm'))

    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    elif method == 'rp':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP)
Example #14
    def test_consistency(self):
        docs = [corpus[1], corpus[2]]

        # Test if `nfc` yields the default docs.
        model = tfidfmodel.TfidfModel(corpus, smartirs='nfc')
        transformed_docs = [model[docs[0]], model[docs[1]]]

        model = tfidfmodel.TfidfModel(corpus)
        expected_docs = [model[docs[0]], model[docs[1]]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `wlocal`
        # tnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='tnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = docs[:]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # nnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = docs[:]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # lnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)], [(5, 2.0), (9, 1.0), (10, 1.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # dnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='dnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)], [(5, 2.0), (9, 1.0), (10, 1.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # ann
        model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)], [(5, 1.0), (9, 0.75), (10, 0.75)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # bnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
                         [(5, 1), (9, 1), (10, 1)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Lnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)],
                         [(5, 1.4133901052), (9, 0.7066950526),
                          (10, 0.7066950526)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `wglobal`
        # nxn
        model = tfidfmodel.TfidfModel(corpus, smartirs='nxn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = docs[:]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # nfn
        model = tfidfmodel.TfidfModel(corpus, smartirs='nfn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 3.169925001442312), (4, 3.169925001442312),
                          (5, 1.584962500721156), (6, 3.169925001442312),
                          (7, 3.169925001442312), (8, 2.169925001442312)],
                         [(5, 3.169925001442312), (9, 3.169925001442312),
                          (10, 3.169925001442312)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # ntn
        model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 3.321928094887362), (4, 3.321928094887362),
                          (5, 1.736965594166206), (6, 3.321928094887362),
                          (7, 3.321928094887362), (8, 2.321928094887362)],
                         [(5, 3.473931188332412), (9, 3.321928094887362),
                          (10, 3.321928094887362)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # npn
        model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0), (7, 3.0),
                          (8, 1.8073549220576042)],
                         [(5, 2.0), (9, 3.0), (10, 3.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `normalize`
        # nnx
        model = tfidfmodel.TfidfModel(corpus, smartirs='nnx')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = docs[:]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # nnc
        model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 0.4082482905), (4, 0.4082482905),
                          (5, 0.4082482905), (6, 0.4082482905),
                          (7, 0.4082482905), (8, 0.4082482905)],
                         [(5, 0.81649658092772603), (9, 0.40824829046386302),
                          (10, 0.40824829046386302)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        model = tfidfmodel.TfidfModel(corpus,
                                      wlocal=lambda x: x,
                                      wglobal=lambda x, y: x * x,
                                      smartirs='nnc')

        transformed_docs = [model[docs[0]], model[docs[1]]]

        model = tfidfmodel.TfidfModel(corpus,
                                      wlocal=lambda x: x * x,
                                      wglobal=lambda x, y: x,
                                      smartirs='nnc')
        expected_docs = [model[docs[0]], model[docs[1]]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # nnu
        slope = 0.2
        model = tfidfmodel.TfidfModel(corpus, smartirs='nnu', slope=slope)
        transformed_docs = [model[docs[0]], model[docs[1]]]
        average_unique_length = 1.0 * sum(len(set(text))
                                          for text in texts) / len(texts)
        vector_norms = [
            (1.0 - slope) * average_unique_length + slope * 6.0,
            (1.0 - slope) * average_unique_length + slope * 3.0,
        ]
        expected_docs = [
            [(termid, weight / vector_norms[0]) for termid, weight in docs[0]],
            [(termid, weight / vector_norms[1]) for termid, weight in docs[1]],
        ]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # nnb
        slope = 0.2
        model = tfidfmodel.TfidfModel(dictionary=dictionary,
                                      smartirs='nnb',
                                      slope=slope)
        transformed_docs = [model[docs[0]], model[docs[1]]]
        average_character_length = sum(
            len(word) + 1.0 for text in texts for word in text) / len(texts)
        vector_norms = [
            (1.0 - slope) * average_character_length + slope * 36.0,
            (1.0 - slope) * average_character_length + slope * 25.0,
        ]
        expected_docs = [
            [(termid, weight / vector_norms[0]) for termid, weight in docs[0]],
            [(termid, weight / vector_norms[1]) for termid, weight in docs[1]],
        ]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
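The last two blocks exercise pivoted normalization: instead of the L2 norm, scheme 'nnu' divides each document vector by (1 - slope) * pivot + slope * u, where u is the document's number of unique terms and the pivot defaults to the corpus-wide average of u; 'nnb' does the same with character lengths. A condensed restatement of that arithmetic, reusing the test's `texts`/`docs` fixtures:

# pivoted unique normalization (smartirs='nnu'), restated:
slope = 0.2
pivot = 1.0 * sum(len(set(text)) for text in texts) / len(texts)  # average unique terms
norms = [(1.0 - slope) * pivot + slope * len(set(texts[i])) for i in (1, 2)]
expected = [[(termid, weight / norms[k]) for termid, weight in docs[k]] for k in (0, 1)]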
Example #15
from gensim.utils import to_unicode
from gensim.interfaces import TransformedCorpus
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, dictionary)
from gensim.models import (tfidfmodel, word2vec, ldamodel)

print('start')
train_set = []
for line in open('articles.txt'):
    items = line.strip().split('\t', 1)
    if len(items) < 2:
        continue
    words = items[1].strip().split(' ')
    train_set.append(words)

print('construct dict')
dic = dictionary.Dictionary(train_set)
print('doc2bow')
corpus = [dic.doc2bow(text) for text in train_set]
print('tfidf')
tfidf = tfidfmodel.TfidfModel(corpus)
print('tfidf corpus')
corpus_tfidf = tfidf[corpus]
print('lda model')
lda = ldamodel.LdaModel(corpus_tfidf, id2word=dic, num_topics=1000,
                        iterations=1300, alpha=0.15, eta=0.01)
print('corpus_lda')
corpus_lda = lda[corpus_tfidf]

lda.save('lda_model')

Example #16
    def test_consistency(self):
        docs = [corpus[1], corpus[2]]

        # Test if `ntc` yields the default docs.
        model = tfidfmodel.TfidfModel(corpus, smartirs='ntc')
        transformed_docs = [model[docs[0]], model[docs[1]]]

        model = tfidfmodel.TfidfModel(corpus)
        expected_docs = [model[docs[0]], model[docs[1]]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `wlocal`
        # nnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = docs[:]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # lnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)], [(5, 2.0), (9, 1.0), (10, 1.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # ann
        model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)], [(5, 1.0), (9, 0.75), (10, 0.75)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # bnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
                         [(5, 1), (9, 1), (10, 1)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Lnn
        model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0),
                          (8, 1.0)],
                         [(5, 1.4133901052), (9, 0.7066950526),
                          (10, 0.7066950526)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `wglobal`
        # ntn
        model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 3.169925001442312), (4, 3.169925001442312),
                          (5, 1.584962500721156), (6, 3.169925001442312),
                          (7, 3.169925001442312), (8, 2.169925001442312)],
                         [(5, 3.169925001442312), (9, 3.169925001442312),
                          (10, 3.169925001442312)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # npn
        model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0), (7, 3.0),
                          (8, 1.8073549220576042)],
                         [(5, 2.0), (9, 3.0), (10, 3.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `normalize`
        # nnc
        model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 0.4082482905), (4, 0.4082482905),
                          (5, 0.4082482905), (6, 0.4082482905),
                          (7, 0.4082482905), (8, 0.4082482905)],
                         [(5, 0.81649658092772603), (9, 0.40824829046386302),
                          (10, 0.40824829046386302)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        model = tfidfmodel.TfidfModel(corpus,
                                      wlocal=lambda x: x,
                                      wglobal=lambda x, y: x * x,
                                      smartirs='nnc')

        transformed_docs = [model[docs[0]], model[docs[1]]]

        model = tfidfmodel.TfidfModel(corpus,
                                      wlocal=lambda x: x * x,
                                      wglobal=lambda x, y: x,
                                      smartirs='nnc')
        expected_docs = [model[docs[0]], model[docs[1]]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
Example #17
]
once_ids = [
    tokenid for tokenid, docfreq in dictionary.dfs.items()
    if docfreq < lower or docfreq > upper
]
dictionary.filter_tokens(
    stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print "taille du dictionnaire après diminution: %d" % len(dictionary)

## PROJECTION
texts = [doc.split(" ") for doc in reviews]
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = tf.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]
vecteurs = mtutils.corpus2csc(tfidf_corpus,
                              num_terms=len(dictionary),
                              num_docs=len(labels))
vecteurs = vecteurs.T

res = feat.chi2(vecteurs, labels)

ids = [index for index in np.where(res == min(res[0]))[1]]
dictionary.filter_tokens(ids)
dictionary.compactify()
print "taille du dictionnaire après Chi2 diminution: %d" % len(dictionary)

## PROJECTION
texts = [doc.split(" ") for doc in reviews]
Example #18
def getTfidfLsiSims(corpus, confId, confIdtoIndex, dictionary, outputDir):
    print(
        "Using gensim to get TFIDF vector and LSI vector for conferences in corpus "
    )
    #tfidf
    tfidf = tfidfmodel.TfidfModel(
        corpus)  # initialize a tfidf transformation for corpus
    corpus_tfidf = tfidf[corpus]  # get tfidf vectors
    #lsi
    lsi = lsimodel.LsiModel(
        corpus_tfidf, id2word=dictionary, num_topics=4
    )  # initialize an LSI transformation for corpus, with number of topics = 4
    corpus_lsi = lsi[corpus_tfidf]

    ####### not important, just printing
    print("Printing TF-IDF vectors in " + outputDir + '/conffTFIDF.txt')
    fTFIDFFile = open(outputDir + '/conffTFIDF.txt', 'w')
    j = 0
    for doc in corpus_tfidf:
        print(confId[j], doc, file=fTFIDFFile)
        j = j + 1
        if j % 100 == 0:
            print(j)
    fTFIDFFile.close()
    tfidf.save(outputDir + '/conftfidf.mod')

    #print "length of corpus is",len(corpus)

    printvectors = False
    if printvectors:
        i = 0
        for doc in corpus_tfidf:
            print("tfidf doc", confId[i], doc)
            i += 1

        i = 0
        for doc in corpus_lsi:
            print("lsi doc", confId[i], doc)
            i += 1
    ####### not important

    #compute similarity of corpus against itself
    listofMethods = ['corpus_lsi', 'corpus_tfidf']
    for method in listofMethods:
        if method == 'corpus_lsi':
            cor = corpus_lsi
        elif method == 'corpus_tfidf':
            cor = corpus_tfidf

        index = similarities.MatrixSimilarity(cor)
        confSims = dict()
        confSimsDict = dict()  # dictionary of [confId1][confId2]
        j = 0
        sims = []
        for vec_tfidf in cor:
            sims = index[vec_tfidf]
            sims = sorted(enumerate(sims), key=lambda item: -item[1])
            confSims[confId[j]] = sims  # this line isn't really needed
            confSimsDict[j] = dict(sims)
            #print "index: ",confIdtoIndex[confId[j]], "confId: ", confId[j], confSims[confId[j]]
            j += 1

        if method == 'corpus_lsi':
            cslsi = dict()
            for c1index in confSimsDict.keys():
                cslsi[confId[c1index]] = dict()
                for c2index in confSimsDict.keys():
                    cslsi[confId[c1index]][
                        confId[c2index]] = confSimsDict[c1index][c2index]

        elif method == 'corpus_tfidf':
            cstfidf = dict()
            for c1index in confSimsDict.keys():
                cstfidf[confId[c1index]] = dict()
                for c2index in confSimsDict.keys():
                    cstfidf[confId[c1index]][
                        confId[c2index]] = confSimsDict[c1index][c2index]

    return cstfidf, cslsi
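A design note on the similarity step above: `similarities.MatrixSimilarity` holds the entire index in RAM, which is fine for a modest number of conferences but does not scale; the shard-based `similarities.Similarity` is the usual substitute for larger corpora. A sketch of that swap (the toy corpus and the '/tmp/conf_index' shard prefix are assumptions):

from gensim import corpora, similarities
from gensim.models import tfidfmodel

texts = [["human", "computer", "interface"], ["survey", "user", "computer"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
corpus_tfidf = tfidfmodel.TfidfModel(corpus)[corpus]

# disk-backed alternative to MatrixSimilarity for large corpora
index = similarities.Similarity('/tmp/conf_index', corpus_tfidf,
                                num_features=len(dictionary))
for sims in index:  # one similarity row per document
    print(sorted(enumerate(sims), key=lambda item: -item[1]))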
Example #19
    def test_consistency(self):
        docs = [corpus[1], corpus[2]]

        # Test if `ntc` yields the default docs.
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc')
        transformed_docs = [model[docs[0]], model[docs[1]]]

        model = tfidfmodel.TfidfModel(self.corpus)
        expected_docs = [model[docs[0]], model[docs[1]]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `wlocal`
        # nnn
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
                         [(5, 6), (9, 3), (10, 3)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # lnn
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0),
                          (8, 2.0)], [(5, 6.0), (9, 3.0), (10, 3.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # ann
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0),
                          (8, 2.0)], [(5, 3.0), (9, 2.25), (10, 2.25)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # bnn
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
                         [(5, 3), (9, 3), (10, 3)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Lnn
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.4635792826230198), (4, 1.4635792826230198),
                          (5, 2.19536892393453), (6, 1.4635792826230198),
                          (7, 2.19536892393453), (8, 1.4635792826230198)],
                         [(5, 3.627141918134611), (9, 1.8135709590673055),
                          (10, 1.8135709590673055)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `wglobal`
        # ntn
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 2.1699250014423126), (4, 2.1699250014423126),
                          (5, 1.5849625007211563), (6, 2.1699250014423126),
                          (7, 1.5849625007211563), (8, 2.1699250014423126)],
                         [(5, 3.1699250014423126), (9, 1.5849625007211563),
                          (10, 1.5849625007211563)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # npn
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 1.8073549220576042), (4, 1.8073549220576042),
                          (5, 1.0), (6, 1.8073549220576042), (7, 1.0),
                          (8, 1.8073549220576042)],
                         [(5, 2.0), (9, 1.0), (10, 1.0)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        # Testing all the variations of `normalize`
        # nnc
        model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc')
        transformed_docs = [model[docs[0]], model[docs[1]]]
        expected_docs = [[(3, 0.34299717028501764), (4, 0.34299717028501764),
                          (5, 0.51449575542752646), (6, 0.34299717028501764),
                          (7, 0.51449575542752646), (8, 0.34299717028501764)],
                         [(5, 0.81649658092772603), (9, 0.40824829046386302),
                          (10, 0.40824829046386302)]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

        model = tfidfmodel.TfidfModel(self.corpus,
                                      wlocal=lambda x: x,
                                      wglobal=lambda x, y: x * x,
                                      smartirs='nnc')

        transformed_docs = [model[docs[0]], model[docs[1]]]

        model = tfidfmodel.TfidfModel(self.corpus,
                                      wlocal=lambda x: x * x,
                                      wglobal=lambda x, y: x,
                                      smartirs='nnc')
        expected_docs = [model[docs[0]], model[docs[1]]]

        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
Example #20
def tfidf_bow(corpus_bow):
    tfidf = tfidfmodel.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf[corpus_bow]
    return corpus_tfidf
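One caveat for this last helper: `tfidf[corpus_bow]` materializes nothing; it returns a lazy `TransformedCorpus` whose weights are recomputed on every pass. A small usage sketch (the toy corpus is an assumption):

from gensim.corpora import Dictionary

texts = [["human", "computer", "interface"], ["survey", "user", "computer"]]
dictionary = Dictionary(texts)
corpus_bow = [dictionary.doc2bow(text) for text in texts]

corpus_tfidf = tfidf_bow(corpus_bow)
print(list(corpus_tfidf))  # materialize once if repeated random access is needed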