Ejemplo n.º 1
0
    def testOnlineTransform(self):
        """Check that incremental LSI training matches one-shot training.

        An initially empty model is fed the corpus in three steps (one
        document, four more documents in chunks of two, then the remainder).
        After the first two steps the projection of the first corpus document
        is compared with hard-coded values; after the last step it is compared
        with a model trained on the whole corpus at once. Every comparison
        ignores the sign of the components.
        """
        documents = list(self.corpus)
        probe = documents[0]  # the corpus' first document is the test vector

        def project(lsi):
            # dense vector, for easier equality tests
            return matutils.sparse2full(lsi[probe], lsi.num_topics)

        # the reference model computes everything at once ...
        batch_model = lsimodel.LsiModel(corpus=documents, num_topics=5)
        # ... while the online model starts with no documents at all
        online_model = lsimodel.LsiModel(corpus=None, id2word=batch_model.id2word, num_topics=5)

        # step 1: a single document
        online_model.add_documents([documents[0]])
        vec = project(online_model)
        expected = np.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))

        # step 2: four extra documents, in chunks of two
        online_model.add_documents(documents[1:5], chunksize=2)
        vec = project(online_model)
        expected = np.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))

        # step 3: everything that is left
        online_model.add_documents(documents[5:])

        # the final projection must agree (up to sign) with the one-shot model
        vec1 = project(online_model)
        vec2 = project(batch_model)
        self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5))
 def testPersistence(self):
     """Save a 2-topic LSI model, load it back and compare both copies.

     The topic count and the projection's ``u`` and ``s`` arrays must survive
     the round trip.
     """
     # NOTE(review): this uses the camelCase `numTopics` keyword/attribute;
     # confirm it matches the gensim version this test targets.
     original = lsimodel.LsiModel(self.corpus, numTopics = 2)
     original.save(testfile())
     restored = lsimodel.LsiModel.load(testfile())
     self.assertEqual(original.numTopics, restored.numTopics)
     for part in ('u', 's'):
         self.assertTrue(numpy.allclose(getattr(original.projection, part),
                                        getattr(restored.projection, part)))
Ejemplo n.º 3
0
def getLsiModel(lsipath='./lsi/', num_topics=300):
    """Train an LSI model on the TF-IDF-weighted ``viva`` corpus and save it.

    Loads ``viva.dict`` and ``viva.mm`` from ``lsipath``, weights the corpus
    with TF-IDF, trains an LSI model on it and stores the model as
    ``viva.lsi`` in the same location.

    Parameters
    ----------
    lsipath : str
        Prefix prepended verbatim to the file names, so it must end with a
        path separator.
    num_topics : int
        Number of LSI topics to extract.

    Returns
    -------
    The LSI model if it could be constructed (even when saving it failed
    afterwards), otherwise ``None``. Training/saving failures are logged,
    not raised.
    """
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print('字典加载完毕')
    # load the bag-of-words corpus
    corpus = corpora.MmCorpus(lsipath +'viva.mm')
    print('mm load')

    t31 = time.time()

    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    # single pre-formatted argument so the output is the same on Python 2 and 3
    print("tfidf_corpus time =  %s" % (t32 - t31))

    lsi = None
    try:
        # all other parameters keep their defaults
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)
        lsi.save(lsipath  + 'viva.lsi')
        print('lsi模型保存完毕')
    except Exception:
        # SystemExit / KeyboardInterrupt are not Exception subclasses, so they
        # still propagate without an explicit re-raise clause.
        logging.error('Failed to lsi train', exc_info=True)
    return lsi
Ejemplo n.º 4
0
    def testLargeMmap(self):
        """Save with ``sep_limit=0`` and verify both a plain and an mmap load.

        Each reloaded model must agree with the original on the topic count,
        on the projection's ``u`` and ``s`` arrays, and on the projection of
        an empty vector.
        """
        reference = lsimodel.LsiModel(self.corpus, num_topics=2)

        # test storing the internal arrays into separate files
        reference.save(testfile(), sep_limit=0)

        empty_vec = []
        # first a regular load, then load the external arrays via mmap
        for load_kwargs in ({}, {'mmap': 'r'}):
            reloaded = lsimodel.LsiModel.load(testfile(), **load_kwargs)
            self.assertEqual(reference.num_topics, reloaded.num_topics)
            for part in ('u', 's'):
                self.assertTrue(numpy.allclose(getattr(reference.projection, part),
                                               getattr(reloaded.projection, part)))
            # try projecting an empty vector
            self.assertTrue(numpy.allclose(reference[empty_vec], reloaded[empty_vec]))
Ejemplo n.º 5
0
 def generateTopic(self,wordsLists, method=TopicMethod.LSI, numTopics=25):
     """Step 4: convert documents into dense topic-space feature vectors.

     Note (flx, 2018-04-07): with the LDA transformation the text-similarity
     results were poor, so LSI is the default; it gave good results.

     Parameters
     ----------
     wordsLists
         Input passed to ``self.generateBow``.
     method
         ``TopicMethod.LDA`` or ``TopicMethod.LSI``.
     numTopics : int
         Number of topics, which is also the length of every returned vector.

     Returns
     -------
     list of list
         One vector of length ``numTopics`` per document; topics missing from
         the model output stay 0.

     Raises
     ------
     ValueError
         If ``method`` is neither LDA nor LSI (previously this surfaced as an
         unbound-variable error further down).
     """
     if method == TopicMethod.LDA:
         model_class = ldamodel.LdaModel
     elif method == TopicMethod.LSI:
         model_class = lsimodel.LsiModel
     else:
         raise ValueError('unknown topic method: %r' % (method,))

     bowCorpus = self.generateBow(wordsLists)
     tfidfCorpus = self.generateTfidf(bowCorpus)
     instance = model_class(tfidfCorpus, id2word=self.dictionary, num_topics=numTopics)
     CacheUtil.dumpTopicModel(instance)

     # gensim yields each document as a sparse list of (topic_id, weight)
     # tuples, e.g. [(0, 0.12345), (2, 0.458124), ...], holding only the
     # non-zero entries; expand each one into a plain dense list.
     features = []
     for doc in instance[tfidfCorpus]:
         vector = [0] * numTopics
         for topic_id, weight in doc:
             vector[topic_id] = weight
         features.append(vector)
     return features
Ejemplo n.º 6
0
 def initialize(self, myid, dispatcher, **model_params):
     """Set up the worker's bookkeeping state and its inner LSI model.

     Parameters
     ----------
     myid
         Id of this worker in the dispatcher; kept only for convenient
         access/logging (TODO remove?).
     dispatcher
         Stored on the worker as ``self.dispatcher``.
     **model_params
         Keyword arguments forwarded unchanged to ``lsimodel.LsiModel``.
     """
     self.lock_update = threading.Lock()
     self.jobsdone = 0 # how many jobs has this worker completed?
     self.myid = myid
     self.dispatcher = dispatcher
     # pass the argument to the logger instead of pre-formatting, so the
     # message is only built when the record is actually emitted
     logger.info("initializing worker #%s", myid)
     self.model = lsimodel.LsiModel(**model_params)
Ejemplo n.º 7
0
    def finalize(self):
        """Finish processing: run the SVD if needed and append summary stats.

        Does nothing when ``self.model_exist`` is set. Otherwise, unless
        ``self.num_of_scans == 1`` (model already loaded from file), an LSI
        model is trained on ``self.vectors``, saved to
        ``self.model_file_name`` and ``self.vectors`` is replaced by the
        model's ``projection.u``. Finally ``self.mean`` and ``self.var`` are
        computed from ``self.sum``, ``self.sum_sq`` and ``self.n`` and
        appended to ``self.stat_filename``.

        Note that ``self.var`` ends up holding the square root of the sample
        variance.
        """
        if self.model_exist:
            return

        if self.num_of_scans == 1:
            print("Loaded the model from file.")
        else:
            print("Performing SVD...")

            x = Sparse2Corpus(self.vectors)
            lsi = lsimodel.LsiModel(corpus=x,
                                    id2word=None,
                                    num_topics=self.num_of_features)
            lsi.save(self.model_file_name)
            self.vectors = lsi.projection.u

            print("done.")

        # avoid a zero denominator in (n - 1) below
        if self.n <= 1:
            self.n = 2.0
        self.mean = self.sum / self.n
        variance = (self.sum_sq -
                    (self.sum * self.sum) / self.n) / (self.n - 1)
        # rounding can push a near-zero variance slightly below zero, which
        # would make math.sqrt raise; clamp before taking the root
        self.var = math.sqrt(max(variance, 0.0))

        lang_pair = self.src_language + self.trg_language
        # the context manager closes the file even if a write fails
        with open(self.stat_filename, 'a') as f:
            f.write("\n" + lang_pair + "\n")
            f.write("stats\t" + str(self.mean) + "\t" + str(self.var) + "\n")
Ejemplo n.º 8
0
 def testPersistence(self):
     """Round-trip a 2-topic LSI model through save()/load().

     The reloaded model must match the original in topic count, in the
     projection's ``u`` and ``s`` arrays and in the projection of an empty
     vector.
     """
     saved = lsimodel.LsiModel(self.corpus, num_topics=2)
     saved.save(testfile())
     loaded = lsimodel.LsiModel.load(testfile())
     self.assertEqual(saved.num_topics, loaded.num_topics)
     for part in ('u', 's'):
         self.assertTrue(numpy.allclose(getattr(saved.projection, part),
                                        getattr(loaded.projection, part)))
     empty_doc = []
     # try projecting an empty vector
     self.assertTrue(numpy.allclose(saved[empty_doc], loaded[empty_doc]))
Ejemplo n.º 9
0
 def testPersistenceCompressed(self):
     """Round-trip a 2-topic LSI model through a ``.gz`` file.

     The model is loaded back with ``mmap=None`` and must match the original
     in topic count, in the projection's ``u`` and ``s`` arrays and in the
     projection of an empty vector.
     """
     gz_path = testfile() + '.gz'
     saved = lsimodel.LsiModel(self.corpus, num_topics=2)
     saved.save(gz_path)
     loaded = lsimodel.LsiModel.load(gz_path, mmap=None)
     self.assertEqual(saved.num_topics, loaded.num_topics)
     for part in ('u', 's'):
         self.assertTrue(numpy.allclose(getattr(saved.projection, part),
                                        getattr(loaded.projection, part)))
     empty_doc = []
     # try projecting an empty vector
     self.assertTrue(numpy.allclose(saved[empty_doc], loaded[empty_doc]))
Ejemplo n.º 10
0
 def testTransform(self):
     """Project the first corpus document with a 2-topic LSI model.

     The absolute values of the dense projection are compared with
     hard-coded reference values.
     """
     # NOTE(review): `numTopics` and `matutils.doc2vec` are the names used
     # here; confirm they exist in the gensim version this test targets.
     lsi = lsimodel.LsiModel(self.corpus, numTopics = 2)

     first_doc = list(self.corpus)[0]
     # dense vector, for easier equality tests
     dense = matutils.doc2vec(lsi[first_doc], 2)

     # transformed entries must be equal up to sign
     self.assertTrue(numpy.allclose(abs(dense), [0.1973928, 0.05591352]))
Ejemplo n.º 11
0
    def testLargeMmapCompressed(self):
        """Save a 2-topic model to a ``.gz`` path with ``sep_limit=0``.

        Only the save step runs. The mmap-load assertion after the early
        ``return`` is deliberately unreachable: per the original author this
        test has no arrays to be mmap'ed, so the check would not exercise
        anything.
        """
        gz_path = testfile() + '.gz'
        lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

        # test storing the internal arrays into separate files
        lsi.save(gz_path, sep_limit=0)

        # disabled on purpose, see the docstring
        return

        self.assertRaises(IOError, lsimodel.LsiModel.load, gz_path, mmap='r')
Ejemplo n.º 12
0
 def testTransform(self):
     """Project the first corpus document with a 2-topic LSI model.

     The dense projection must equal the hard-coded scaled-LSI reference
     vector up to the sign of each component.
     """
     # NOTE(review): uses the camelCase `numTopics` keyword; confirm it
     # matches the gensim version this test targets.
     lsi = lsimodel.LsiModel(self.corpus, numTopics = 2)

     first_doc = list(self.corpus)[0]
     # dense vector, for easier equality tests
     dense = matutils.sparse2full(lsi[first_doc], 2)

     # scaled LSI version (the non-scaled one would be [-0.1973928, 0.05591352])
     reference = numpy.array([-0.6594664, 0.142115444])

     # transformed entries must be equal up to sign
     self.assertTrue(numpy.allclose(abs(dense), abs(reference)))
Ejemplo n.º 13
0
 def testCorpusTransform(self):
     """Test lsi[corpus] transformation.

     Projects every document of ``self.corpus`` with a 2-topic model and
     compares the stacked dense vectors with reference values, ignoring the
     sign of each component.
     """
     model = lsimodel.LsiModel(self.corpus, num_topics=2)
     # Transform the fixture corpus the model was trained on (a bare `corpus`
     # name is not defined in this method). numpy.vstack needs a real
     # sequence, so build a list rather than passing a generator.
     got = numpy.vstack([matutils.sparse2full(doc, 2) for doc in model[self.corpus]])
     expected = numpy.array([
         [ 0.65946639,  0.14211544],
         [ 2.02454305, -0.42088759],
         [ 1.54655361,  0.32358921],
         [ 1.81114125,  0.5890525 ],
         [ 0.9336738 , -0.27138939],
         [ 0.01274618, -0.49016181],
         [ 0.04888203, -1.11294699],
         [ 0.08063836, -1.56345594],
         [ 0.27381003, -1.34694159]])
     self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign
Ejemplo n.º 14
0
    def testTransform(self):
        """Test lsi[vector] transformation.

        First checks the model's two singular values against a full SVD of
        the dense corpus matrix, then checks the projection of the first
        corpus document against a reference vector (up to sign).
        """
        lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

        # make sure the decomposition is accurate enough
        dense_corpus = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
        _, singular_values, _ = scipy.linalg.svd(dense_corpus, full_matrices=False)
        # singular values must match
        self.assertTrue(numpy.allclose(singular_values[:2], lsi.projection.s))

        # transform one document
        first_doc = list(self.corpus)[0]
        # dense vector, for easier equality tests
        dense = matutils.sparse2full(lsi[first_doc], 2)
        # scaled LSI version (the non-scaled one would be [-0.1973928, 0.05591352])
        reference = numpy.array([-0.6594664, 0.142115444])
        # transformed entries must be equal up to sign
        self.assertTrue(numpy.allclose(abs(dense), abs(reference)))
Ejemplo n.º 15
0
def lsi(corpus, num_topics, tfidf=False):
    """Group document indices by their highest-weighted LSI topic.

    Builds a dictionary and bag-of-words vectors from ``corpus``, optionally
    re-weights them with ``tfidf_bow``, trains an LSI model and assigns each
    document to the topic with the largest (signed) weight.

    Returns a dict mapping every topic number in ``range(num_topics)`` to the
    list of document indices assigned to it. Documents whose projection is
    empty are not assigned anywhere.
    """
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    if tfidf:
        bows = tfidf_bow(bows)
    lsi_model = lsimodel.LsiModel(corpus=bows, id2word=dictionary, num_topics=num_topics)
    projected = lsi_model[bows]

    topics = dict((topic, []) for topic in range(num_topics))
    for doc_index in range(len(bows)):
        weights = dict(projected[doc_index])
        if not weights:
            continue
        # on ties the first topic with the maximal weight wins
        dominant = max(weights, key=weights.get)
        topics[dominant].append(doc_index)
    return topics
Ejemplo n.º 16
0
    def create_model(self):
        """Load the topic model and transformed corpus, or build and cache them.

        If ``self.model_file`` exists, the transformed corpus is read from
        ``self.corpora_file`` and the model is loaded from ``self.model_file``.
        Otherwise a model is trained on ``self.corpus`` and saved, and the
        transformed corpus is serialized to ``self.corpora_file``. An LSI
        model is used when ``self.model_name == 'LSI'``, LDA otherwise.
        """
        if os.path.isfile(self.model_file):
            # cached artefacts are available: just read them back
            self.corpora = gensim.corpora.MmCorpus(self.corpora_file)
            if self.model_name == 'LSI':
                model_class = gensim.models.LsiModel
            else:
                model_class = gensim.models.LdaModel
            self.model = model_class.load(self.model_file)
            return

        # nothing cached yet: train, persist, then transform the corpus
        if self.model_name == 'LSI':
            trainer = lsimodel.LsiModel
        else:
            trainer = ldamodel.LdaModel
        self.model = trainer(corpus=self.corpus,
                             id2word=self.dictionary,
                             num_topics=self.num_topics)
        self.model.save(self.model_file)

        self.corpora = self.model[self.corpus]
        corpora.MmCorpus.serialize(self.corpora_file, self.corpora)
def lda_keyWords(cat):
    """Fit LSI and LDA models on all text of one category and plot them.

    Reads ``train_set.csv``, concatenates the tokens (via ``splitWords``) of
    every row whose ``label`` equals ``cat`` into a single document, trains
    an LSI and an LDA model on that one-document corpus and hands each model
    to ``wc_lsi`` (with flag 0 for LSI and 1 for LDA).
    """
    frame = pd.read_csv('train_set.csv', encoding="utf_8_sig")
    tokens = []
    for label, content in zip(frame['label'], frame['content']):
        if label == cat:
            tokens.extend(splitWords(content))

    # the whole category is treated as one document
    documents = [tokens]
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    lsi = lsimodel.LsiModel(corpus, id2word=dictionary)
    lda = ldamodel.LdaModel(corpus, id2word=dictionary)

    wc_lsi(cat, lsi, 0)
    wc_lsi(cat, lda, 1)
    def initialize(self, myid, dispatcher, **model_params):
        """Prepare this worker for use.

        Parameters
        ----------
        myid : int
            Number identifying this worker inside the dispatcher object.
        dispatcher : :class:`~gensim.models.lsi_dispatcher.Dispatcher`
            Dispatcher that schedules work for this worker.
        **model_params
            Passed through to :class:`~gensim.models.lsimodel.LsiModel` when the inner model is created.

        """
        # identity: kept only as a convenience for access/logging (TODO remove?)
        self.myid = myid
        self.dispatcher = dispatcher

        # bookkeeping
        self.jobsdone = 0  # number of jobs this worker has completed
        self.finished = False
        self.lock_update = threading.Lock()

        logger.info("initializing worker #%s", myid)
        self.model = lsimodel.LsiModel(**model_params)
Ejemplo n.º 19
0
    def do_after_a_full_scan(self, num_of_finished_scans):
        """Run the post-processing step that follows a completed scan.

        * After the first scan of a three-scan run: count the collected
          words, keep those seen at least ``self.min_count`` times (numbered
          consecutively), allocate the word-by-TU matrix, reset
          ``self.number_of_tus`` and append the word/index table to
          ``self.dict_file_name``.
        * After the second scan: train an LSI model on ``self.vectors``, save
          it to ``self.model_file_name`` and replace ``self.vectors`` with the
          model's ``projection.u``.
        * Otherwise only a separator line is printed.

        Parameters
        ----------
        num_of_finished_scans : int
            Number of scans completed so far.
        """
        # First iteration of a normal run (collecting the vocabulary)
        if num_of_finished_scans == 1 and self.num_of_scans == 3:
            self.vocab = Counter(self.all_words)

            # frequent words get consecutive row indices, in vocabulary order
            self.all_words = {}
            for word, count in self.vocab.items():
                if count >= self.min_count:
                    self.all_words[word] = len(self.all_words)

            self.vectors = lil_matrix(
                (len(self.all_words), self.number_of_tus), dtype=np.int8)

            print("-#-#-#-#-#-#-#-#-#-#-#-")
            print("size of vocab:", len(self.vocab))
            print("size of common words:", len(self.all_words))
            print("number of TUs:", self.number_of_tus)
            self.number_of_tus = 0

            # the context manager closes the file even if a write fails
            with open(self.dict_file_name, "a+") as f:
                for w, index in self.all_words.items():
                    f.write(w)
                    f.write("\t" + str(index) + "\n")

        # Second iteration of a normal run (making the tu-word matrix)
        elif num_of_finished_scans == 2:
            print("Performing SVD...")

            x = Sparse2Corpus(self.vectors)
            lsi = lsimodel.LsiModel(corpus=x,
                                    id2word=None,
                                    num_topics=self.num_of_features)
            lsi.save(self.model_file_name)
            self.vectors = lsi.projection.u

            print("done.")
        else:
            print("-#-#-#-#-#-#-#-#-#-#-#-")
Ejemplo n.º 20
0
	def train(self, filepath, dict_path, docs, num_topics = 5, chunksize = 2000):
		"""Load the LSI model from ``filepath`` or train and save a new one.

		When no saved model exists, the dictionary is loaded from
		``dict_path`` (or built from ``docs`` and saved there, recording
		``self.dict_time``), the documents are converted to bag-of-words and
		an LSI model is trained, saved to ``filepath`` and timed in
		``self.model_time``.

		Returns ``self``.
		"""
		if path.exists(filepath):
			LOGGER.info('Model already exists...load model')
			self._inner_model = lsimodel.LsiModel.load(filepath)
			return self

		start = time.time()
		# materialise the documents once; they are iterated more than once below
		clean_docs = list(docs)
		if not path.exists(dict_path):
			self._dict = corpora.Dictionary(clean_docs)
			self._dict.save(dict_path)
			self.dict_time = (time.time() - start)
		else:
			LOGGER.info('Dictionary already exists...loading dictionary')
			self._dict = corpora.Dictionary.load(dict_path)

		bow_corpus = [self._dict.doc2bow(doc) for doc in clean_docs]
		self._inner_model = lsimodel.LsiModel(
			bow_corpus,
			num_topics=num_topics,
			id2word=self._dict,
			chunksize=chunksize)
		self._inner_model.save(filepath)
		self.model_time = (time.time() - start)
		return self
Ejemplo n.º 21
0
def NLP_process(df,
                dictionary=None,
                post_lsi=None,
                title_lsi=None,
                num_lsi_topics=None,
                use_timer=True):
    """Run the NLP pre-processing pipeline on a frame of posts.

    Tokenizes ``df.selftext`` and ``df.title``, vectorizes both with
    ``dictionary``, adds word-count columns and LSI feature columns for posts
    and titles, and drops the raw text columns.

    Parameters
    ----------
    df
        Frame with ``selftext`` and ``title`` columns.
    dictionary
        Dictionary used for vectorizing; created from the posts and titles
        when not given.
    post_lsi, title_lsi
        LSI models for posts and titles; each is trained here when not given.
    num_lsi_topics
        Number of LSI topics, passed to the models and to
        ``ComputeDocumentLSIs``.
    use_timer : bool
        When true, a ``SimpleTimer`` reports the elapsed time of each stage.

    Returns
    -------
    tuple
        ``(df_new, dictionary, post_lsi, title_lsi)``.
    """
    from gensim.models import lsimodel

    if use_timer:
        my_timer = SimpleTimer()
    posts_tokenized = ProcessText(df.selftext)
    if use_timer:
        my_timer.elapsed('Processed Posts')

    titles_tokenized = ProcessText(df.title)
    if use_timer:
        my_timer.elapsed('Processed Titles')

    if not dictionary:
        dictionary = CreateCorpusDictionary(posts_tokenized + titles_tokenized)
        if use_timer:
            my_timer.elapsed('Created Dictionary')

    posts_vec = Vectorize_text(posts_tokenized, dictionary)
    titles_vec = Vectorize_text(titles_tokenized, dictionary)
    print(len(titles_vec), df.shape)
    df_new = df.copy()
    df_new = df_new.assign(post_word_len2=[len(post) for post in posts_vec])
    df_new = df_new.assign(title_word_len2=[len(post) for post in titles_vec])

    df_new = df_new[sorted(df_new.columns)]

    if use_timer:
        my_timer.elapsed('Vectorized')

    if not post_lsi:
        post_lsi = lsimodel.LsiModel(posts_vec,
                                     num_topics=num_lsi_topics,
                                     id2word=dictionary)
    if not title_lsi:
        title_lsi = lsimodel.LsiModel(titles_vec,
                                      num_topics=num_lsi_topics,
                                      id2word=dictionary)
        # my_timer only exists when use_timer is set; the unguarded call
        # raised NameError for use_timer=False
        if use_timer:
            my_timer.elapsed('Trained LSI')

    post_lsi_features = ComputeDocumentLSIs(posts_vec,
                                            post_lsi,
                                            num_lsi_topics,
                                            label_base='post_lsi')
    if use_timer:
        my_timer.elapsed('Computed Post LSIs')
    title_lsi_features = ComputeDocumentLSIs(titles_vec,
                                             title_lsi,
                                             num_lsi_topics,
                                             label_base='title_lsi')
    if use_timer:
        my_timer.elapsed('Computed Title LSIs')

    # align the feature frames with df_new so the joins below match row by row
    post_lsi_features = post_lsi_features.set_index(df_new.index)
    title_lsi_features = title_lsi_features.set_index(df_new.index)

    df_new = df_new.join(post_lsi_features)
    df_new = df_new.join(title_lsi_features)
    df_new = df_new.drop(['selftext', 'title'], axis=1)

    if use_timer:
        my_timer.elapsed('Completed {} records'.format(len(df_new)))

    return (df_new, dictionary, post_lsi, title_lsi)
Ejemplo n.º 22
0
 def setUp(self):
     """Load the ``testcorpus.mm`` fixture and train a 2-topic LSI model on it."""
     corpus_path = datapath('testcorpus.mm')
     self.corpus = mmcorpus.MmCorpus(corpus_path)
     self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))
    
    corpus = MmCorpus(config.resultFile('bow.mm'))

    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    elif method == 'rp':
        # first, transform word counts to tf-idf weights
        tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        # then find the transformation from tf-idf to latent space
        model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP)
        model.save(config.resultFile('model_rp.pkl'))
    else:
        raise ValueError('unknown topic extraction method: %s' % repr(method))
    
    MmCorpus.saveCorpus(config.resultFile('corpus_%s.mm' % method), model[corpus])
            
    logging.info("finished running %s" % program)

Ejemplo n.º 24
0
# Rescale the ratings column with minmax_scale
# (presumably sklearn.preprocessing imported as `sk`; verify against the imports).
ratings_df.loc[:, 'rating'] = sk.minmax_scale(ratings_df.loc[:, 'rating'])
print(ratings_df.loc[:, 'rating'])
print(ratings_df)
print(ratings_df.head())

# user x book rating table, 0 where a user has not rated a book.
# DataFrame.to_sparse() and .as_matrix() were removed from pandas; a plain
# dense frame and `.values` give the same dense array on old and new versions.
R_df = ratings_df.pivot(index='user_id', columns='book_id',
                        values='rating').fillna(0)
print(R_df.head())

R = R_df.values
if not np.isinf(R).all():
    print("tr")

# documents_columns=True: every column of R is treated as one document
Z = gensim.matutils.Dense2Corpus(R, documents_columns=True)
print(Z)

lsi = ls.LsiModel(Z, num_topics=3)
print("Sigma")

print(lsi.projection.s)
print("U")

print(lsi.projection.u)
print("VT")
# project every document, transpose to (documents x topics) and divide each
# topic column by its singular value
V = gensim.matutils.corpus2dense(lsi[Z], len(
    lsi.projection.s)).T / lsi.projection.s
print(V)
Ejemplo n.º 25
0
    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = corpora.DmlCorpus.loadDictionary(
        config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    if method == 'tfidf':
        corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('tfidfmodel.pkl'))
    elif method == 'lda':
        corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
        model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
        model.save(config.resultFile('ldamodel%i.pkl' % DIM_LDA))
    elif method == 'lsi' or method == 'lsa':
        # first, transform word counts to tf-idf weights
        corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf.apply(corpus),
                                  id2word=id2word,
                                  numTopics=DIM_LSI)
        model.save(config.resultFile('lsimodel%i.pkl' % DIM_LSI))
    elif method == 'rp':
        raise NotImplementedError(
            "Random Projections not converted to the new interface yet")
    else:
        raise ValueError('unknown topic extraction method: %s' % repr(method))

    logging.info("finished running %s" % program)
Ejemplo n.º 26
0
Archivo: lsa.py Proyecto: mtlynch/Log
    if use_pickle:
        results = useThreads()
        dictionary = corpora.Dictionary(results)
        print(dictionary)
        dictionary.filter_extremes()
        print(dictionary)
        corpus = [dictionary.doc2bow(text) for text in results]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        with open('models//tfidf_corpus.pickle', 'wb') as output:
            pickle.dump(corpus_tfidf, output, pickle.HIGHEST_PROTOCOL)
        with open('models//dictionary.pickle', 'wb') as output:
            pickle.dump(dictionary, output, pickle.HIGHEST_PROTOCOL)
    else:
        with open('models//tfidf_corpus.pickle', 'rb') as input:
            corpus_tfidf = pickle.load(input)
        with open('models//dictionary.pickle', 'rb') as input:
            dictionary = pickle.load(input)
    lsimodel = lsimodel.LsiModel(corpus_tfidf,
                                 id2word=dictionary,
                                 num_topics=300)
    corpus_lsi = lsimodel[corpus_tfidf]
    #    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=300, update_every=1, chunksize=10000, passes=1)
    #    lda.save("models//lda.pickle")
    # hdp = models.hdpmodel.HdpModel(corpus_tfidf, id2word=dictionary)
    # hdp.save("models//hdp.pickle")
    # hdp.update_expectations()
    # hdpformatter = models.hdpmodel.HdpTopicFormatter(hdp.id2word,hdp.m_lambda+hdp.m_eta)
    # pprint(hdpformatter.show_topics(topics=-1, topn=20))
    print(time.time() - start_time, "seconds")
Ejemplo n.º 27
0
from collections import Counter

# lemmatize the words of fileIN, skipping stop words and words of 3 characters or fewer
txt = [[
    lemm.lemmatize(unicode(word, 'utf-8')) for word in d.lower().split()
    if (word not in stop and len(word) > 3)
] for d in fileIN]
# flatten all documents into one token list (linear, unlike sum(txt, []))
all_tokens = [word for text in txt for word in text]
# set of all tokens of fileIN that occur fewer than 2 times; counting once
# with Counter avoids calling list.count() for every distinct token
token_counts = Counter(all_tokens)
tokens_once = set(word for word, count in token_counts.items() if count < 2)
# keep only the words that occur more than once
texts = [[word for word in text if word not in tokens_once] for text in txt]

dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]
# num_topics here is the number of groups to extract
lsi = lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=20)

if len(fileIN) > 1:
    tfidf = models.TfidfModel(corpus)
    doctfidf = tfidf[corpus]
    # second LSI model, trained on the TF-IDF-weighted corpus with 10 topics
    lsit = lsimodel.LsiModel(doctfidf, id2word=dictionary, num_topics=10)

# write every topic of the first model to fileOut and remember it by index
dd = dict()
for i in range(0, lsi.num_topics):
    topic = lsi.print_topic(i)
    fileOut.write(topic + '\n')
    dd[i] = topic
Ejemplo n.º 28
0
def build_model(dictionary_path, mm_corpus_path, num_topics=400,
                output_path='/home/andre/Develop/corpora/lsamodel_lsi.model'):
    """Train an LSI model from a saved dictionary and corpus, and save it.

    Parameters
    ----------
    dictionary_path : str
        Text-format dictionary, read with ``Dictionary.load_from_text``.
    mm_corpus_path : str
        Matrix Market corpus. Use the tf-idf corpus here, not the original
        bag-of-words one.
    num_topics : int
        Number of LSI topics (defaults to the previously hard-coded 400).
    output_path : str
        Where the trained model is saved (defaults to the previously
        hard-coded location).

    Returns
    -------
    The trained LSI model.
    """
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    lsi = lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=num_topics)
    lsi.save(output_path)
    return lsi
Ejemplo n.º 29
0
# Normalize the ratings into [0, 1] with min-max scaling: new x = (x - min) / (max - min).
# 1 becomes 0, 2 becomes 0.25, 3 becomes 0.5, 4 becomes 0.75, 5 becomes 1.
ratings_dataset.loc[:,
                    'rating'] = sk.minmax_scale(ratings_dataset.loc[:,
                                                                    'rating'])
# Print the first five rows after normalization
print(ratings_dataset.head())
# Reshape the data into a user x song pivot table, 0 where there is no rating.
# DataFrame.to_sparse() and .as_matrix() were removed from pandas; a plain
# dense frame and `.values` give the same dense array on old and new versions.
R = ratings_dataset.pivot(index='user_id', columns='song_id',
                          values='rating').fillna(0)
print(R.head())
# Dense numpy array of the pivot table
R_matrix = R.values

# Treat the dense array as a streamed gensim corpus in BoW format
# (documents_columns=True: every column is one document).
R_corpus = gensim.matutils.Dense2Corpus(R_matrix, documents_columns=True)
print(R_corpus)
# Fast truncated SVD
lsi = ls.LsiModel(R_corpus, num_topics=3)

print("Sigma Matrix (Singular Values) :\n")
print(lsi.projection.s)

print("U Matrix : \n")
print(lsi.projection.u)

print("V Transpose Matrix :\n")
# NOTE(review): after the transpose this array is (documents x topics), which
# looks like V rather than V transpose; confirm the label and name.
VT = gensim.matutils.corpus2dense(lsi[R_corpus], len(
    lsi.projection.s)).T / lsi.projection.s
print(VT)
Ejemplo n.º 30
0
def getTfidfLsiSims(corpus, confId, confIdtoIndex, dictionary, outputDir):
    """Compute pairwise conference similarities in TF-IDF and LSI space.

    Builds a TF-IDF model and a 4-topic LSI model (on the TF-IDF vectors),
    dumps the TF-IDF vectors to ``<outputDir>/conffTFIDF.txt``, saves the
    TF-IDF model to ``<outputDir>/conftfidf.mod`` and computes the similarity
    of every document against every other document in both spaces.

    Parameters
    ----------
    corpus
        Bag-of-words corpus, one document per conference.
    confId
        Sequence mapping a document's position in ``corpus`` to its
        conference id.
    confIdtoIndex
        Unused; kept so existing callers keep working.
    dictionary
        id-to-word mapping handed to the LSI model.
    outputDir : str
        Directory for the output files.

    Returns
    -------
    tuple
        ``(cstfidf, cslsi)``: two dicts of dicts with
        ``result[confId1][confId2]`` = similarity, for TF-IDF and LSI.
    """
    print(
        "Using gensim to get TFIDF vector and LSI vector for conferences in corpus "
    )
    # initialize a tfidf transformation for the corpus and get tfidf vectors
    tfidf = tfidfmodel.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # LSI transformation on top of the tfidf vectors, with 4 topics
    lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=4)
    corpus_lsi = lsi[corpus_tfidf]

    print("Printing TF-IDF vectors in " + outputDir + '/conffTFIDF.txt')
    # explicit write() works on Python 2 and 3 (unlike `print >> file`), and
    # the context manager makes sure the file is closed
    with open(outputDir + '/conffTFIDF.txt', 'w') as fTFIDFFile:
        for j, doc in enumerate(corpus_tfidf):
            fTFIDFFile.write("%s %s\n" % (confId[j], doc))
            if (j + 1) % 100 == 0:
                print(j + 1)  # progress indicator
    tfidf.save(outputDir + '/conftfidf.mod')

    # compute similarity of the corpus against itself, in both spaces
    cslsi = _pairwise_similarities(corpus_lsi, confId)
    cstfidf = _pairwise_similarities(corpus_tfidf, confId)

    return cstfidf, cslsi


def _pairwise_similarities(vectors, confId):
    """Return ``{confId[i]: {confId[k]: similarity(i, k)}}`` for all documents in ``vectors``."""
    index = similarities.MatrixSimilarity(vectors)
    result = dict()
    for j, vec in enumerate(vectors):
        # index[vec] holds the similarity of document j to every indexed document
        result[confId[j]] = dict(
            (confId[k], sim) for k, sim in enumerate(index[vec]))
    return result