Example #1
    def testOnlineTransform(self):
        corpus = list(self.corpus)
        doc = corpus[0]  # use the corpus' first document for testing

        # create the transformation model
        model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)  # compute everything at once
        model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)  # start with no documents, we will add them later

        # train model on a single document
        model.add_documents([corpus[0]])

        # transform the testing document with this partial transformation
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
        expected = np.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

        # train on another 4 documents
        model.add_documents(corpus[1:5], chunksize=2)  # train on 4 extra docs, in chunks of 2 documents, to exercise chunked updates

        # transform a document with this partial transformation
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
        expected = np.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

        # train on the rest of documents
        model.add_documents(corpus[5:])

        # make sure the final transformation is the same as if we had decomposed the whole corpus at once
        vec1 = matutils.sparse2full(model[doc], model.num_topics)
        vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
        self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5))  # the two LSI representations must equal up to sign
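
For reference, a minimal self-contained sketch of the same incremental pattern (toy corpus and illustrative names only, not taken from the test above):

from gensim import corpora, matutils
from gensim.models import lsimodel

texts = [["human", "computer", "interface"], ["graph", "trees"], ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

online = lsimodel.LsiModel(corpus=None, id2word=dictionary, num_topics=2)  # no documents yet
online.add_documents(bow_corpus[:1])                 # train on the first document
online.add_documents(bow_corpus[1:], chunksize=1)    # then add the rest incrementally

dense = matutils.sparse2full(online[bow_corpus[0]], online.num_topics)  # dense 2-dim projection
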
    def testFull(self, num_best=None, shardsize=100):
        if self.cls == similarities.Similarity:
            index = self.cls(None, corpus, num_features=len(dictionary), shardsize=shardsize)
        else:
            index = self.cls(corpus, num_features=len(dictionary))
        if isinstance(index, similarities.MatrixSimilarity):
            expected = numpy.array([
                [0.57735026, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.40824831, 0.0, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.0, 0.0, 0.0, 0.0],
                [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.40824831, 0.0, 0.0, 0.0, 0.81649661, 0.0, 0.40824831, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1., 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026],
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026],
                ], dtype=numpy.float32)
            # HACK: dictionary can be in different order, so compare in sorted order
            self.assertTrue(numpy.allclose(sorted(expected.flat), sorted(index.index.flat)))
        index.num_best = num_best
        query = corpus[0]
        sims = index[query]
        expected = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)][ : num_best]

        # convert sims to full numpy arrays, so we can use allclose() and ignore
        # ordering of items with the same similarity value
        expected = matutils.sparse2full(expected, len(index))
        if num_best is not None: # when num_best is None, sims is already a numpy array
            sims = matutils.sparse2full(sims, len(index))
        self.assertTrue(numpy.allclose(expected, sims))
        if self.cls == similarities.Similarity:
            index.destroy()
Example #3
def ratejobs(jobs,useremail):
  global dict,corpus,tfidf,lsimodel,jobid2id,id2jobid
  reviews = list(db.reviews.find({'useremail': useremail}))
  labels = []
  reviewrowids = []
  # parse ratings for jobs present in the corpus; skip reviews whose rating is not numeric
  for review in reviews:
    if review['jobid'] in jobid2id:
      try:
        labels.append(float(review['rating']))
        reviewrowids.append(jobid2id[review['jobid']])
      except (TypeError, ValueError):
        print(review)

  if len(reviewrowids) == 0:
    return {jobid:-1 for jobid in jobs}
  else:
    samples = [lsimodel[tfidf[corpus[rowid]]] for rowid in reviewrowids]
    # can be linear, rbf with gamma, or poly with degree
    svmmodel = svm.SVR(kernel='rbf', C=1e3, gamma=0.01)

    # train our svm with the labeled data
    svmmodel.fit([matutils.sparse2full(sample,300) for sample in samples],labels)

    # now run the svm model over the jobs to find a rating for each job
    airatings = {}
    for jobid in jobs:
      if jobid in jobid2id:
        airatings[jobid] = (svmmodel.predict([matutils.sparse2full(lsimodel[tfidf[corpus[jobid2id[jobid]]]],300)])).item(0)
      else:
        airatings[jobid] = -1

    return airatings
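
A rough sketch of the bridging trick used above: sparse2full turns gensim's variable-length (dimension, value) vectors into fixed-length dense features that scikit-learn can consume. The sample vectors and ratings below are made up for illustration:

from gensim import matutils
from sklearn import svm
import numpy as np

samples = [[(0, 0.4), (2, -0.1)], [(1, 0.9)], [(0, 0.2), (1, 0.3)]]  # hypothetical sparse LSI vectors
labels = [3.0, 5.0, 4.0]                                             # hypothetical ratings

X = np.array([matutils.sparse2full(s, 3) for s in samples])  # fixed-length dense features
reg = svm.SVR(kernel='rbf', C=1e3, gamma=0.01).fit(X, labels)
print(reg.predict([matutils.sparse2full([(2, 0.5)], 3)]))
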
Example #4
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray(
                [matutils.sparse2full(vec, self.num_features) for vec in query],
                dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray()  # convert sparse to dense
            elif isinstance(query, numpy.ndarray):
                pass
            else:
                # default case: query is a single vector in sparse gensim format
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T  # return #queries x #index
        return result  # XXX: removed casting the result from array to list; does anyone care?
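
For reference, sparse2full simply expands a gensim-style sparse vector of (feature_id, weight) pairs into a zero-padded dense numpy array of the requested length; a minimal illustration:

from gensim import matutils

vec = [(0, 0.5), (3, 0.25)]           # sparse gensim format: (feature_id, weight)
dense = matutils.sparse2full(vec, 5)  # float32 array: [0.5, 0.0, 0.0, 0.25, 0.0]
print(matutils.full2sparse(dense))    # round-trips back to the nonzero (id, weight) pairs
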
Example #5
    def testSerialized(self):
        # Test the model using serialized corpora. Basic tests, plus test of update functionality.

        model = self.class_(
            self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
            serialized=True, serialization_path=datapath('testcorpus_serialization.mm')
        )

        jill_topics = model.get_author_topics('jill')
        jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
        self.assertTrue(all(jill_topics > 0))

        model.update()
        jill_topics2 = model.get_author_topics('jill')
        jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)

        # Did we learn more about Jill?
        self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

        model.update(corpus_new, author2doc_new)

        # Did we learn something about Sally?
        sally_topics = model.get_author_topics('sally')
        sally_topics = matutils.sparse2full(sally_topics, model.num_topics)
        self.assertTrue(all(sally_topics > 0))

        # Delete the MmCorpus used for serialization inside the author-topic model.
        remove(datapath('testcorpus_serialization.mm'))
    def test_random_seed(self):
        if not self.mallet_path:
            return

        # test that 2 models created with the same random_seed are equal in their topics treatment
        SEED = 10
        NUM_TOPICS = 10
        ITER = 500

        tm1 = ldamallet.LdaMallet(
            self.mallet_path,
            corpus=corpus,
            num_topics=NUM_TOPICS,
            id2word=dictionary,
            random_seed=SEED,
            iterations=ITER,
        )

        tm2 = ldamallet.LdaMallet(
            self.mallet_path,
            corpus=corpus,
            num_topics=NUM_TOPICS,
            id2word=dictionary,
            random_seed=SEED,
            iterations=ITER,
        )
        self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics))

        for doc in corpus:
            tm1_vector = matutils.sparse2full(tm1[doc], NUM_TOPICS)
            tm2_vector = matutils.sparse2full(tm2[doc], NUM_TOPICS)

            self.assertTrue(np.allclose(tm1_vector, tm2_vector))
Example #7
def topics_hellinger(text1, text2):
    token1 = [i for i in jieba.cut(text1, cut_all=True)]
    token2 = [i for i in jieba.cut(text2, cut_all=True)]
    lda_vec1 = lda[dic.doc2bow(token1)]
    lda_vec2 = lda[dic.doc2bow(token2)]
    dense1 = matutils.sparse2full(lda_vec1, lda.num_topics)
    dense2 = matutils.sparse2full(lda_vec2, lda.num_topics)
    sim = np.sqrt(0.5 * ((np.sqrt(dense1) - np.sqrt(dense2)) ** 2).sum())
    return sim
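
Note that the value returned above is a distance (0 for identical topic distributions), despite the variable name sim. Recent gensim versions also expose this metric as matutils.hellinger; a small sketch of the equivalence, using toy distributions for illustration only:

from gensim import matutils
import numpy as np

p = matutils.sparse2full([(0, 0.7), (1, 0.3)], 3)
q = matutils.sparse2full([(0, 0.2), (2, 0.8)], 3)
manual = np.sqrt(0.5 * ((np.sqrt(p) - np.sqrt(q)) ** 2).sum())
print(manual, matutils.hellinger(p, q))   # both evaluate to the same Hellinger distance
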
    def temporal_weekday_single_ven(self, ven_id):
        """
        Splits ven_id's shouts into weekday bins and measures the Hellinger distance
        between each bin's topic distribution and the venue's overall distribution.

        :param ven_id: ID of venue to split
        :type ven_id: str
        :return: 2-tuple (mean distance, standard deviation)
        :rtype: 2-tuple (float, float)
        """
        split_ven_topics = {}
        ven_weekdays = sq.split_weekdays(ven_id)

        # get inferred topic distribution for each split bin
        for iso_day in range(1, 8):
            try:
                # tokenize shouts, making a list of tokens for each split
                word_list = []
                for ven_shout in ven_weekdays[iso_day]:
                    word_list.extend(tokenize(ven_shout, self.corpus_type))
                # turn list of tokens into BOW format
                bow = self.cor.dictionary.doc2bow(word_list)
                # infer topic distribution for the split, store in split_ven_topics{}
                split_ven_topics[iso_day] = self.hdp[bow]
            except KeyError:
                split_ven_topics[iso_day] = []

        ven_name = (sq.get_ven_by_id(ven_id)).name
        ven_names = [u'{} ({})'.format(ven_name, ISO2DAY[iso_day]) for iso_day in range(1, 8)] + [ven_name]

        # make np.array of Hellinger distances between venue and splits
        distances = []
        # get dense vector representation of venue
        ven_vec = self.hdp.get_document_topics(self.cor[self.ven_id2i[ven_id]])
        ven_vec_dense = matutils.sparse2full(ven_vec, self.num_topics)
        # distances[0] = venue vs. venue
        distances.append(hellinger_distance(ven_vec_dense, ven_vec_dense))

        # print nearest neighbors for venue
        print(u'{} - nearest neighbors:'.format(ven_name))
        self.print_nn(ven_vec)

        # distances[i] = venue vs. iso_day i
        for key in range(1, 8):
            split_vec = split_ven_topics[key]
            split_vec_dense = matutils.sparse2full(split_vec, self.num_topics)
            distances.append(hellinger_distance(ven_vec_dense, split_vec_dense))
            # print nearest neighbors for split
            print(u'{} ({}) nearest neighbors:'.format(ven_name, ISO2DAY[key]))
            self.print_nn(split_vec)
        # convert distances into numpy array
        distances = np.asarray(distances)
        self.vis_time_bars(distances, ven_name)

        # mean distance and SD
        dists = distances[1:]
        return np.mean(dists), np.std(dists)
Example #9
    def testDoc2authorMissing(self):
        # Check that the results are the same if doc2author is constructed automatically from author2doc.
        model = self.class_(corpus, author2doc=author2doc, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0)
        model2 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, random_state=0)

        # Compare Jill's topics before in both models.
        jill_topics = model.get_author_topics('jill')
        jill_topics2 = model2.get_author_topics('jill')
        jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
        jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)
        self.assertTrue(np.allclose(jill_topics, jill_topics2))
Example #10
    def testUpdate(self):
        # Check that calling update after the model already has been trained works.
        model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

        jill_topics = model.get_author_topics('jill')
        jill_topics = matutils.sparse2full(jill_topics, model.num_topics)

        model.update()
        jill_topics2 = model.get_author_topics('jill')
        jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)

        # Did we learn something?
        self.assertFalse(all(np.equal(jill_topics, jill_topics2)))
Example #11
    def testPersistenceCompressed(self):
        fname = testfile() + '.gz'
        model = self.model
        model.save(fname)
        model2 = self.class_.load(fname, mmap=None)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))

        # Compare Jill's topics before and after save/load.
        jill_topics = model.get_author_topics('jill')
        jill_topics2 = model2.get_author_topics('jill')
        jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
        jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)
        self.assertTrue(np.allclose(jill_topics, jill_topics2))
Example #12
def sparse2matrix(inpath1, inpath2, topics_num, file_name) :
    destpath = '/data/mallet_tests/hellinger/tmp_matrice_'+topics_num+'_'+file_name

    with open(inpath1, 'r') as comparator :
        with io.open(inpath2, 'r') as comparable :
            i = 0

            for line_tor in comparator :
                print line_tor.split()[:2]
                # parse the sparse "topic:value" pairs and expand them to a dense vector
                l_tor = line_tor.split()[2:]
                l_tor = tuple(tuple(map(int, pair.split(':'))) for pair in l_tor)
                len_tor = int(topics_num.split('x')[0])
                mat_tor = mat.sparse2full(doc=l_tor, length=len_tor)

                # read the matching line from the second file and densify it as well
                line_ble = comparable.readline()
                print line_ble.split()[:2]
                l_ble = line_ble.split()[2:]
                l_ble = tuple(tuple(map(int, pair.split(':'))) for pair in l_ble)
                len_ble = int(topics_num.split('x')[1])
                mat_ble = mat.sparse2full(doc=l_ble, length=len_ble)

                # pairwise squared differences of square roots -- the per-cell terms
                # of the Hellinger distance between the two topic distributions
                matrix = n.zeros(shape=(len_ble, len_tor))
                for k in xrange(len_tor) :
                    for j in xrange(len_ble) :
                        matrix[j][k] = pow(abs(math.sqrt(mat_tor[k]) - math.sqrt(mat_ble[j])), 2)

                with open(destpath+'_'+line_tor.split()[1]+'.txt', 'w') as matrixfile :
                    matrixfile.write(str(mat.full2sparse(matrix)))
                print 'word %s done' % line_ble.split()[:2]
                i += 1

    print 'matrixes done'
Example #13
    def testUpdateNewDataOldAuthor(self):
        # Check that calling update with new documents and/or authors after the model already has
        # been trained works.
        # Test an author that already existed in the old dataset.
        model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

        jill_topics = model.get_author_topics('jill')
        jill_topics = matutils.sparse2full(jill_topics, model.num_topics)

        model.update(corpus_new, author2doc_new)
        jill_topics2 = model.get_author_topics('jill')
        jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)

        # Did we learn more about Jill?
        self.assertFalse(all(np.equal(jill_topics, jill_topics2)))
Example #14
 def get_features(self, article):
     """
     Returns full features vector from article.
     Article should be a mongodb model
     """
     #check whether the article's stored features match the current extractor version
     try:
         feature_version = article.features.version
     except AttributeError as e:
         if str(e) == 'features':
             logger.error("Article %s does not have any features." % article.id)
             #the article has no stored features; propagate the error
             raise
          
     if feature_version != self.extractor.get_version():
         clean_content = article.clean_content
             
         #get new features
         new_features = self.extractor.get_features(clean_content)
             
         #save new features
         features = Features(version=self.extractor.get_version(), data=new_features)
         article.features = features
         try:
             article.save()
         except queryset.OperationError as e:
             logger.error("Could not save article with id %s: %s" % (article.id, e))
     
     #sparse2full converts list of 2-tuples to numpy array
     article_features_as_full_vec = matutils.sparse2full(article.features.data, self.num_features_)
     
     return article_features_as_full_vec
Example #15
    def transform(self, author_names):
        """Infer the topic probabilities for each author.

        Parameters
        ----------
        author_names : {iterable of str, str}
            Author name or sequence of author names whose topics will be identified.

        Returns
        -------
        numpy.ndarray
            Topic distribution for each input author.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of arrays
        if not isinstance(author_names, list):
            author_names = [author_names]
        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names]
        return np.reshape(np.array(topics), (len(author_names), self.num_topics))
 def __init__(self, corpus, numBest = None, dtype = numpy.float32, numFeatures = None):
     """
     If `numBest` is left unspecified, similarity queries return a full list (one 
     float for every document in the corpus, including the query document):
     
     If `numBest` is set, queries return `numBest` most similar documents, as a 
     sorted list:
     
     >>> sms = MatrixSimilarity(corpus, numBest = 3)
     >>> sms[vec12]
     [(12, 1.0), (30, 0.95), (5, 0.45)]
     
     """
     if numFeatures is None:
         logging.info("scanning corpus of %i documents to determine the number of features" %
                      len(corpus))
         numFeatures = 1 + utils.getMaxId(corpus)
         
     logging.info("creating matrix for %i documents and %i features" % 
                  (len(corpus), numFeatures))
     self.numFeatures = numFeatures
     self.numBest = numBest
     self.corpus = numpy.empty(shape = (len(corpus), numFeatures), dtype = dtype, order = 'F')
     self.normalize = True
     
     # iterate over corpus, populating the numpy matrix
     for docNo, vector in enumerate(corpus):
         if docNo % 1000 == 0:
             logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
         vector = matutils.unitVec(matutils.sparse2full(vector, numFeatures))
         self.corpus[docNo] = vector
     
     self.corpus = numpy.asmatrix(self.corpus)
Example #17
    def extract_interests(self, good_lsi, b_good_id):
        interests = []        
        while True:
            if len(interests) > self.MAX_INTEREST:
                print("已经达到最大兴趣点:%d" %(self.MAX_INTEREST))
                break
            
            max_id = 0
            max_list = []
            for k, v in b_good_id.items():
                if len(v) > len(max_list):
                    max_id = k
                    max_list = copy.deepcopy(v)
            
            if len(max_list) < 3:
                print("迭代结束!")
                break
            
            for rm_id in max_list:
                for k, v in b_good_id.items():
                    if rm_id in v:
                        v.remove(rm_id)
                        
            print("创建兴趣点:%d ~ %s" %(max_id, repr(max_list)))

            full_lsi = [matutils.sparse2full(good_lsi[gid], self.k_value) for gid in max_list]
            full_lsi_array = np.array(full_lsi)
            interests.append(matutils.unitvec(np.average(full_lsi_array, axis=0)) )
        
        print("用户兴趣点个数:%d" %(len(interests)) )
        return np.array(interests)
Example #18
    def transform(self, docs):
        """Infer a matrix of topic distribution for the given document bow, where a_ij
        indicates (topic_i, topic_probability_j).

        Parameters
        ----------
        docs : {iterable of list of (int, number), list of (int, number)}
            Document or sequence of documents in BOW format.

        Returns
        -------
        numpy.ndarray of shape [`len(docs), num_topics`]
            Topic distribution for `docs`.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of array
        if isinstance(docs[0], tuple):
            docs = [docs]
        distribution, max_num_topics = [], 0

        for doc in docs:
            topicd = self.gensim_model[doc]
            distribution.append(topicd)
            max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution]
        return np.reshape(np.array(distribution), (len(docs), max_num_topics))
def get_topics(model, corpus, by_ids=None, full=True):
    logger.info('Getting doc topic for corpus with length %d', len(corpus))
    doc_topic = list()
    corpus.metadata = True
    old_id2word = corpus.id2word
    corpus.id2word = model.id2word

    for doc, metadata in corpus:
        if by_ids is None or metadata[0] in by_ids:
            # get a vector where low topic values are zeroed out.
            topics = model[doc]
            if full:
                topics = sparse2full(topics, model.num_topics)

            # this gets the "full" vector that includes low topic values
            # topics = model.__getitem__(doc, eps=0)
            # topics = [val for id, val in topics]

            doc_topic.append((metadata, topics))

    corpus.metadata = False
    corpus.id2word = old_id2word
    logger.info('Returning doc topic of length %d', len(doc_topic))

    return doc_topic
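
A rough, self-contained sketch of the same idea for a plain LDA model (toy corpus; sparse2full fills in the topics the model omitted because they fell below its probability cutoff):

import numpy as np
from gensim import corpora, matutils
from gensim.models import LdaModel

texts = [["human", "computer"], ["graph", "trees"], ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]
lda = LdaModel(bow_corpus, id2word=dictionary, num_topics=2, random_state=0)

# dense (documents x topics) matrix; topics below the cutoff come back as 0.0
doc_topic = np.vstack([matutils.sparse2full(lda[bow], lda.num_topics) for bow in bow_corpus])
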
    def initialize(self, corpus, chunks = 100, keepDecomposition = False, dtype = numpy.float64):
        """
        Run SVD decomposition on the corpus. This will define the latent space into 
        which terms and documents will be mapped.
        
        The SVD is created incrementally, in blocks of `chunks` documents. In the
        end, a `self.projection` matrix is constructed that can be used to transform 
        documents into the latent space. The `U, S, V` decomposition itself is 
        discarded, unless `keepDecomposition` is True, in which case it is stored 
        in `self.u`, `self.s` and `self.v`.
        
        The algorithm is adapted from:
        **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
        """
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            maxId = -1
            for document in corpus:
                maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
            self.numTerms = 1 + maxId
            self.id2word = dict(zip(xrange(self.numTerms), xrange(self.numTerms)))
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        # initialize decomposition (zero documents so far)
        self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics)), dtype = dtype)
        self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)), dtype = dtype)
        #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)), dtype = dtype)
        self.v = None
        
        # do the actual work -- perform iterative singular value decomposition.
        # this is done by sequentially updating SVD with `chunks` new documents
        chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
        for chunkNo, (key, group) in enumerate(chunker):
            # convert the chunk of sparse documents to full vectors
            docs = [matutils.sparse2full(doc, self.numTerms) for docNo, doc in group]
#            self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
            self.svdAddCols(docs, reorth = False)
            logging.info("processed documents up to #%s" % docNo)
        

        # calculate projection needed to get document-topic matrix from term-document matrix.
        #
        # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u^-1 * x,
        # so the projection is self.s^-1 * self.u^-1.
        #
        # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
        # we pre-multiply v * s (ie., scale axes by singular values), and return
        # that directly as the representation of `x` in LSI space.
        #
        # this conveniently simplifies to lsi[x] = self.u.T * x, so the projection is 
        # just self.u.T
        # 
        # note that neither `v` (the right singular vectors) nor `s` (the singular 
        # values) are used at all in the transformation
        self.projection = self.u.T
        
        if not keepDecomposition:
            # once we have the projection stored in self, discard u*s*v decomposition to free up memory
            del self.u, self.v
Example #21
def get_features(article, extractor):
    '''
    Returns full features vector from article.
    Article should be a mongodb model
    '''
    #check whether the article's stored features match the current extractor version
    try:
        feature_version = article.features.version
    except AttributeError as e:
        if str(e) == 'features':
            logger.error("Article %s does not have any features." % 
                         article.id)
            #the article has no stored features; propagate the error
            raise
         
    if feature_version != extractor.get_version():
        clean_content = article.clean_content
            
        #get new features
        features = extractor.get_features(clean_content)
    else:
        features = article.features.data
    
    #sparse2full converts list of 2-tuples to numpy array
    article_features_as_full_vec = matutils.sparse2full(features, 
                                                        extractor.get_feature_number())
    
    return article_features_as_full_vec
Example #22
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunks=256):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.info("scanning corpus to determine the number of features")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunks = chunks

        if corpus is not None:
            logger.info("creating matrix for %s documents and %i features" %
                         (len(corpus), num_features))
            self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
                self.index[docno] = matutils.unitvec(matutils.sparse2full(vector, num_features))
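
A hedged usage sketch of such an index on a toy corpus (MatrixSimilarity normalizes each document exactly as above, so queries return cosine similarities):

from gensim import corpora, similarities

texts = [["human", "computer"], ["graph", "trees"], ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

index = similarities.MatrixSimilarity(bow_corpus, num_features=len(dictionary))
print(index[dictionary.doc2bow(["graph", "trees"])])  # cosine similarity against every document
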
Example #23
    def transform(self, docs):
        """Infer the topic distribution for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), list of (int, number)}
            Document or sequence of documents in BoW format.

        Returns
        -------
        numpy.ndarray of shape [`len(docs)`, `num_topics`]
            The topic distribution for each input document.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of array
        if isinstance(docs[0], tuple):
            docs = [docs]
        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
        return np.reshape(np.array(distribution), (len(docs), self.num_topics))
Example #24
    def transform(self, docs):
        """
        Takes a list of documents as input ('docs').
        Returns a matrix of topic distribution for the given document bow, where a_ij
        indicates (topic_i, topic_probability_j).
        The input `docs` should be in BOW format and can be a list of documents like
        [[(4, 1), (7, 1)],
        [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
        or a single document like : [(4, 1), (7, 1)]
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of array
        if isinstance(docs[0], tuple):
            docs = [docs]
        distribution, max_num_topics = [], 0

        for doc in docs:
            topicd = self.gensim_model[doc]
            distribution.append(topicd)
            max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution]
        return np.reshape(np.array(distribution), (len(docs), max_num_topics))
    def testTransform(self):
        passed = False
        # sometimes, LDA training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(25):  # restart at most 25 times
            # create the transformation model
            model = self.class_(id2word=dictionary, num_topics=2, passes=100)
            model.update(self.corpus)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
            expected = [0.13, 0.87]
            passed = numpy.allclose(
                sorted(vec), sorted(expected), atol=1e-1
            )  # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning(
                "LDA failed to converge on attempt %i (got %s, expected %s)" % (i, sorted(vec), sorted(expected))
            )
        self.assertTrue(passed)
Example #26
 def __getitem__(self, bow, scaled=False):
     """
     Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return the non-scaled embedding, set `scaled` to True.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as a result
     is_corpus, bow = utils.isCorpus(bow)
     if is_corpus:
         return self._apply(bow)
     
     assert self.projection.u is not None, "decomposition not initialized yet"
     vec = matutils.sparse2full(bow, self.numTerms).astype(self.projection.u.dtype)
     vec.shape = (self.numTerms, 1)
     assert self.projection.u.flags.f_contiguous
     dgemv = matutils.blas('gemv', self.projection.u)
     topicDist = dgemv(1.0, self.projection.u, vec, trans=True) # u^T * x
     if scaled:
         topicDist = (1.0 / self.projection.s) * topicDist # s^-1 * u^T * x
     
     nnz = topicDist.nonzero()[0]
     return zip(nnz, topicDist[nnz])
Example #27
    def add_documents(self, corpus):
        """
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # the last shard was incomplete (< min_ratio * shardsize full); load it back and add the new documents to it instead of starting a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Example #28
    def testTransform(self):
        passed = False
        # sometimes, training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(25):  # restart at most 25 times
            # create the transformation model
            model = self.class_(id2word=dictionary, num_topics=2, passes=100, random_state=0)
            model.update(corpus, author2doc)

            jill_topics = model.get_author_topics('jill')

            # NOTE: this test may easily fail if the author-topic model is altered in any way. The model's
            # output is sensitive to a lot of things, like the scheduling of the updates, or like the
            # author2id (because the random initialization changes when author2id changes). If it does
            # fail, simply be aware of whether we broke something, or if it just naturally changed the
            # output of the model slightly.
            vec = matutils.sparse2full(jill_topics, 2)  # convert to dense vector, for easier equality tests
            expected = [0.91, 0.08]
            passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1)  # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning(
                "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
                i, sorted(vec), sorted(expected)
            )
        self.assertTrue(passed)
Example #29
 def generator():
     for document in documents:
         vec = self.tfidf[self.lexicon.doc2bow(document)]
         if self.tofull:
             yield sparse2full(vec, len(self.lexicon))
         else:
             yield vec
Example #30
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.
        
        bow should already be weights, e.g. with TF-IDF
        """
        # if the input vector is in fact a corpus, return a transformed corpus 
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        #use corpus as interpreter matrix
        #simply multiply feature vector of input with corpus matrix
        #to get the weight of the concept
        vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

        #normalize
        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(concept_id, weight)
                  for concept_id, weight
                  in enumerate(vector)
                  if abs(weight) > eps]
        return vector
    def testTransform(self):
        # create the transformation model
        model = lsimodel.LsiModel(self.corpus, numTopics=2)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(
            transformed,
            2)  # convert to dense vector, for easier equality tests

        expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
        # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version

        self.assertTrue(numpy.allclose(
            abs(vec),
            abs(expected)))  # transformed entries must be equal up to sign
Example #32
    def search(self, query):

        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)
        lda_query = sparse2full(self.model[vec_query], self.num_topics)

        results = defaultdict(float)
        for doc_id, lda_doc_repr in zip(self.corpus.doc_ids,
                                        self.lda_corpus_pers):
            results[doc_id] = kullback_leibler(lda_query, lda_doc_repr)

        results = {
            k: v
            for k, v in sorted(
                results.items(), key=lambda item: item[1], reverse=True)
        }
        return list(results.items())
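
For context, matutils.kullback_leibler is a divergence rather than a similarity: it is 0 only when the two distributions match and grows as they differ, so the sort direction above decides whether the most or least similar documents come first. A tiny illustration with made-up distributions:

import numpy as np
from gensim import matutils

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.4, 0.4, 0.2])
print(matutils.kullback_leibler(p, p))   # 0.0 for identical distributions
print(matutils.kullback_leibler(p, q))   # positive, larger for more dissimilar distributions
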
Example #33
    def testTransform(self):
        # create the transformation model
        numpy.random.seed(
            13
        )  # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results)
        model = rpmodel.RpModel(self.corpus, num_topics=2)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(
            transformed,
            2)  # convert to dense vector, for easier equality tests

        expected = numpy.array([-0.70710677, 0.70710677])
        self.assertTrue(numpy.allclose(
            vec, expected))  # transformed entries must be equal up to sign
 def __init__(self, dictionary=None, corpus=None, index_file=None, max_docs=None, **kwargs):
     Corpus.__init__(self, dictionary=dictionary, corpus=corpus)
     self.clip_corpus(max_docs)
     # Set up for KNN
     features = len(self.dictionary)
     self.index = AnnoyIndex(features)
     start_time = datetime.datetime.now()
     if not index_file:
         self.transform_corpus(models.TfidfModel)
         for i, vector in enumerate(self):
             self.index.add_item(i, list(sparse2full(vector, features).astype(float)))
         self.index.build(self.no_trees)
     else:
         self.index.load(index_file)
     end_time = datetime.datetime.now()
     self.train_time = end_time - start_time
     return
 def testTransform(self):
     # create the transformation model
     passed = False
     numpy.random.seed(13)
     for i in xrange(10): # lda is randomized, so allow 10 iterations to test for equality
         model = ldamodel.LdaModel(self.corpus, numTopics = 2)
         
         # transform one document
         doc = list(self.corpus)[0]
         transformed = model[doc]
         
         vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
         expected = [0.0, 1.0]
         passed = passed or numpy.allclose(sorted(vec), sorted(expected))  # must contain the same values, up to re-ordering
         if passed:
             break
     self.assertTrue(passed, "Error in randomized LDA test")
Example #36
    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.num_terms).reshape(
            self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = numpy.dot(self.projection,
                               vec)  # (k, d) * (d, 1) = (k, 1)
        return [(topicid, float(topicvalue))
                for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue)
                and not numpy.allclose(topicvalue, 0.0)]
Example #37
    def test_transform_float32(self):
        """Test lsi[vector] transformation."""
        # create the transformation model
        model = lsimodel.LsiModel(self.corpus, num_topics=2, dtype=np.float32)

        # make sure the decomposition is enough accurate
        u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
        self.assertTrue(np.allclose(s[:2], model.projection.s))  # singular values must match
        self.assertEqual(model.projection.u.dtype, np.float32)
        self.assertEqual(model.projection.s.dtype, np.float32)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
        expected = np.array([-0.6594664, 0.142115444])  # scaled LSI version
        # transformed entries must be equal up to sign
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1.e-5))
 def testSparseTransform(self):
     if not self.mallet_path:
         return
     passed = False
     for i in range(5): # restart at most 5 times
         # create the sparse transformation model with the appropriate topic_threshold
         model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5)
         # transform one document
         doc = list(corpus)[0]
         transformed = model[doc]
         vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
         expected = [1.0, 0.0]
         passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering
         if passed:
             break
         logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                         (i, sorted(vec), sorted(expected)))
     self.assertTrue(passed)
Example #39
    def __init__(self,
                 corpus,
                 num_best=None,
                 dtype=numpy.float32,
                 num_features=None,
                 chunksize=256):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.info("scanning corpus to determine the number of features")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize

        if corpus is not None:
            logger.info("creating matrix for %s documents and %i features" %
                        (len(corpus), num_features))
            self.index = numpy.empty(shape=(len(corpus), num_features),
                                     dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i" %
                                 (docno, len(corpus)))
                # individual documents may in fact be in numpy or scipy.sparse format as well.
                # this is not documented because it is not fully supported throughout;
                # the user had better know what they are doing (no normalization, must
                # explicitly supply num_features etc).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(
                        matutils.sparse2full(vector, num_features))
                self.index[docno] = vector
Example #40
    def plot_author_clustering_interia(self, max_cluster=100, min_cluster=3):
        nips = self.model
        author_vecs = self.author_vecs
        X = [
            matutils.sparse2full(author, nips.num_topics)
            for author in author_vecs
        ]
        inertianew = []
        scaler = StandardScaler()
        scaler.fit(X)
        X_new = scaler.transform(X)

        for i in range(min_cluster, max_cluster):
            print('\nCreating K-means clustering with n_clusters=%d' % i)
            kmeans = KMeans(n_clusters=i, random_state=0).fit(X_new)
            inertianew.append(kmeans.inertia_)

        plt.plot(list(range(min_cluster, max_cluster)), inertianew)
        plt.show()
Example #41
def run_evaluation(classifiers, models, eval_samples):
    ln.info("Beginning evaluation")
    classifications = dict()
    for modelname, classifier in classifiers.items():
        model = models[modelname]
        model_classifications = defaultdict(int)
        for sample_no, (eval_sample_text,
                        actual_label) in enumerate(eval_samples):
            bow = dictionary.doc2bow(simple_preprocess(eval_sample_text))
            model_features = sparse2full(model[bow], model.__out_size)
            predicted_label = classifier.predict(model_features)[0]

            model_classifications[(actual_label, predicted_label)] += 1
            if sample_no % 500 == 0:
                ln.debug("Classifier for %s evaluated %s samples so far." %
                         (modelname, sample_no))
        classifications[modelname] = model_classifications
    ln.info("Finished evaluation")
    return classifications
Example #42
def get_docs_emb_trained(docs: List[str], nlp: en.English, pattern_name: str,
                         model_dir: str) -> List[str]:
    """ Gets list of GloVe vectors for each string in docs using TF-IDF model.

    Loads previously stored doc dictionary and TF-IDF model to determine BOW
    representation of corpus.

    Args:
        docs: list of strings to get embeddings for.
        nlp: pretrained spacy model to use for embedding vectors.
        pattern_name: regex pattern name that generates list of docs (only to
                      be used for naming purposes of dictionary/model).
        model_dir: model directory to load dictionary and TF-IDF model from.

    Returns:
        List of GloVe embedded vectors. Has shape (len(docs), 300)
    """

    loaded_dict = Dictionary.load(model_dir + pattern_name + '_dict.dict')
    model = pickle.load(
        open(model_dir + pattern_name + '_tfidf_model.sav', 'rb'))

    corpus = [loaded_dict.doc2bow(doc) for doc in docs]

    if corpus:
        docs_tfidf = model[corpus]

        # extracts vector representation for each document from bag of words
        docs_vecs = np.vstack(
            [sparse2full(c, len(loaded_dict)) for c in docs_tfidf])

        # extracts nlp vector using pretrained model for each term in dictionary
        tfidf_emb_vecs = np.vstack(
            [nlp(loaded_dict[i]).vector for i in range(len(loaded_dict))])

        # gets nlp vector embedding of each doc
        docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)

        return docs_emb

    # if corpus is empty according to dictionary
    return []
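
A rough sketch of the same weighting trick with a made-up embedding matrix standing in for the spaCy vectors (the final dot product computes a tf-idf weighted sum of per-term vectors for every document):

import numpy as np
from gensim import corpora, matutils
from gensim.models import TfidfModel

docs = [["graph", "trees"], ["graph", "minors"], ["human", "computer", "graph"]]
dictionary = corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(d) for d in docs]
tfidf = TfidfModel(bow_corpus)

term_vecs = np.random.rand(len(dictionary), 4)      # stand-in for real word embeddings
doc_weights = np.vstack([matutils.sparse2full(tfidf[b], len(dictionary)) for b in bow_corpus])
doc_embs = doc_weights.dot(term_vecs)               # shape (num_docs, 4)
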
Example #43
    def __init__(self,
                 corpus,
                 numBest=None,
                 dtype=numpy.float32,
                 numFeatures=None):
        """
        If `numBest` is left unspecified, similarity queries return a full list (one
        float for every document in the corpus, including the query document):

        If `numBest` is set, queries return `numBest` most similar documents, as a
        sorted list:

        >>> sms = MatrixSimilarity(corpus, numBest = 3)
        >>> sms[vec12]
        [(12, 1.0), (30, 0.95), (5, 0.45)]

        """
        if numFeatures is None:
            logging.info(
                "scanning corpus of %i documents to determine the number of features"
                % len(corpus))
            numFeatures = 1 + utils.getMaxId(corpus)

        logging.info("creating matrix for %i documents and %i features" %
                     (len(corpus), numFeatures))
        self.numFeatures = numFeatures
        self.numBest = numBest
        self.corpus = numpy.empty(shape=(len(corpus), numFeatures),
                                  dtype=dtype)
        self.normalize = True

        if corpus is not None:
            # iterate over corpus, populating the numpy matrix
            for docNo, vector in enumerate(corpus):
                if docNo % 1000 == 0:
                    logging.info("PROGRESS: at document #%i/%i" %
                                 (docNo, len(corpus)))
                vector = matutils.unitVec(
                    matutils.sparse2full(vector, numFeatures))
                self.corpus[docNo] = vector

        self.corpus = numpy.asmatrix(self.corpus)
Example #44
    def print_top_topics_of_year(self, year, sp=True):
        model = self.model
        doc_vecs = self.doc_vecs
        topic_labels = self.topic_labels
        df = self.df_papers
        sparse_vecs = np.array(
            [matutils.sparse2full(vec, model.num_topics) for vec in doc_vecs])
        #tsne = TSNE(random_state=3211)
        #tsne_embedding = tsne.fit_transform(sparse_vecs)
        #years=df['year'].values
        #df_dist=pd.DataFrame({'year':years,'Topic_Distribution':sparse_vecs})
        df_sp = pd.DataFrame(sparse_vecs)
        df_dist = df_sp[df['year'] == year]
        top_topic = df_dist.sum().idxmax()
        value = df_dist.sum()[top_topic]
        print('Top topic: %s' % topic_labels[top_topic])
        print('Value: %f' % value)

        print('\n Following are the top words in the topic')
        print(self.model.show_topic(top_topic))
        #data = df[df['year']<=year]
        #        ax=df_dist.sum().plot(kind='bar')
        #        ax.set_xticklabels(topic_labels, rotation=90)
        #        plt.title('Topic Score for the year:%d' %year)
        #        plt.show()
        #
        df = pd.DataFrame({
            'Topic': topic_labels,
            'TopicDistribution': df_dist.sum().values
        })

        output_file("toptpic.html")

        p = Bar(df,
                'Topic',
                values='TopicDistribution',
                title="Bar Plot of Topic Distributions for the year %d" % year,
                color='green')
        if (sp):
            show(p)

        return df_dist
Example #45
    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        if getattr(self, 'freshly_loaded', False):
            # This is a hack to work around a bug in numpy, where a FORTRAN-order array
            # unpickled from disk segfaults on using it.
            self.freshly_loaded = False
            self.projection = self.projection.copy('F')  # simply making a fresh copy fixes the broken array

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]
Example #46
def get_user_latent_vector(user_action, path):
    """
    get user latent vector
    Args:
        user_action: user action
        path: lda model path
    Return:
        user_latent_vector: {userid: np.ndarray([v1, v2, v3])}
    """
    texts = [x[2] for x in user_action]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    if not os.path.exists(path):
        lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=F, id2word=dictionary)
        lda.save(path)
    else:
        lda = models.ldamodel.LdaModel.load(path)
    topics = lda.get_document_topics(corpus)
    user_latent_vector = {user_action[i][0]: sparse2full(topics[i], lda.num_topics) for i in range(len(texts))}
    return user_latent_vector
 def __getitem__(self, bow, scaled = True):
     """
     Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return non-scaled embedding, set `scaled` to False.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     vec = matutils.sparse2full(bow, self.numTerms)
     vec.shape = (self.numTerms, 1)
     assert vec.dtype == numpy.float32 and self.projection.dtype == numpy.float32
     topicDist = self.projection * vec
     if not scaled:
         topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
 def __getitem__(self, bow, scaled = False):
     """
     Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding input document into the latent topic space. 
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return the non-scaled embedding, set `scaled` to True.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     assert self.projection.u is not None, "decomposition not initialized yet"
     vec = numpy.asfortranarray(matutils.sparse2full(bow, self.numTerms), dtype = self.projection.u.dtype)
     vec.shape = (self.numTerms, 1)
     topicDist = scipy.linalg.fblas.dgemv(1.0, self.projection.u, vec, trans = True) # u^T * x
     if scaled:
         topicDist = (1.0 / self.projection.s) * topicDist # s^-1 * u^T * x
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
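Both __getitem__ variants above implement the usual LSI fold-in: with a truncated SVD X ≈ U S V^T, a new document x is projected as U^T x (the representation "scaled by the singular values") or as S^-1 U^T x (the unit-scaled coordinates). A small self-contained numpy sketch of the same computation, with illustrative names that are not from the original classes:

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((50, 8))                 # toy term-document matrix: 50 terms, 8 documents
U, s, Vt = np.linalg.svd(X, full_matrices=False)
U, s = U[:, :2], s[:2]                  # keep a rank-2 decomposition

x = rng.random(50)                      # a new document as a dense term vector
scaled = U.T @ x                        # "scaled by singular values", as in the docstrings above
non_scaled = scaled / s                 # s^-1 * U^T * x, the unit-scaled fold-in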
    def analyze(self, sentence, nlp):
        parsed_sentence = tokenize(nlp(sentence))
        #sent_corp = [self.text_dict.doc2bow(parsed_sentence)]
        sent_corp = [
            get_sent_bow(parsed_sentence,
                         self.text_dict,
                         nlp,
                         preload=self.preload)
        ]
        sent_tfidf = self.tfidf[sent_corp]
        sent_vec = np.vstack(
            [sparse2full(c, len(self.text_dict)) for c in sent_tfidf])
        glove_vec = np.dot(sent_vec, self.tf_glove)
        try:
            red_glove = self.pca.transform(glove_vec)
        except ValueError:
            return glove_vec, sent_corp

        prob = self.classifier.predict_proba(red_glove)
        val = get_classification_values(red_glove, self.classifier)[0]
        return val
Example #50
    def testTransformSerialized(self):
        # Same as testTransform, using serialized corpora.
        passed = False
        # sometimes, training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(25):  # restart at most 25 times
            # create the transformation model
            model = self.class_(
                id2word=dictionary,
                num_topics=2,
                passes=100,
                random_state=0,
                serialized=True,
                serialization_path=datapath('testcorpus_serialization.mm'))
            model.update(self.corpus, author2doc)

            jill_topics = model.get_author_topics('jill')

            # NOTE: this test may easily fail if the author-topic model is altered in any way. The model's
            # output is sensitive to many things, such as the scheduling of the updates or the author2id
            # mapping (the random initialization changes when author2id changes). If it does fail, check
            # whether something was actually broken, or whether the model's output simply changed slightly.
            vec = matutils.sparse2full(
                jill_topics,
                2)  # convert to dense vector, for easier equality tests
            expected = [0.91, 0.08]
            passed = np.allclose(
                sorted(vec), sorted(expected),
                atol=1e-1)  # must contain the same values, up to re-ordering

            # Delete the MmCorpus used for serialization inside the author-topic model.
            remove(datapath('testcorpus_serialization.mm'))
            if passed:
                break
            logging.warning(
                "Author-topic model failed to converge on attempt %i (got %s, expected %s)"
                % (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed)
Example #51
    def testTransform(self):
        passed = False
        # sometimes, LDA training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(5): # restart at most 5 times
            # create the transformation model
            model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
            model.update(corpus)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
            expected = [0.13, 0.87]
            passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                            (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed)
Example #52
    def transform(self, docs):
        """
        Takes a list of documents as input ('docs').
        Returns a matrix of topic distribution for the given document bow, where a_ij
        indicates (topic_i, topic_probability_j).
        The input `docs` should be in BOW format and can be a list of documents like
        [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
        or a single document like [(4, 1), (7, 1)].
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of array
        if isinstance(docs[0], tuple):
            docs = [docs]
        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
        return np.reshape(np.array(distribution), (len(docs), self.num_topics))
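The densification step at the heart of this transform wrapper can be exercised on its own; a minimal sketch with an invented toy corpus:

import numpy as np
from gensim import corpora, models, matutils

texts = [["cat", "dog"], ["dog", "fish"], ["cat", "fish", "bird"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=2, passes=10, random_state=0)

docs = bow_corpus                # or a single BOW document, e.g. bow_corpus[0]
if isinstance(docs[0], tuple):
    docs = [docs]                # same single-document check as in the wrapper above
dense = np.array([matutils.sparse2full(lda[doc], lda.num_topics) for doc in docs])
print(dense.shape)               # (len(docs), num_topics), ready for sklearn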
Example #53
    def transform(self, docs):
        """
        Takes a list of documents as input ('docs').
        Returns a matrix of topic distribution for the given document bow, where a_ij
        indicates (topic_i, topic_probability_j).
        The input `docs` should be in BOW format and can be a list of documents like
        [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
        or a single document like [(4, 1), (7, 1)].
        """
        if self.gensim_model is None:
            raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

        # The input as array of array
        check = lambda x: [x] if isinstance(x[0], tuple) else x
        docs = check(docs)
        X = [[] for _ in range(0, len(docs))]

        for k, v in enumerate(docs):
            doc_topics = self.gensim_model[v]
            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
            probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
            X[k] = probs_docs
        return np.reshape(np.array(X), (len(docs), self.num_topics))
Example #54
    def show_author_by_id(self, author):
        model = self.model
        topic_labels = self.topic_labels
        print('\n%s' % author)
        #print('Docs:', model.author2doc[author])
        print('Topics:')
        print([(topic_labels[topic[0]], topic[1]) for topic in model[author]])
        dist = matutils.sparse2full(model[author], model.num_topics)
        df = pd.DataFrame({'Topic': topic_labels, 'Score': dist})
        #plt.plot(dist)
        #        ax=df['Score'].plot(kind='bar')
        #        ax.set_xticklabels(topic_labels, rotation=90)
        #        output_file("AuthorTopicDistribution.html")
        #        plt.show()
        p = Bar(df,
                'Topic',
                values='Score',
                title="Bar Plot of Topic Distributions of %s" %
                self.get_author_name_from_id(author))
        show(p)

        print(self.get_author_name_from_id(author))
Example #55
    def testTransform(self):
        # create the transformation model
        model = lsimodel.LsiModel(self.corpus, numTopics=2)

        # make sure the decomposition is accurate enough
        u, s, vt = numpy.linalg.svd(matutils.corpus2dense(
            self.corpus, self.corpus.numTerms),
                                    full_matrices=False)
        self.assertTrue(numpy.allclose(
            s[:2], model.projection.s))  # singular values must match

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(
            transformed,
            2)  # convert to dense vector, for easier equality tests

        expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
        # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
        self.assertTrue(numpy.allclose(
            abs(vec),
            abs(expected)))  # transformed entries must be equal up to sign
    def transform(self, author_names):
        """
        Return topic distribution for input authors as a list of
        (topic_id, topic_probability) 2-tuples.
        """
        # The input as array of array
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        check = lambda x: [x] if not isinstance(x, list) else x
        author_names = check(author_names)
        X = [[] for _ in range(0, len(author_names))]

        for k, v in enumerate(author_names):
            transformed_author = self.gensim_model[v]
            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
            probs_author = matutils.sparse2full(transformed_author,
                                                self.num_topics)
            X[k] = probs_author

        return np.reshape(np.array(X), (len(author_names), self.num_topics))
Example #57
def get_sims(model, query, corpus_full, dictionary, n_topics):
    ''' get ranking for single query '''

    # avoid division by 0
    eps = 1e-8

    # process query
    query_processed = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(query_processed)
    q_lda = sparse2full(model[query_bow], n_topics)
    q_lda += eps

    sims = []

    # loop over all docs
    for i, doc in enumerate(corpus_full):
        doc = doc + eps  # add eps without mutating the shared corpus vector in place
        sim = -1 * kullback_leibler(q_lda, doc)
        sims.append(sim)

    sim_ordered = sorted(enumerate(sims), key=lambda item: -1 * item[1])

    return sim_ordered
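A sketch of the ranking core used by get_sims, with an invented toy corpus and an already-tokenised query (the original code tokenises queries with the project-specific read_ap.process_text):

from gensim import corpora, models
from gensim.matutils import sparse2full, kullback_leibler

eps = 1e-8
docs = [["oil", "price", "rises"], ["football", "match", "tonight"], ["oil", "exports", "fall"]]
dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]

n_topics = 2
lda = models.LdaModel(bow, id2word=dictionary, num_topics=n_topics, random_state=0)
corpus_full = [sparse2full(lda[d], n_topics) for d in bow]  # dense doc vectors, as get_sims expects

q_lda = sparse2full(lda[dictionary.doc2bow(["oil", "price"])], n_topics) + eps
sims = [-kullback_leibler(q_lda, doc + eps) for doc in corpus_full]
print(sorted(enumerate(sims), key=lambda item: -item[1]))  # ranking, most similar first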
 def __get_item__(self, val, rows = True, sparse = True, use_ids = None):
     
     if not sparse and use_ids != True:
         raise Exception("Cannot use original values if format is not sparse")
     
     size = -1
     if rows:
         dct = self.row_tallies[val]
         lookup = lambda id: self.row_id_gen.get_key(id)
         size = self.row_count()
     else:
         dct = self.col_tallies[val]
         lookup = lambda id: self.col_id_gen.get_key(id)
         size = self.col_count()
     
     sitems = sorted(dct.items(), key=lambda kv: kv[0])  # Python 3: no tuple-unpacking lambdas
     if not sparse:
         return matutils.sparse2full(sitems, size)
         
     if use_ids:
         return sitems
     else:
         return [(lookup(k), v) for k,v in sitems]
Example #59
    def getSimilarities(self, doc):
        """
        Return similarity of sparse vector `doc` to all documents in the corpus.

        `doc` may be either a bag-of-words iterable (standard corpus document),
        or a numpy array, or a `scipy.sparse` matrix.
        """
        if scipy.sparse.issparse(doc):
            vec = doc.toarray().flatten()
        elif isinstance(doc, numpy.ndarray):
            vec = doc
        else:
            vec = matutils.sparse2full(doc, self.numFeatures)
        vec = numpy.asfortranarray(vec, dtype=self.corpus.dtype).reshape(
            self.numFeatures, 1)

        # compute cosine similarity against every other document in the collection
        gemv = matutils.blas('gemv', self.corpus)
        allSims = gemv(1.0, self.corpus, vec)  # N x T * T x 1 = N x 1
        allSims = list(allSims.flat)  # convert to plain python list
        assert len(allSims) == self.corpus.shape[
            0]  # make sure no document got lost!
        return allSims
    def transform(self, docs):
        """Infer a matrix of topic distribution for the given document bow, where a_ij
        indicates (topic_i, topic_probability_j).

        Parameters
        ----------
        docs : {iterable of list of (int, number), list of (int, number)}
            Document or sequence of documents in BOW format.

        Returns
        -------
        numpy.ndarray of shape [`len(docs), num_topics`]
            Topic distribution for `docs`.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of array
        if isinstance(docs[0], tuple):
            docs = [docs]
        distribution, max_num_topics = [], 0

        for doc in docs:
            topicd = self.gensim_model[doc]
            distribution.append(topicd)
            max_num_topics = max(max_num_topics,
                                 max(topic[0] for topic in topicd) + 1)

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [
            matutils.sparse2full(t, max_num_topics) for t in distribution
        ]
        return np.reshape(np.array(distribution), (len(docs), max_num_topics))
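The max_num_topics bookkeeping above exists because models such as HDP can return topic lists of different lengths per document; a minimal sketch of padding such ragged sparse outputs into one dense matrix (the sparse lists below are made up):

import numpy as np
from gensim import matutils

sparse_docs = [[(0, 0.7), (3, 0.3)], [(1, 0.9)], [(0, 0.2), (5, 0.8)]]
max_num_topics = max(max(topic_id for topic_id, _ in doc) for doc in sparse_docs) + 1
dense = np.array([matutils.sparse2full(doc, max_num_topics) for doc in sparse_docs])
print(dense.shape)  # (3, max_num_topics)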