def testOnlineTransform(self):
    corpus = list(self.corpus)
    doc = corpus[0]  # use the corpus' first document for testing

    # create the transformation model
    model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)  # compute everything at once
    # start with no documents, we will add them later
    model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)

    # train model on a single document
    model.add_documents([corpus[0]])

    # transform the testing document with this partial transformation
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
    expected = np.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
    self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

    # train on another 4 documents
    model.add_documents(corpus[1:5], chunksize=2)  # train on 4 extra docs, in chunks of 2 documents, for the lols

    # transform a document with this partial transformation
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
    expected = np.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
    self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

    # train on the rest of documents
    model.add_documents(corpus[5:])

    # make sure the final transformation is the same as if we had decomposed the whole corpus at once
    vec1 = matutils.sparse2full(model[doc], model.num_topics)
    vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
    self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5))  # the two LSI representations must equal up to sign

def testFull(self, num_best=None, shardsize=100):
    if self.cls == similarities.Similarity:
        index = self.cls(None, corpus, num_features=len(dictionary), shardsize=shardsize)
    else:
        index = self.cls(corpus, num_features=len(dictionary))
    if isinstance(index, similarities.MatrixSimilarity):
        expected = numpy.array([
            [0.57735026, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.40824831, 0.0, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.0, 0.0, 0.0, 0.0],
            [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.40824831, 0.0, 0.0, 0.0, 0.81649661, 0.0, 0.40824831, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026],
        ], dtype=numpy.float32)
        # HACK: dictionary can be in different order, so compare in sorted order
        self.assertTrue(numpy.allclose(sorted(expected.flat), sorted(index.index.flat)))

    index.num_best = num_best
    query = corpus[0]
    sims = index[query]
    expected = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)][:num_best]

    # convert sims to full numpy arrays, so we can use allclose() and ignore
    # ordering of items with the same similarity value
    expected = matutils.sparse2full(expected, len(index))
    if num_best is not None:  # when num_best is None, sims is already a numpy array
        sims = matutils.sparse2full(sims, len(index))
    self.assertTrue(numpy.allclose(expected, sims))
    if self.cls == similarities.Similarity:
        index.destroy()

def ratejobs(jobs, useremail):
    global dict, corpus, tfidf, lsimodel, jobid2id, id2jobid
    reviews = list(db.reviews.find({'useremail': useremail}))
    labels = []
    reviewrowids = []
    # check for 0 reviews
    for review in reviews:
        if review['jobid'] in jobid2id:
            try:
                labels.append(float(review['rating']))
                reviewrowids.append(jobid2id[review['jobid']])
            except:
                print(review)
    if len(reviewrowids) == 0:
        return {jobid: -1 for jobid in jobs}
    else:
        samples = [lsimodel[tfidf[corpus[rowid]]] for rowid in reviewrowids]
        # can be linear, rbf with gamma, or poly with degree
        svmmodel = svm.SVR(kernel='rbf', C=1e3, gamma=0.01)
        # train our svm with the labeled data
        svmmodel.fit([matutils.sparse2full(sample, 300) for sample in samples], labels)
        # now run the svm model over the jobs to find a rating for each job
        airatings = {}
        for jobid in jobs:
            if jobid in jobid2id:
                airatings[jobid] = (svmmodel.predict(
                    [matutils.sparse2full(lsimodel[tfidf[corpus[jobid2id[jobid]]]], 300)])).item(0)
            else:
                airatings[jobid] = -1
        return airatings

def get_similarities(self, query):
    """
    Return similarity of sparse vector `query` to all documents in the corpus,
    as a numpy array.

    If `query` is a collection of documents, return a 2D array of similarities
    of each document in `query` to all documents in the corpus (=batch query,
    faster than processing each document in turn).

    **Do not use this function directly; use the self[query] syntax instead.**
    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = numpy.asarray(
            [matutils.sparse2full(vec, self.num_features) for vec in query],
            dtype=self.index.dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.toarray()  # convert sparse to dense
        elif isinstance(query, numpy.ndarray):
            pass
        else:
            # default case: query is a single vector in sparse gensim format
            query = matutils.sparse2full(query, self.num_features)
        query = numpy.asarray(query, dtype=self.index.dtype)

    # do a little transposition dance to stop numpy from making a copy of
    # self.index internally in numpy.dot (very slow).
    result = numpy.dot(self.index, query.T).T  # return #queries x #index
    return result  # XXX: removed casting the result from array to list; does anyone care?

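# Hedged usage sketch for get_similarities() above: queries normally go through index[query]
# (i.e. __getitem__), which normalizes the query and applies num_best truncation before
# dispatching here. The toy corpus and variable names below are illustrative only.
from gensim import corpora, similarities

_docs = [["human", "computer", "interface"], ["graph", "trees", "computer"]]
_dictionary = corpora.Dictionary(_docs)
_bow_corpus = [_dictionary.doc2bow(d) for d in _docs]
_index = similarities.MatrixSimilarity(_bow_corpus, num_features=len(_dictionary))
_sims = _index[_dictionary.doc2bow(["computer", "graph"])]  # 1D array of cosine similarities
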
def testSerialized(self):
    # Test the model using serialized corpora. Basic tests, plus test of update functionality.
    model = self.class_(
        self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
        serialized=True, serialization_path=datapath('testcorpus_serialization.mm')
    )

    jill_topics = model.get_author_topics('jill')
    jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
    self.assertTrue(all(jill_topics > 0))

    model.update()
    jill_topics2 = model.get_author_topics('jill')
    jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)

    # Did we learn more about Jill?
    self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

    model.update(corpus_new, author2doc_new)

    # Did we learn something about Sally?
    sally_topics = model.get_author_topics('sally')
    sally_topics = matutils.sparse2full(sally_topics, model.num_topics)
    self.assertTrue(all(sally_topics > 0))

    # Delete the MmCorpus used for serialization inside the author-topic model.
    remove(datapath('testcorpus_serialization.mm'))

def test_random_seed(self):
    if not self.mallet_path:
        return

    # test that 2 models created with the same random_seed are equal in their topics treatment
    SEED = 10
    NUM_TOPICS = 10
    ITER = 500

    tm1 = ldamallet.LdaMallet(
        self.mallet_path,
        corpus=corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        random_seed=SEED,
        iterations=ITER,
    )
    tm2 = ldamallet.LdaMallet(
        self.mallet_path,
        corpus=corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        random_seed=SEED,
        iterations=ITER,
    )
    self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics))

    for doc in corpus:
        tm1_vector = matutils.sparse2full(tm1[doc], NUM_TOPICS)
        tm2_vector = matutils.sparse2full(tm2[doc], NUM_TOPICS)
        self.assertTrue(np.allclose(tm1_vector, tm2_vector))

def topics_hellinger(text1, text2):
    token1 = [i for i in jieba.cut(text1, cut_all=True)]
    token2 = [i for i in jieba.cut(text2, cut_all=True)]
    lda_vec1 = lda[dic.doc2bow(token1)]
    lda_vec2 = lda[dic.doc2bow(token2)]
    dense1 = matutils.sparse2full(lda_vec1, lda.num_topics)
    dense2 = matutils.sparse2full(lda_vec2, lda.num_topics)
    sim = np.sqrt(0.5 * ((np.sqrt(dense1) - np.sqrt(dense2)) ** 2).sum())
    return sim

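# Minimal usage sketch for topics_hellinger() above. It assumes the module-level `lda`
# (a trained gensim LdaModel) and `dic` (the matching Dictionary) already exist, and that
# jieba is installed; the input strings are illustrative only.
#
#     distance = topics_hellinger(u"机器学习模型训练", u"数据挖掘算法")
#     # 0.0 means identical inferred topic mixtures; the Hellinger distance is bounded by 1.0.
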
def temporal_weekday_single_ven(self, ven_id):
    """
    Splits ven_id up into weekday bins and compares the venue's overall topic
    distribution against each bin's inferred distribution.

    :param ven_id: ID of venue to split
    :type ven_id: str
    :return: 2-tuple (mean distance, standard deviation)
    :rtype: 2-tuple (float, float)
    """
    split_ven_topics = {}
    ven_weekdays = sq.split_weekdays(ven_id)

    # get inferred topic distribution for each split bin
    for iso_day in range(1, 8):
        try:
            # tokenize shouts, making a list of tokens for each split
            word_list = []
            for ven_shout in ven_weekdays[iso_day]:
                word_list.extend(tokenize(ven_shout, self.corpus_type))
            # turn list of tokens into BOW format
            bow = self.cor.dictionary.doc2bow(word_list)
            # infer topic distribution for the split, store in split_ven_topics{}
            split_ven_topics[iso_day] = self.hdp[bow]
        except KeyError:
            split_ven_topics[iso_day] = []

    ven_name = (sq.get_ven_by_id(ven_id)).name
    ven_names = [u'{} ({})'.format(ven_name, ISO2DAY[iso_day]) for iso_day in range(1, 8)] + [ven_name]

    # make np.array of Hellinger distances between venue and splits
    distances = []

    # get dense vector representation of venue
    ven_vec = self.hdp.get_document_topics(self.cor[self.ven_id2i[ven_id]])
    ven_vec_dense = matutils.sparse2full(ven_vec, self.num_topics)

    # distances[0] = venue vs. venue
    distances.append(hellinger_distance(ven_vec_dense, ven_vec_dense))

    # print nearest neighbors for venue
    print(u'{} - nearest neighbors:'.format(ven_name))
    self.print_nn(ven_vec)

    # distances[i] = venue vs. iso_day i
    for key in range(1, 8):
        split_vec = split_ven_topics[key]
        split_vec_dense = matutils.sparse2full(split_vec, self.num_topics)
        distances.append(hellinger_distance(ven_vec_dense, split_vec_dense))
        # print nearest neighbors for split
        print(u'{} ({}) nearest neighbors:'.format(ven_name, ISO2DAY[key]))
        self.print_nn(split_vec)

    # convert distances into numpy array
    distances = np.asarray(distances)
    self.vis_time_bars(distances, ven_name)

    # mean distance and SD
    dists = distances[1:]
    return np.mean(dists), np.std(dists)

def testDoc2authorMissing(self):
    # Check that the results are the same if doc2author is constructed automatically from author2doc.
    model = self.class_(
        corpus, author2doc=author2doc, doc2author=doc2author,
        id2word=dictionary, num_topics=2, random_state=0
    )
    model2 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, random_state=0)

    # Compare Jill's topics in both models.
    jill_topics = model.get_author_topics('jill')
    jill_topics2 = model2.get_author_topics('jill')
    jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
    jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)
    self.assertTrue(np.allclose(jill_topics, jill_topics2))

def testUpdate(self):
    # Check that calling update after the model already has been trained works.
    model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

    jill_topics = model.get_author_topics('jill')
    jill_topics = matutils.sparse2full(jill_topics, model.num_topics)

    model.update()
    jill_topics2 = model.get_author_topics('jill')
    jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)

    # Did we learn something?
    self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

def testPersistenceCompressed(self):
    fname = testfile() + '.gz'
    model = self.model
    model.save(fname)
    model2 = self.class_.load(fname, mmap=None)
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))

    # Compare Jill's topics before and after save/load.
    jill_topics = model.get_author_topics('jill')
    jill_topics2 = model2.get_author_topics('jill')
    jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
    jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)
    self.assertTrue(np.allclose(jill_topics, jill_topics2))

def sparse2matrix(inpath1, inpath2, topics_num, file_name):
    destpath = '/data/mallet_tests/hellinger/tmp_matrice_' + topics_num + '_' + file_name
    with open(inpath1, 'r') as comparator:
        with io.open(inpath2, 'r') as comparable:
            i = 0
            for line_tor in comparator:
                print line_tor.split()[:2]
                l_tor = line_tor.split()[2:]
                l_tor = tuple((tuple(map(int, (i.split(':')))) for i in l_tor))
                # print l_tor
                len_tor = int(topics_num.split('x')[0])
                mat_tor = mat.sparse2full(doc=l_tor, length=len_tor)
                # print mat_tor.size

                # for line_ble in comparable :
                line_ble = comparable.readline()
                print line_ble.split()[:2]
                l_ble = line_ble.split()[2:]
                l_ble = tuple((tuple(map(int, (i.split(':')))) for i in l_ble))
                # print l_ble
                len_ble = int(topics_num.split('x')[1])
                mat_ble = mat.sparse2full(doc=l_ble, length=len_ble)
                # print mat_ble.size
                # sys.exit()

                matrix = n.zeros(shape=(len_ble, len_tor))
                # print matrix
                # sys.exit()
                for k in xrange(len_tor):
                    # print 'in the k loop'
                    for j in xrange(len_ble):
                        # print 'in the j loop'
                        # matrix[j][k] = k*j
                        # print matrix
                        # sys.exit()
                        matrix[j][k] = pow(abs((math.sqrt(mat_tor[k]) - math.sqrt(mat_ble[j]))), 2)
                        print matrix[j][k]
                        # sys.exit()  # debug-only exit; kept disabled so the full matrix is computed

                with open(destpath + '_' + line_tor.split()[1] + '.txt', 'w') as matrixfile:
                    matrixfile.write(str(mat.full2sparse(matrix)))
                matrixfile.closed
                print 'word %s done' % line_ble.split()[:2]
                i += 1
    comparator.closed
    comparable.closed
    print 'matrixes done'

def testUpdateNewDataOldAuthor(self):
    # Check that calling update with new documents and/or authors after the model already has
    # been trained works.
    # Test an author that already existed in the old dataset.
    model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

    jill_topics = model.get_author_topics('jill')
    jill_topics = matutils.sparse2full(jill_topics, model.num_topics)

    model.update(corpus_new, author2doc_new)
    jill_topics2 = model.get_author_topics('jill')
    jill_topics2 = matutils.sparse2full(jill_topics2, model.num_topics)

    # Did we learn more about Jill?
    self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

def get_features(self, article):
    """
    Returns full features vector from article.
    Article should be a mongodb model.
    """
    # check if features of article are current version
    try:
        feature_version = article.features.version
    except AttributeError as e:
        if str(e) == 'features':
            logger.error("Article %s does not have any features." % article.id)
            # article seems not to exist anymore, go on
        raise

    if feature_version != self.extractor.get_version():
        clean_content = article.clean_content

        # get new features
        new_features = self.extractor.get_features(clean_content)

        # save new features
        features = Features(version=self.extractor.get_version(), data=new_features)
        article.features = features
        try:
            article.save()
        except queryset.OperationError as e:
            logger.error("Could not save article with id %s: %s" % (article.id, e))

    # sparse2full converts list of 2-tuples to numpy array
    article_features_as_full_vec = matutils.sparse2full(article.features.data, self.num_features_)

    return article_features_as_full_vec

def transform(self, author_names):
    """Infer the topic probabilities for each author.

    Parameters
    ----------
    author_names : {iterable of str, str}
        Author name or sequence of author names whose topics will be identified.

    Returns
    -------
    numpy.ndarray
        Topic distribution for each input author.

    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of arrays
    if not isinstance(author_names, list):
        author_names = [author_names]
    # returning dense representation for compatibility with sklearn
    # but we should go back to sparse representation in the future
    topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names]
    return np.reshape(np.array(topics), (len(author_names), self.num_topics))

def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None):
    """
    If `numBest` is left unspecified, similarity queries return a full list (one
    float for every document in the corpus, including the query document).

    If `numBest` is set, queries return `numBest` most similar documents, as a
    sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]
    """
    if numFeatures is None:
        logging.info("scanning corpus of %i documents to determine the number of features" % len(corpus))
        numFeatures = 1 + utils.getMaxId(corpus)

    logging.info("creating matrix for %i documents and %i features" % (len(corpus), numFeatures))
    self.numFeatures = numFeatures
    self.numBest = numBest
    self.corpus = numpy.empty(shape=(len(corpus), numFeatures), dtype=dtype, order='F')
    self.normalize = True

    # iterate over corpus, populating the numpy matrix
    for docNo, vector in enumerate(corpus):
        if docNo % 1000 == 0:
            logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
        vector = matutils.unitVec(matutils.sparse2full(vector, numFeatures))
        self.corpus[docNo] = vector

    self.corpus = numpy.asmatrix(self.corpus)

def extract_interests(self, good_lsi, b_good_id):
    interests = []
    while True:
        if len(interests) > self.MAX_INTEREST:
            print("Reached the maximum number of interest points: %d" % (self.MAX_INTEREST))
            break
        max_id = 0
        max_list = []
        for k, v in b_good_id.items():
            if len(v) > len(max_list):
                max_id = k
                max_list = copy.deepcopy(v)
        if len(max_list) < 3:
            print("Iteration finished!")
            break
        for rm_id in max_list:
            for k, v in b_good_id.items():
                if rm_id in v:
                    v.remove(rm_id)
        print("Creating interest point: %d ~ %s" % (max_id, repr(max_list)))
        # average the dense LSI vectors of the items grouped under this interest point
        full_lsi = [matutils.sparse2full(good_lsi[id], self.k_value) for id in max_list]
        full_lsi_array = np.array(full_lsi)
        interests.append(matutils.unitvec(np.average(full_lsi_array, axis=0)))
    print("Number of user interest points: %d" % (len(interests)))
    return np.array(interests)

def transform(self, docs):
    """Infer a matrix of topic distribution for the given document bow, where a_ij
    indicates (topic_i, topic_probability_j).

    Parameters
    ----------
    docs : {iterable of list of (int, number), list of (int, number)}
        Document or sequence of documents in BOW format.

    Returns
    -------
    numpy.ndarray of shape [`len(docs), num_topics`]
        Topic distribution for `docs`.

    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    if isinstance(docs[0], tuple):
        docs = [docs]

    distribution, max_num_topics = [], 0
    for doc in docs:
        topicd = self.gensim_model[doc]
        distribution.append(topicd)
        max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

    # returning dense representation for compatibility with sklearn
    # but we should go back to sparse representation in the future
    distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution]
    return np.reshape(np.array(distribution), (len(docs), max_num_topics))

def get_topics(model, corpus, by_ids=None, full=True):
    logger.info('Getting doc topic for corpus with length %d', len(corpus))
    doc_topic = list()
    corpus.metadata = True
    old_id2word = corpus.id2word
    corpus.id2word = model.id2word
    for doc, metadata in corpus:
        if by_ids is None or metadata[0] in by_ids:
            # get a vector where low topic values are zeroed out.
            topics = model[doc]
            if full:
                topics = sparse2full(topics, model.num_topics)

            # this gets the "full" vector that includes low topic values
            # topics = model.__getitem__(doc, eps=0)
            # topics = [val for id, val in topics]
            doc_topic.append((metadata, topics))
    corpus.metadata = False
    corpus.id2word = old_id2word
    logger.info('Returning doc topic of length %d', len(doc_topic))
    return doc_topic

def initialize(self, corpus, chunks=100, keepDecomposition=False, dtype=numpy.float64):
    """
    Run SVD decomposition on the corpus. This will define the latent space into
    which terms and documents will be mapped.

    The SVD is created incrementally, in blocks of `chunks` documents. In the end,
    a `self.projection` matrix is constructed that can be used to transform
    documents into the latent space. The `U, S, V` decomposition itself is
    discarded, unless `keepDecomposition` is True, in which case it is stored
    in `self.u`, `self.s` and `self.v`.

    The algorithm is adapted from:
    **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        maxId = -1
        for document in corpus:
            maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
        self.numTerms = 1 + maxId
        self.id2word = dict(zip(xrange(self.numTerms), xrange(self.numTerms)))
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    # initialize decomposition (zero documents so far)
    self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics)), dtype=dtype)
    self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)), dtype=dtype)
    #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)), dtype = dtype)
    self.v = None

    # do the actual work -- perform iterative singular value decomposition.
    # this is done by sequentially updating SVD with `chunks` new documents
    chunker = itertools.groupby(enumerate(corpus), key=lambda val: val[0] / chunks)
    for chunkNo, (key, group) in enumerate(chunker):
        # convert the chunk of sparse documents to full vectors
        docs = [matutils.sparse2full(doc, self.numTerms) for docNo, doc in group]
        # self.svdAddCols(docs, reorth = chunkNo % 100 == 99)  # reorthogonalize once in every "100*chunks" documents
        self.svdAddCols(docs, reorth=False)
        logging.info("processed documents up to #%s" % docNo)

    # calculate projection needed to get document-topic matrix from term-document matrix.
    #
    # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u^-1 * x,
    # so the projection is self.s^-1 * self.u^-1.
    #
    # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
    # we pre-multiply v * s (ie., scale axes by singular values), and return
    # that directly as the representation of `x` in LSI space.
    #
    # this conveniently simplifies to lsi[x] = self.u.T * x, so the projection is
    # just self.u.T
    #
    # note that neither `v` (the right singular vectors) nor `s` (the singular
    # values) are used at all in the transformation
    self.projection = self.u.T

    if not keepDecomposition:
        # once we have the projection stored in self, discard u*s*v decomposition to free up memory
        del self.u, self.v

def get_features(article, extractor):
    '''
    Returns full features vector from article.
    Article should be a mongodb model.
    '''
    # check if features of article are current version
    try:
        feature_version = article.features.version
    except AttributeError as e:
        if str(e) == 'features':
            logger.error("Article %s does not have any features." % article.id)
            # article seems not to exist anymore, go on
        raise

    if feature_version != extractor.get_version():
        clean_content = article.clean_content
        # get new features
        features = extractor.get_features(clean_content)
    else:
        features = article.features.data

    # sparse2full converts list of 2-tuples to numpy array
    article_features_as_full_vec = matutils.sparse2full(features, extractor.get_feature_number())

    return article_features_as_full_vec

def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunks=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified).

    See `Similarity` class for description of the other parameters.
    """
    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunks = chunks

    if corpus is not None:
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            self.index[docno] = matutils.unitvec(matutils.sparse2full(vector, num_features))

def transform(self, docs):
    """Infer the topic distribution for `docs`.

    Parameters
    ----------
    docs : {iterable of list of (int, number), list of (int, number)}
        Document or sequence of documents in BoW format.

    Returns
    -------
    numpy.ndarray of shape [`len(docs)`, `num_topics`]
        The topic distribution for each input document.

    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    if isinstance(docs[0], tuple):
        docs = [docs]
    # returning dense representation for compatibility with sklearn
    # but we should go back to sparse representation in the future
    distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
    return np.reshape(np.array(distribution), (len(docs), self.num_topics))

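# Hedged usage sketch for the sklearn-style transform() above (the wrapper instance and
# input data are hypothetical; the only requirement is that fit() has been called so that
# gensim_model exists):
#
#     wrapper.fit(bow_corpus)
#     dense = wrapper.transform([(0, 1), (3, 2)])   # a single BoW document is auto-wrapped in a list
#     dense.shape                                   # -> (1, wrapper.num_topics)
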
def transform(self, docs):
    """
    Takes a list of documents as input ('docs').
    Returns a matrix of topic distribution for the given document bow, where a_ij
    indicates (topic_i, topic_probability_j).
    The input `docs` should be in BOW format and can be a list of documents like
    [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
    or a single document like : [(4, 1), (7, 1)]
    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    if isinstance(docs[0], tuple):
        docs = [docs]

    distribution, max_num_topics = [], 0
    for doc in docs:
        topicd = self.gensim_model[doc]
        distribution.append(topicd)
        max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

    # returning dense representation for compatibility with sklearn
    # but we should go back to sparse representation in the future
    distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution]
    return np.reshape(np.array(distribution), (len(docs), max_num_topics))

def testTransform(self):
    passed = False
    # sometimes, LDA training gets stuck at a local minimum
    # in that case try re-training the model from scratch, hoping for a
    # better random initialization
    for i in range(25):  # restart at most 25 times
        # create the transformation model
        model = self.class_(id2word=dictionary, num_topics=2, passes=100)
        model.update(self.corpus)

        # transform one document
        doc = list(corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
        expected = [0.13, 0.87]
        # must contain the same values, up to re-ordering
        passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-1)
        if passed:
            break
        logging.warning(
            "LDA failed to converge on attempt %i (got %s, expected %s)" %
            (i, sorted(vec), sorted(expected))
        )
    self.assertTrue(passed)

def __getitem__(self, bow, scaled=False):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    Note that this function returns the latent space representation **scaled by the
    singular values**. To return non-scaled embedding, set `scaled` to False.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    vec = matutils.sparse2full(bow, self.numTerms).astype(self.projection.u.dtype)
    vec.shape = (self.numTerms, 1)
    assert self.projection.u.flags.f_contiguous
    dgemv = matutils.blas('gemv', self.projection.u)
    topicDist = dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x

    nnz = topicDist.nonzero()[0]
    return zip(nnz, topicDist[nnz])

def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete (<); load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))

def testTransform(self):
    passed = False
    # sometimes, training gets stuck at a local minimum
    # in that case try re-training the model from scratch, hoping for a
    # better random initialization
    for i in range(25):  # restart at most 25 times
        # create the transformation model
        model = self.class_(id2word=dictionary, num_topics=2, passes=100, random_state=0)
        model.update(corpus, author2doc)

        jill_topics = model.get_author_topics('jill')

        # NOTE: this test may easily fail if the author-topic model is altered in any way. The model's
        # output is sensitive to a lot of things, like the scheduling of the updates, or like the
        # author2id (because the random initialization changes when author2id changes). If it does
        # fail, simply be aware of whether we broke something, or if it just naturally changed the
        # output of the model slightly.

        vec = matutils.sparse2full(jill_topics, 2)  # convert to dense vector, for easier equality tests
        expected = [0.91, 0.08]
        # must contain the same values, up to re-ordering
        passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1)
        if passed:
            break
        logging.warning(
            "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
            i, sorted(vec), sorted(expected)
        )
    self.assertTrue(passed)

def generator():
    for document in documents:
        vec = self.tfidf[self.lexicon.doc2bow(document)]
        if self.tofull:
            # sparse2full needs the dense vector length; the lexicon size gives one slot per vocabulary term
            yield sparse2full(vec, len(self.lexicon))
        else:
            yield vec

def __getitem__(self, bow, eps=1e-12):
    """
    Return esa representation of the input vector and/or corpus.

    bow should already be weights, e.g. with TF-IDF
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    # as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # use corpus as interpreter matrix
    # simply multiply feature vector of input with corpus matrix
    # to get the weight of the concept
    vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

    # normalize
    vector = matutils.unitvec(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(concept_id, weight) for concept_id, weight in enumerate(vector) if abs(weight) > eps]
    return vector

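# Hedged usage sketch for the ESA __getitem__ above (model and variable names are
# assumptions, not from the original): every document of the interpreter corpus acts as a
# "concept", so the output has at most one weight per indexed document.
#
#     concept_vec = esa_model[tfidf_model[dictionary.doc2bow(tokens)]]
#     # -> [(concept_id, weight), ...], unit-normalized, entries with |weight| <= eps dropped
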
def testTransform(self):
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, numTopics=2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign

def search(self, query):
    query_repr = read_ap.process_text(query)
    vec_query = self.corpus.dictionary.doc2bow(query_repr)
    lda_query = sparse2full(self.model[vec_query], self.num_topics)

    results = defaultdict(float)
    for doc_id, lda_doc_repr in zip(self.corpus.doc_ids, self.lda_corpus_pers):
        results[doc_id] = kullback_leibler(lda_query, lda_doc_repr)

    results = {
        k: v for k, v in sorted(
            results.items(), key=lambda item: item[1], reverse=True)
    }
    return list(results.items())

def testTransform(self):
    # create the transformation model
    # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results)
    numpy.random.seed(13)
    model = rpmodel.RpModel(self.corpus, num_topics=2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = numpy.array([-0.70710677, 0.70710677])
    self.assertTrue(numpy.allclose(vec, expected))  # transformed entries must be equal up to sign

def __init__(self, dictionary=None, corpus=None, index_file=None, max_docs=None, **kwargs):
    Corpus.__init__(self, dictionary=dictionary, corpus=corpus)
    self.clip_corpus(max_docs)

    # Set up for KNN
    features = len(self.dictionary)
    self.index = AnnoyIndex(features)

    start_time = datetime.datetime.now()
    if not index_file:
        self.transform_corpus(models.TfidfModel)
        for i, vector in enumerate(self):
            self.index.add_item(i, list(sparse2full(vector, features).astype(float)))
        self.index.build(self.no_trees)
    else:
        self.index.load(index_file)
    end_time = datetime.datetime.now()

    self.train_time = end_time - start_time
    return

def testTransform(self):
    # create the transformation model
    passed = False
    numpy.random.seed(13)
    for i in xrange(10):  # lda is randomized, so allow 10 iterations to test for equality
        model = ldamodel.LdaModel(self.corpus, numTopics=2)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

        expected = [0.0, 1.0]
        passed = passed or numpy.allclose(sorted(vec), sorted(expected))  # must contain the same values, up to re-ordering
        if passed:
            break
    self.assertTrue(passed, "Error in randomized LDA test")

def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
    vec = numpy.asfortranarray(vec, dtype=numpy.float32)
    topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
    return [
        (topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
        if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)
    ]

def test_transform_float32(self):
    """Test lsi[vector] transformation."""
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, num_topics=2, dtype=np.float32)

    # make sure the decomposition is accurate enough
    u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
    self.assertTrue(np.allclose(s[:2], model.projection.s))  # singular values must match
    self.assertEqual(model.projection.u.dtype, np.float32)
    self.assertEqual(model.projection.s.dtype, np.float32)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
    expected = np.array([-0.6594664, 0.142115444])  # scaled LSI version

    # transformed entries must be equal up to sign
    self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1.e-5))

def testSparseTransform(self):
    if not self.mallet_path:
        return
    passed = False
    for i in range(5):  # restart at most 5 times
        # create the sparse transformation model with the appropriate topic_threshold
        model = ldamallet.LdaMallet(
            self.mallet_path, corpus, id2word=dictionary, num_topics=2,
            iterations=200, topic_threshold=0.5
        )

        # transform one document
        doc = list(corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

        expected = [1.0, 0.0]
        passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2)  # must contain the same values, up to re-ordering
        if passed:
            break
        logging.warning(
            "LDA failed to converge on attempt %i (got %s, expected %s)" %
            (i, sorted(vec), sorted(expected))
        )
    self.assertTrue(passed)

def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified).

    See `Similarity` class for description of the other parameters.
    """
    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is not None:
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents may in fact be in numpy or scipy.sparse format as well.
            # it's not documented because it's not fully supported throughout.
            # the user better know what he's doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector

def plot_author_clustering_interia(self, max_cluster=100, min_cluster=3):
    nips = self.model
    author_vecs = self.author_vecs
    X = [matutils.sparse2full(author, nips.num_topics) for author in author_vecs]

    inertianew = []
    scaler = StandardScaler()
    scaler.fit(X)
    X_new = scaler.transform(X)
    for i in range(min_cluster, max_cluster):
        print('\nCreating K means clusters with clusters=%d' % i)
        kmeans = KMeans(n_clusters=i, random_state=0).fit(X_new)
        inertianew.append(kmeans.inertia_)

    plt.plot(list(range(min_cluster, max_cluster)), inertianew)
    plt.show()

def run_evaluation(classifiers, models, eval_samples):
    ln.info("Beginning evaluation")
    classifications = dict()
    for modelname, classifier in classifiers.items():
        model = models[modelname]
        model_classifications = defaultdict(int)
        for sample_no, (eval_sample_text, actual_label) in enumerate(eval_samples):
            bow = dictionary.doc2bow(simple_preprocess(eval_sample_text))
            model_features = sparse2full(model[bow], model.__out_size)
            predicted_label = classifier.predict(model_features)[0]

            model_classifications[(actual_label, predicted_label)] += 1
            if sample_no % 500 == 0:
                ln.debug("Classifier for %s evaluated %s samples so far." % (modelname, sample_no))
        classifications[modelname] = model_classifications

    ln.info("Finished evaluation")
    return classifications

def get_docs_emb_trained(docs: List[str], nlp: en.English, pattern_name: str,
                         model_dir: str) -> List[str]:
    """
    Gets list of GloVe vectors for each string in docs using TF-IDF model.
    Loads previously stored doc dictionary and TF-IDF model to determine BOW
    representation of corpus.

    Args:
        docs: list of strings to get embeddings for.
        nlp: pretrained spacy model to use for embedding vectors.
        pattern_name: regex pattern name that generates list of docs (only to be
            used for naming purposes of dictionary/model).
        model_dir: model directory to load dictionary and TF-IDF model from.

    Returns:
        List of GloVe embedded vectors. Has shape (len(docs), 300)
    """
    loaded_dict = Dictionary.load(model_dir + pattern_name + '_dict.dict')
    model = pickle.load(
        open(model_dir + pattern_name + '_tfidf_model.sav', 'rb'))
    corpus = [loaded_dict.doc2bow(doc) for doc in docs]
    if corpus:
        docs_tfidf = model[corpus]
        # extracts vector representation for each document from bag of words
        docs_vecs = np.vstack(
            [sparse2full(c, len(loaded_dict)) for c in docs_tfidf])
        # extracts nlp vector using pretrained model for each term in dictionary
        tfidf_emb_vecs = np.vstack(
            [nlp(loaded_dict[i]).vector for i in range(len(loaded_dict))])
        # gets nlp vector embedding of each doc
        docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)
        return docs_emb
    # if corpus is empty according to dictionary return []
    return []

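# Hedged usage sketch for get_docs_emb_trained() above. Each element of `docs` is passed
# straight to Dictionary.doc2bow(), so pre-tokenized documents (lists of tokens) are the
# safest input despite the List[str] annotation. A blank en.English() pipeline carries no
# word vectors, so a vectors-enabled spacy model is assumed; file names follow the
# '<pattern_name>_dict.dict' / '<pattern_name>_tfidf_model.sav' convention used by the loader.
#
#     nlp = spacy.load("en_core_web_md")
#     emb = get_docs_emb_trained([["reset", "password"], ["pay", "invoice"]],
#                                nlp, "billing", "./models/")
#     # emb: one 300-dimensional row per input document, or [] when `docs` is empty
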
def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None):
    """
    If `numBest` is left unspecified, similarity queries return a full list (one
    float for every document in the corpus, including the query document).

    If `numBest` is set, queries return `numBest` most similar documents, as a
    sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]
    """
    if numFeatures is None:
        logging.info("scanning corpus of %i documents to determine the number of features" % len(corpus))
        numFeatures = 1 + utils.getMaxId(corpus)

    logging.info("creating matrix for %i documents and %i features" % (len(corpus), numFeatures))
    self.numFeatures = numFeatures
    self.numBest = numBest
    self.corpus = numpy.empty(shape=(len(corpus), numFeatures), dtype=dtype)
    self.normalize = True

    if corpus is not None:
        # iterate over corpus, populating the numpy matrix
        for docNo, vector in enumerate(corpus):
            if docNo % 1000 == 0:
                logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
            vector = matutils.unitVec(matutils.sparse2full(vector, numFeatures))
            self.corpus[docNo] = vector

        self.corpus = numpy.asmatrix(self.corpus)

def print_top_topics_of_year(self, year, sp=True):
    model = self.model
    doc_vecs = self.doc_vecs
    topic_labels = self.topic_labels
    df = self.df_papers

    sparse_vecs = np.array(
        [matutils.sparse2full(vec, model.num_topics) for vec in doc_vecs])
    #tsne = TSNE(random_state=3211)
    #tsne_embedding = tsne.fit_transform(sparse_vecs)
    #years = df['year'].values
    #df_dist = pd.DataFrame({'year': years, 'Topic_Distribution': sparse_vecs})
    df_sp = pd.DataFrame(sparse_vecs)
    df_dist = df_sp[df['year'] == year]
    top_topic = df_dist.sum().idxmax()
    value = df_dist.sum()[top_topic]
    print('Top topic: %s' % topic_labels[top_topic])
    print('Value: %f' % value)
    print('\n Following are the top words in the topic')
    print(self.model.show_topic(top_topic))
    #data = df[df['year'] <= year]
    # ax = df_dist.sum().plot(kind='bar')
    # ax.set_xticklabels(topic_labels, rotation=90)
    # plt.title('Topic Score for the year:%d' % year)
    # plt.show()

    df = pd.DataFrame({
        'Topic': topic_labels,
        'TopicDistribution': df_dist.sum().values
    })
    output_file("toptpic.html")
    p = Bar(df, 'Topic', values='TopicDistribution',
            title="Bar Plot of Topic Distributions for the year %d" % year,
            color='green')
    if sp:
        show(p)
    return df_dist

def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    if getattr(self, 'freshly_loaded', False):
        # This is a hack to work around a bug in numpy, where a FORTRAN-order array
        # unpickled from disk segfaults on using it.
        self.freshly_loaded = False
        self.projection = self.projection.copy('F')  # simply making a fresh copy fixes the broken array

    vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
    vec = numpy.asfortranarray(vec, dtype=numpy.float32)
    topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
    return [
        (topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
        if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)
    ]

def get_user_latent_vector(user_action, path):
    """
    get user latent vector
    Args:
        user_action: user action
        path: lda model path
    Return:
        user_latent_vector: {userid: np.ndarray([v1, v2, v3])}
    """
    texts = [x[2] for x in user_action]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    if not os.path.exists(path):
        lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=F, id2word=dictionary)
        lda.save(path)
    else:
        lda = models.ldamodel.LdaModel.load(path)
    topics = lda.get_document_topics(corpus)
    user_latent_vector = {user_action[i][0]: sparse2full(topics[i], lda.num_topics) for i in range(len(texts))}
    return user_latent_vector

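# Hedged usage sketch for get_user_latent_vector() above (the module-level topic count `F`
# is assumed to be defined elsewhere; the action tuples below are illustrative placeholders):
#
#     user_action = [
#         ("user_1", None, ["sports", "football", "match"]),
#         ("user_2", None, ["politics", "election", "vote"]),
#     ]
#     vectors = get_user_latent_vector(user_action, "./lda.model")
#     # vectors["user_1"] is a dense numpy array of length lda.num_topics (== F)
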
def __getitem__(self, bow, scaled=True):
    """
    Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    Note that this function returns the latent space representation **scaled by the
    singular values**. To return non-scaled embedding, set `scaled` to False.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    if utils.isCorpus(bow):
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.numTerms)
    vec.shape = (self.numTerms, 1)
    assert vec.dtype == numpy.float32 and self.projection.dtype == numpy.float32
    topicDist = self.projection * vec
    if not scaled:
        topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
    return [
        (topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
        if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)
    ]

def __getitem__(self, bow, scaled=False):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    Note that this function returns the latent space representation **scaled by the
    singular values**. To return non-scaled embedding, set `scaled` to False.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    if utils.isCorpus(bow):
        return self._apply(bow)

    assert self.projection.u is not None, "decomposition not initialized yet"
    vec = numpy.asfortranarray(matutils.sparse2full(bow, self.numTerms), dtype=self.projection.u.dtype)
    vec.shape = (self.numTerms, 1)
    topicDist = scipy.linalg.fblas.dgemv(1.0, self.projection.u, vec, trans=True)  # u^T * x
    if scaled:
        topicDist = (1.0 / self.projection.s) * topicDist  # s^-1 * u^T * x
    return [
        (topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
        if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)
    ]

def analyze(self, sentence, nlp):
    parsed_sentence = tokenize(nlp(sentence))
    #sent_corp = [self.text_dict.doc2bow(parsed_sentence)]
    sent_corp = [
        get_sent_bow(parsed_sentence, self.text_dict, nlp, preload=self.preload)
    ]
    sent_tfidf = self.tfidf[sent_corp]
    sent_vec = np.vstack(
        [sparse2full(c, len(self.text_dict)) for c in sent_tfidf])
    glove_vec = np.dot(sent_vec, self.tf_glove)
    try:
        red_glove = self.pca.transform(glove_vec)
    except ValueError:
        return glove_vec, sent_corp
    prob = self.classifier.predict_proba(red_glove)
    val = get_classification_values(red_glove, self.classifier)[0]
    return val

def testTransformSerialized(self):
    # Same as testTransform, using serialized corpora.
    passed = False
    # sometimes, training gets stuck at a local minimum
    # in that case try re-training the model from scratch, hoping for a
    # better random initialization
    for i in range(25):  # restart at most 25 times
        # create the transformation model
        model = self.class_(
            id2word=dictionary, num_topics=2, passes=100, random_state=0,
            serialized=True, serialization_path=datapath('testcorpus_serialization.mm')
        )
        model.update(self.corpus, author2doc)

        jill_topics = model.get_author_topics('jill')

        # NOTE: this test may easily fail if the author-topic model is altered in any way. The model's
        # output is sensitive to a lot of things, like the scheduling of the updates, or like the
        # author2id (because the random initialization changes when author2id changes). If it does
        # fail, simply be aware of whether we broke something, or if it just naturally changed the
        # output of the model slightly.

        vec = matutils.sparse2full(jill_topics, 2)  # convert to dense vector, for easier equality tests
        expected = [0.91, 0.08]
        # must contain the same values, up to re-ordering
        passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1)

        # Delete the MmCorpus used for serialization inside the author-topic model.
        remove(datapath('testcorpus_serialization.mm'))
        if passed:
            break
        logging.warning(
            "Author-topic model failed to converge on attempt %i (got %s, expected %s)" %
            (i, sorted(vec), sorted(expected))
        )
    self.assertTrue(passed)

def testTransform(self):
    passed = False
    # sometimes, LDA training gets stuck at a local minimum
    # in that case try re-training the model from scratch, hoping for a
    # better random initialization
    for i in range(5):  # restart at most 5 times
        # create the transformation model
        model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
        model.update(corpus)

        # transform one document
        doc = list(corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

        expected = [0.13, 0.87]
        passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2)  # must contain the same values, up to re-ordering
        if passed:
            break
        logging.warning(
            "LDA failed to converge on attempt %i (got %s, expected %s)" %
            (i, sorted(vec), sorted(expected))
        )
    self.assertTrue(passed)

def transform(self, docs):
    """
    Takes a list of documents as input ('docs').
    Returns a matrix of topic distribution for the given document bow, where a_ij
    indicates (topic_i, topic_probability_j).
    The input `docs` should be in BOW format and can be a list of documents like
    [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
    or a single document like : [(4, 1), (7, 1)]
    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    if isinstance(docs[0], tuple):
        docs = [docs]
    # returning dense representation for compatibility with sklearn
    # but we should go back to sparse representation in the future
    distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
    return np.reshape(np.array(distribution), (len(docs), self.num_topics))

def transform(self, docs):
    """
    Takes a list of documents as input ('docs').
    Returns a matrix of topic distribution for the given document bow, where a_ij
    indicates (topic_i, topic_probability_j).
    The input `docs` should be in BOW format and can be a list of documents like :
    [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ]
    or a single document like : [(4, 1), (7, 1)]
    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    check = lambda x: [x] if isinstance(x[0], tuple) else x
    docs = check(docs)
    X = [[] for _ in range(0, len(docs))]

    for k, v in enumerate(docs):
        doc_topics = self.gensim_model[v]
        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
        X[k] = probs_docs
    return np.reshape(np.array(X), (len(docs), self.num_topics))

def show_author_by_id(self, author):
    model = self.model
    topic_labels = self.topic_labels
    print('\n%s' % author)
    #print('Docs:', model.author2doc[author])
    print('Topics:')
    print([(topic_labels[topic[0]], topic[1]) for topic in model[author]])

    dist = matutils.sparse2full(model[author], model.num_topics)
    df = pd.DataFrame({'Topic': topic_labels, 'Score': dist})
    #plt.plot(dist)
    # ax = df['Score'].plot(kind='bar')
    # ax.set_xticklabels(topic_labels, rotation=90)
    # output_file("AuthorTopicDistribution.html")
    # plt.show()
    p = Bar(df, 'Topic', values='Score',
            title="Bar Plot of Topic Distributions of %s" % self.get_author_name_from_id(author))
    show(p)
    print(self.get_author_name_from_id(author))

def testTransform(self):
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, numTopics=2)

    # make sure the decomposition is accurate enough
    u, s, vt = numpy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.numTerms), full_matrices=False)
    self.assertTrue(numpy.allclose(s[:2], model.projection.s))  # singular values must match

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign

def transform(self, author_names):
    """
    Return topic distribution for input authors as a list of
    (topic_id, topic_probability) 2-tuples.
    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    check = lambda x: [x] if not isinstance(x, list) else x
    author_names = check(author_names)
    X = [[] for _ in range(0, len(author_names))]

    for k, v in enumerate(author_names):
        transformed_author = self.gensim_model[v]
        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        probs_author = matutils.sparse2full(transformed_author, self.num_topics)
        X[k] = probs_author
    return np.reshape(np.array(X), (len(author_names), self.num_topics))

def get_sims(model, query, corpus_full, dictionary, n_topics):
    ''' get ranking for single query '''
    # avoid division by 0
    eps = 1e-8

    # process query
    query_processed = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(query_processed)
    q_lda = sparse2full(model[query_bow], n_topics)
    q_lda += eps

    sims = []
    # loop over all docs
    for i, doc in enumerate(corpus_full):
        doc += eps
        sim = -1 * kullback_leibler(q_lda, doc)
        sims.append(sim)

    sim_ordered = sorted(enumerate(sims), key=lambda item: -1 * item[1])
    return sim_ordered

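# Hedged usage sketch for get_sims() above. It assumes `corpus_full` already holds the dense
# (sparse2full-converted) LDA vectors of the indexed documents, in the order the returned
# indices should refer to; the model, query, and topic count below are illustrative.
#
#     ranking = get_sims(lda_model, "oil price increase", corpus_full, dictionary, 500)
#     # ranking[0] == (doc_index, score) of the best match: scores are negated KL divergences,
#     # so larger values mean the document's topic mixture is closer to the query's.
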
def __get_item__(self, val, rows=True, sparse=True, use_ids=None):
    if not sparse and use_ids != True:
        raise Exception("Cannot use original values if format is not sparse")

    size = -1
    if rows:
        dct = self.row_tallies[val]
        lookup = lambda id: self.row_id_gen.get_key(id)
        size = self.row_count()
    else:
        dct = self.col_tallies[val]
        lookup = lambda id: self.col_id_gen.get_key(id)
        size = self.col_count()

    sitems = sorted(dct.items(), key=lambda (k, v): k)
    if not sparse:
        return matutils.sparse2full(sitems, size)
    if use_ids:
        return sitems
    else:
        return [(lookup(k), v) for k, v in sitems]

def getSimilarities(self, doc):
    """
    Return similarity of sparse vector `doc` to all documents in the corpus.

    `doc` may be either a bag-of-words iterable (standard corpus document),
    or a numpy array, or a `scipy.sparse` matrix.
    """
    if scipy.sparse.issparse(doc):
        vec = doc.toarray().flatten()
    elif isinstance(doc, numpy.ndarray):
        vec = doc
    else:
        vec = matutils.sparse2full(doc, self.numFeatures)
    vec = numpy.asfortranarray(vec, dtype=self.corpus.dtype).reshape(self.numFeatures, 1)

    # compute cosine similarity against every other document in the collection
    gemv = matutils.blas('gemv', self.corpus)
    allSims = gemv(1.0, self.corpus, vec)  # N x T * T x 1 = N x 1
    allSims = list(allSims.flat)  # convert to plain python list
    assert len(allSims) == self.corpus.shape[0]  # make sure no document got lost!
    return allSims

def transform(self, docs):
    """Infer a matrix of topic distribution for the given document bow, where a_ij
    indicates (topic_i, topic_probability_j).

    Parameters
    ----------
    docs : {iterable of list of (int, number), list of (int, number)}
        Document or sequence of documents in BOW format.

    Returns
    -------
    numpy.ndarray of shape [`len(docs), num_topics`]
        Topic distribution for `docs`.

    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # The input as array of array
    if isinstance(docs[0], tuple):
        docs = [docs]

    distribution, max_num_topics = [], 0
    for doc in docs:
        topicd = self.gensim_model[doc]
        distribution.append(topicd)
        max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

    # returning dense representation for compatibility with sklearn
    # but we should go back to sparse representation in the future
    distribution = [
        matutils.sparse2full(t, max_num_topics) for t in distribution
    ]
    return np.reshape(np.array(distribution), (len(docs), max_num_topics))