def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001, n_iter=50, fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "train completely"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        X_test.append(obj.tokens)
        X_label.append(cls_ids[obj.classify])
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print("f1-score: %0.3f" % score)
def evaluate_improved_cllsi(x_train1_in, x_test1_in, x_train2_in, x_test2_in, dimensions, evaluation_function):
    scores = []
    for k in dimensions:
        x_train1, x_test1 = tfidf(data=(x_train1_in, x_test1_in))
        x_train2, x_test2 = tfidf(data=(x_train2_in, x_test2_in))
        n_train, n_test = len(x_train1), len(x_test1)

        X1 = matutils.corpus2csc(list(x_train1) + list(x_test1))
        X2 = matutils.corpus2csc(list(x_train2) + list(x_test2))
        x_train1, x_train2 = X1[:, :n_train], X2[:, :n_train]
        x_test1, x_test2 = X1[:, n_train:], X2[:, n_train:]

        x = sp.sparse.vstack([x_train1, x_train2])
        x = matutils.Sparse2Corpus(x)
        lsa = models.LsiModel(x, num_topics=k)

        n = x_train1.shape[0]
        U = lsa.projection.u
        U1, U2 = U[:n, :], U[n:, :]
        p1, p2 = sp.sparse.csr_matrix(np.linalg.pinv(U1)), sp.sparse.csr_matrix(np.linalg.pinv(U2))
        a1, a2 = np.dot(x_test1.T, p1.T).todense(), np.dot(x_test2.T, p2.T).todense()

        score = evaluation_function(a1, a2)
        scores.append(score)
    return scores
def get_similarities(self, query):
    """
    Return similarity of sparse vector `query` to all documents in the corpus,
    as a numpy array.

    If `query` is a collection of documents, return a 2D array of similarities
    of each document in `query` to all documents in the corpus (=batch query,
    faster than processing each document in turn).

    **Do not use this function directly; use the self[query] syntax instead.**
    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.T  # convert documents=rows to documents=columns
        elif isinstance(query, numpy.ndarray):
            if query.ndim == 1:
                query.shape = (1, len(query))
            query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
        else:
            # default case: query is a single vector, in sparse gensim format
            query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

    # compute cosine similarity against every other document in the collection
    result = self.index * query.tocsc()  # N x T * T x C = N x C
    if result.shape[1] == 1:
        # for queries of one document, return a 1d array
        result = result.toarray().flatten()
    else:
        # otherwise, return a 2d matrix (#queries x #index)
        result = result.toarray().T
    return result
def load_data():
    df = pandas.read_csv('../data/train_clean2.csv')
    df1 = df[['qid1', 'question1']]
    df1.columns = ['qid', 'question']
    df2 = df[['qid2', 'question2']]
    df2.columns = ['qid', 'question']
    df_que = pandas.concat([df1, df2], ignore_index=True)
    df_que = df_que.drop_duplicates().fillna('').sort_values('qid')
    logger.info('df_que {}'.format(df_que.shape))
    train_num = df_que.shape[0]

    df = pandas.read_csv('../data/test_clean2.csv')
    df1 = df[['question1']]
    df1.columns = ['question']
    df2 = df[['question2']]
    df2.columns = ['question']
    df_que2 = pandas.concat([df1, df2], ignore_index=True)
    df_que2 = df_que2.drop_duplicates().fillna('')
    logger.info('df_que2 {}'.format(df_que2.shape))
    df_que2['qid'] = numpy.arange(df_que2.shape[0]) + df_que.shape[0]
    df_que = pandas.concat([df_que, df_que2], ignore_index=True)

    sentences = corpus_to_sentences(df_que['question'])
    logger.info('dict')
    dictionary = corpora.Dictionary(sentences)
    dictionary.save('./gensim.dict')
    dictionary.filter_extremes(no_below=2, no_above=1., keep_n=2000000)

    p = Pool()
    id_corpus = p.map(dictionary.doc2bow, sentences)
    p.close()
    p.join()

    with open('count_corpus_brown.pkl', 'wb') as f:
        pickle.dump(id_corpus, f, -1)
    count_mat = corpus2csc(id_corpus).T
    logger.info('count_mat {}'.format(count_mat.shape))
    with open('count_mat_brown.pkl', 'wb') as f:
        pickle.dump(count_mat, f, -1)

    tfidf_instance = models.TfidfModel(id_corpus, normalize=False)
    tfidf_corpus = tfidf_instance[id_corpus]
    tfidf_mat = corpus2csc(tfidf_corpus).T
    logger.info('tfidf_mat {}'.format(tfidf_mat.shape))
    with open('tfidf_mat_brown.pkl', 'wb') as f:
        pickle.dump(tfidf_mat, f, -1)

    logger.info('df_que {}'.format(df_que.shape))
    logger.info('end load')
    return 0
def get_tfidf_scores(kwargs):
    tfidf_transformer = TfIdfTransformer(**kwargs).fit(train_corpus)

    X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T
    X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T

    clf = LogisticRegression().fit(X_train_tfidf, y_train)

    model_accuracy = clf.score(X_test_tfidf, y_test)
    doc_scores = clf.decision_function(X_test_tfidf)

    return model_accuracy, doc_scores
def sim_rank(self, query, doc_corpus, query_norm=True, doc_norm=True, res_sort=True):
    assert isinstance(doc_corpus, (tuple, list)) and len(doc_corpus) > 0
    if not isinstance(doc_corpus[0], (tuple, list)):
        doc_corpus = [doc_corpus]
    corpus_vec = [self.text2vec(c, norm=doc_norm) for c in doc_corpus]
    doc_index = matutils.corpus2csc(corpus_vec, num_terms=self.dict_size, dtype=np.float32).T.tocsr()
    query_vec = self.text2vec(query, norm=query_norm)
    query_index = matutils.corpus2csc([query_vec], num_terms=self.dict_size, dtype=doc_index.dtype).tocsr()
    sim_array = doc_index * query_index
    sims = sim_array.toarray().T[0].tolist()
    # sims = doc_index[query_vec]
    return sorted(list(enumerate(sims)), key=lambda p: p[1], reverse=True) if res_sort else sims
def get_similarities(self, query):
    """Get similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
        Document or collection of documents.

    Return
    ------
    :class:`numpy.ndarray`
        Similarity matrix (if maintain_sparsity=False) **OR**
    :class:`scipy.sparse.csc`
        otherwise

    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.T  # convert documents=rows to documents=columns
        elif isinstance(query, numpy.ndarray):
            if query.ndim == 1:
                query.shape = (1, len(query))
            query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
        else:
            # default case: query is a single vector, in sparse gensim format
            query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

    # compute cosine similarity against every other document in the collection
    result = self.index * query.tocsc()  # N x T * T x C = N x C
    if result.shape[1] == 1 and not is_corpus:
        # for queries of one document, return a 1d array
        result = result.toarray().flatten()
    elif self.maintain_sparsity:
        # avoid converting to dense array if maintaining sparsity
        result = result.T
    else:
        # otherwise, return a 2d matrix (#queries x #index)
        result = result.toarray().T
    return result
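# A minimal, hypothetical usage sketch for the method above: queries are normally issued through
# the `index[query]` syntax rather than by calling get_similarities() directly. The toy texts,
# dictionary and query below are made up for illustration.
from gensim import corpora, similarities

texts = [["sparse", "matrix", "index"], ["cosine", "similarity", "query"], ["sparse", "query"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

index = similarities.SparseMatrixSimilarity(bow_corpus, num_features=len(dictionary))
query_bow = dictionary.doc2bow(["sparse", "query"])
sims = index[query_bow]  # 1d numpy array of cosine similarities against every indexed document
print(list(enumerate(sims)))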
def Kmeans(n_clusters, args):
    corpus = corpora.MmCorpus(args[1])
    tfidf = models.tfidfmodel.TfidfModel.load(args[2])
    logging.info('finished loading corpus and tfidf models')

    # load the dictionary and corpus
    dictionary = corpora.Dictionary.load(args[0])
    tfidfValue = tfidf[corpus]
    logging.info('finished tfidfValue = tfidf[corpus]')

    fitdata = matutils.corpus2csc(tfidfValue, num_terms=len(dictionary), dtype='float32',
                                  num_docs=len(tfidfValue), num_nnz=None, printprogress=0).transpose()
    logging.info('finished transfer tfidf vector to sparse vector and transpose sparse vector')

    logging.info('begin kmeans fit')
    # TODO refactor parameters
    km = KMeans(n_clusters=n_clusters, init='random', max_iter=100, n_init=5, verbose=1)
    km.fit(fitdata)
    logging.info('finished kmeans fit')

    # l = open('./TFIDF_KmeansTrend.txt', 'w')
    # for label in km.labels_:
    #     l.write(str(label) + '\n')
    # l.close()
    return km.labels_
def get_labelled_csc_corpus():
    lsi_300_corpus = gensim.corpora.MmCorpus('../content-preprocessor/lsi_300_corpus.mm')
    labelled_df, labelled_indices, y_matrix = get_labelled_stories()
    labelled_lsi_corpus = lsi_300_corpus[labelled_indices]
    labelled_lsi_csc_corpus = corpus2csc(labelled_lsi_corpus)
    return labelled_lsi_csc_corpus, y_matrix
def reduce_nlp_data(vectorizer, data, n_components, reducer):
    transformed_data = vectorizer.fit_transform(data)
    id2word = {identifier: word for word, identifier in vectorizer.vocabulary_.items()}
    if reducer == 'lda':
        corpus = matutils.Sparse2Corpus(transformed_data.transpose())
        lda = models.LdaModel(corpus=corpus, num_topics=n_components, minimum_probability=0.03,
                              id2word=id2word, passes=10, random_state=42)
        print(lda.print_topics())
        lda_corpus = lda[corpus]
        return lda, matutils.corpus2csc(lda_corpus).toarray().transpose()
    elif reducer == 'svd':
        SVD = TruncatedSVD(n_components, n_iter=10, random_state=42)
        svd_data = SVD.fit_transform(transformed_data)
        get_eigenvectors(SVD, id2word)
        return SVD, svd_data
    elif reducer == 'nmf':
        nmf = NMF(n_components, random_state=42)
        nmf_data = nmf.fit_transform(transformed_data)
        get_eigenvectors(nmf, id2word)
        return nmf, nmf_data
    else:
        return None, None
def fit(self, corpus):
    self._verify_corpus(corpus)
    self.N = len(corpus)

    tokens = self.preprocessor.transform(corpus)
    self.observed_tokens = tokens.apply(len).sum()

    vocab = Dictionary(tokens)
    vocab.filter_extremes(no_above=self.max_df, no_below=self.min_df, keep_n=self.vocab_size)
    vocab.compactify()
    self.vocab = vocab

    self.corpus_as_tokens = tokens
    self.corpus_as_bow = [self.vocab.doc2bow(doc) for doc in tokens]
    self.corpus_as_csr = corpus2csc(self.corpus_as_bow, num_terms=len(self.vocab)).T

    self.lengths = [len(d) for d in self.corpus_as_bow]
    self.num_empty_docs = self.lengths.count(0)

    time_now = time.localtime()
    self.created_on = time.strftime("%d %b %Y %H:%M:%S", time_now)

    return self
def process_data(X):
    """
    :param X: X is a pandas DataFrame of tweets (with no labels)
    :return: The feature matrix where each row is a tweet and each column is a feature.
             Ready to train/predict.
    """
    tweets = process_texts(X)
    dictionary = corpora.Dictionary.load('dictionary.dict')
    lsi_model = models.LsiModel.load('lsi.model')

    # Transform each tweet (a string) to a row of bag of words vector in the corpus matrix.
    corpus = [dictionary.doc2bow(tweet.split()) for tweet in tweets]

    # transform the bag of words corpus matrix to LSI matrix of features.
    # To read more about LSI - https://en.wikipedia.org/wiki/Latent_semantic_analysis
    feature_mat = corpus2csc(lsi_model[corpus]).T.toarray()

    # Added features
    # feature_mat = count_occ('!', X)
    feature_mat = np.hstack((feature_mat, count_occ('!', X)))
    feature_mat = np.hstack((feature_mat, count_occ('@', X)))
    feature_mat = np.hstack((feature_mat, count_occ('?', X)))
    feature_mat = np.hstack((feature_mat, count_occ('❤', X)))
    feature_mat = np.hstack((feature_mat, count_occ('#', X)))
    feature_mat = np.hstack((feature_mat, count_occ('/https:', X)))
    feature_mat = np.hstack((feature_mat, chars_per_tweet(X)))
    feature_mat = np.hstack((feature_mat, words_per_tweet(X)))
    # feature_mat = zscore(feature_mat, axis=1)
    return feature_mat, feature_mat.shape[1]
def condensify(train):
    """
    Takes as input either a string or a list of strings.
    Returns a list of all summaries; for a string, returns a list with a single summary.
    """
    summ_list = []
    if isinstance(train, str):
        train = [train]
    for t in train:
        summ = []
        k = 0
        # corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary = corpora.Dictionary([w for w in reuters.sents(t)])
        corpus = [dictionary.doc2bow(w) for w in reuters.sents(t)]
        matrix = matutils.corpus2csc(corpus)
        # print matrix
        u, sigma, vt = sparse.linalg.svds(matrix)
        (k, l) = vt.shape
        while k >= 1:
            if reuters.sents(t)[vt[k - 1].argmax()] not in summ:
                summ.append(reuters.sents(t)[vt[k - 1].argmax()])
            k -= 1
        v = []
        for s in summ:
            v.append(" ".join(s))
        summ = "".join(v)
        summ_list.append(summ)
    return (summ_list)
def TF_IDF_Reg(corpus_list):
    tfidf_model = models.TfidfModel(corpus_list, normalize=True)
    corpus_list_tfidf = tfidf_model[corpus_list]
    word_matrix = matutils.corpus2csc(corpus_list_tfidf)
    return word_matrix
def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    corpus = matutils.Dense2Corpus(numpy_matrix)

    numpy_matrix = matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)

    scipy_csc_matrix = matutils.corpus2csc(corpus)
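# A small self-contained sketch of the same conversions as above, assuming only gensim, numpy
# and scipy are available; the toy term-by-document matrices are made up for illustration.
import numpy as np
import scipy.sparse
from gensim import matutils

dense = np.array([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]])    # terms x documents
corpus = matutils.Dense2Corpus(dense)                       # stream of gensim BoW vectors
back_to_dense = matutils.corpus2dense(corpus, num_terms=3)  # numpy array, terms x documents

sparse_mat = scipy.sparse.csc_matrix(dense)
corpus2 = matutils.Sparse2Corpus(sparse_mat)
back_to_csc = matutils.corpus2csc(corpus2, num_terms=3)     # scipy.sparse CSC, terms x documents
print(back_to_dense.shape, back_to_csc.shape)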
def _setup(self, corpus):
    """Infer info from the first document and initialize matrices.

    Parameters
    ----------
    corpus : iterable of list of (int, float), optional
        Training corpus.
        Can be either iterable of documents, which are lists of `(word_id, word_count)`,
        or a sparse csc matrix of BOWs for each document.
        If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).

    """
    self._h = None

    if isinstance(corpus, scipy.sparse.csc.csc_matrix):
        first_doc = corpus.getcol(0)
    else:
        first_doc_it = itertools.tee(corpus, 1)
        first_doc = next(first_doc_it[0])
        first_doc = matutils.corpus2csc([first_doc], len(self.id2word))

    self.w_std = np.sqrt(first_doc.mean() / (self.num_tokens * self.num_topics))

    self._W = np.abs(
        self.w_std
        * halfnorm.rvs(
            size=(self.num_tokens, self.num_topics), random_state=self.random_state
        )
    )

    self.A = np.zeros((self.num_topics, self.num_topics))
    self.B = np.zeros((self.num_tokens, self.num_topics))
def get_message_lsi_embedding_vector(self, message):
    test_corpus = [self.dictionary.doc2bow(message.split())]
    test_corpus_tfidf = self.tfidf[test_corpus]
    test_lsi = self.lsi[test_corpus_tfidf]
    test_vector = matutils.corpus2csc(test_lsi)
    message_array = test_vector.toarray().reshape(-1, 1).T
    return message_array
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
             num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    self.maintain_sparsity = maintain_sparsity

    if corpus is not None:
        logger.info("creating sparse index")

        # iterate over input corpus, populating the sparse index matrix
        try:
            # use the more efficient corpus generation version, if the input
            # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
            num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
            logger.debug("using efficient sparse index creation")
        except AttributeError:
            # no MmCorpus, use the slower version (or maybe user supplied the
            # num_* params in constructor)
            pass
        if num_features is not None:
            # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
            num_terms = num_features
        if num_terms is None:
            raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
        corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                  (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                   matutils.unitvec(v)) for v in corpus)
        self.index = matutils.corpus2csc(
            corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
            dtype=dtype, printprogress=10000).T

        # convert to Compressed Sparse Row for efficient row slicing and multiplications
        self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
        logger.info("created %r", self.index)
def __init__(self, corpus, num_best=None, chunks=500, dtype=numpy.float32,
             num_terms=None, num_docs=None, num_nnz=None):
    self.num_best = num_best
    self.normalize = True
    self.chunks = chunks

    if corpus is not None:
        logger.info("creating sparse index")

        # iterate over input corpus, populating the sparse index matrix
        try:
            # use the more efficient corpus generation version, if the input
            # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
            num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
            logger.debug("using efficient sparse index creation")
        except AttributeError:
            # no MmCorpus, use the slower version (or maybe user supplied the
            # num_* params in constructor)
            pass
        self.index = matutils.corpus2csc(
            (matutils.unitvec(vector) for vector in corpus),
            num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
            dtype=numpy.float32, printprogress=10000).T

        # convert to Compressed Sparse Row for efficient row slicing and multiplications
        self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
        logger.info("created %r" % self.index)
def _setup(self, corpus):
    """Infer info from the first document and initialize matrices.

    Parameters
    ----------
    corpus : iterable of list(int, float)
        Training corpus.

    """
    self._h, self._r = None, None
    first_doc_it = itertools.tee(corpus, 1)
    first_doc = next(first_doc_it[0])
    first_doc = matutils.corpus2csc([first_doc], len(self.id2word))
    self.w_std = np.sqrt(first_doc.mean() / (self.num_tokens * self.num_topics))

    self._W = np.abs(
        self.w_std
        * halfnorm.rvs(
            size=(self.num_tokens, self.num_topics), random_state=self.random_state
        )
    )

    is_great_enough = self._W > self.w_std * self.sparse_coef

    self._W *= is_great_enough | ~is_great_enough.all(axis=0)

    self._W = scipy.sparse.csc_matrix(self._W)

    self.A = scipy.sparse.csr_matrix((self.num_topics, self.num_topics))
    self.B = scipy.sparse.csc_matrix((self.num_tokens, self.num_topics))
def getLDaTopics(self, doc, number_of_topics, passe=20, iters=100, chunk=2000, gram=(1, 2), option='c'):
    dictionary, doc_term_matrix = self.prepare_corpus(doc, gram, option)

    # generate LDA model
    lda = models.LdaModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=number_of_topics,
                          iterations=iters, passes=passe, chunksize=chunk, random_state=1)  # train model
    # print(ldamodel.print_topics(), '\n')
    display(lda.print_topics())

    corpus_transformed = lda[doc_term_matrix]
    all_topics_csr = matutils.corpus2csc(corpus_transformed)
    all_topics_numpy = all_topics_csr.T.toarray()
    # Lda_Topic = pd.DataFrame(all_topics_numpy)
    Lda_Topic = pd.DataFrame(all_topics_numpy, doc)
    display(Lda_Topic.head(5))
    print('shape ', Lda_Topic.shape)
    return Lda_Topic
def __init__(self, m, k, docs = None):
    """
    Store (U, S) projection itself. This is the class taking care of 'core math';
    interfacing with corpora, training etc is done through class LsiModel.

    `docs` is either a spare matrix or a corpus which, when converted to a
    sparse matrix, must fit comfortably into main memory.
    """
    self.m, self.k = m, k
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition
        # in core, algorithm 1
        if utils.isCorpus(docs):
            docs = matutils.corpus2csc(m, docs)
        if m * k < 10000:
            # SVDLIBC gives spurious results for small matrices.. run full
            # LAPACK svd on them instead
            docs = docs.todense()
            logger.info("computing dense SVD of %s matrix" % str(docs.shape))
            u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)  # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
            u = ut.T
            del ut
        del vt
        k = clipSpectrum(s, self.k)
        self.u, self.s = u[:, :k], s[:k]
    else:
        self.u, self.s = None, None
def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
    """
    Construct the (U, S) projection from a corpus `docs`. The projection can
    be later updated by merging it with another Projection via `self.merge()`.

    This is the class taking care of the 'core math'; interfacing with corpora,
    splitting large corpora into chunks and merging them etc. is done through
    the higher-level `LsiModel` class.
    """
    self.m, self.k = m, k
    self.power_iters = power_iters
    self.extra_dims = extra_dims
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition,
        # *in-core*.
        if not use_svdlibc:
            u, s = stochastic_svd(
                docs, k, chunksize=sys.maxsize, num_terms=m,
                power_iters=self.power_iters, extra_dims=self.extra_dims)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
            if not scipy.sparse.issparse(docs):
                docs = matutils.corpus2csc(docs)
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)  # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
            u = ut.T
            del ut, vt
        k = clip_spectrum(s**2, self.k)
        self.u = u[:, :k].copy()
        self.s = s[:k].copy()
    else:
        self.u, self.s = None, None
def _doc_doc_mtx(table, model, input_col, result_type='sparse'):
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.array([1 for _ in range(len(csr_matrix.data))])
    doc_doc = (csr_matrix @ (csr_matrix.T)).tocoo()

    if result_type == 'sparse':
        doc_doc = sparse.triu(doc_doc, k=1)
        out_table = pd.DataFrame(doc_doc.row, columns=['1st_document_idx'])
        out_table['2nd_document_idx'] = doc_doc.col
        out_table['number_of_common_terms'] = doc_doc.data
    elif result_type == 'dense':
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(doc_doc.todense())
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append("", doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_doc_mtx')
    model['input_col'] = input_col
    model['doc_doc_mtx'] = doc_doc
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
def classify_content(content):
    num_terms = len(dictionary)
    test_corpus = tfidf_model[dictionary.doc2bow(list(Tokenize(content)))]
    test_sparse = matutils.corpus2csc([test_corpus], num_terms).transpose(copy=False)
    result = sg_class.predict(test_sparse)
    return id2cls[result[0]]
def train():
    with open('count_corpus.pkl', 'rb') as f:
        id_corpus = pickle.load(f)
    # lda = models.ldamulticore.LdaMulticore(corpus=id_corpus, num_topics=50)
    # lda.save('lda.model')
    p = Pool()
    aaa = p.map(pred, id_corpus)
    result = numpy.asarray(corpus2csc(aaa).T.todense())
    p.close()
    p.join()

    map_train, map_test, train_num = make_idmap()

    df = pandas.read_csv('../data/train_clean.csv')[['question1', 'question2']].fillna('').values
    df_train = pandas.DataFrame(_train(result[:train_num], df, map_train))
    df_train.to_csv('lda_train.csv', index=False)

    df = pandas.read_csv('../data/test_clean.csv')[['question1', 'question2']].fillna('').values
    df_test = pandas.DataFrame(_train(result[train_num:], df, map_test))
    df_test.to_csv('lda_test.csv', index=False)
def tfidf(counts_mat):
    m, n = counts_mat.shape
    # utilize gensim
    corpus = matutils.Sparse2Corpus(counts_mat)
    tfidf = models.logentropy_model.LogEntropyModel(corpus)
    c_tfidf = tfidf[corpus]
    return matutils.corpus2csc(c_tfidf)
def process_records(records, fields, target, textmodel=None):
    tokenize = CountVectorizer().build_analyzer()

    input = None
    X = None
    y_labels = []

    for i, record in enumerate(records):
        nums = []
        strs = []
        y_labels.append(record.get(target))

        for field in fields:
            if is_number(record.get(field)):
                nums.append(record[field])
            else:
                strs.append(str(record.get(field) or "").lower())

        if strs:
            if input is None:
                input = StringIO.StringIO()
            print >> input, " ".join(tokenize(" ".join(strs)))
        if nums:
            if X is None:
                X = sp.lil_matrix((len(records), len(nums)))
            X[i] = np.array(nums, dtype=np.float64)

    if input is not None:
        if X is not None:
            X_2 = X.tocsr()
        else:
            X_2 = None

        if isinstance(textmodel, basestring):
            if textmodel == 'lsi':
                corpus = TextCorpus(input)
                textmodel = LsiModel(corpus, chunksize=1000)
            elif textmodel == 'tfidf':
                corpus = TextCorpus(input)
                textmodel = TfidfModel(corpus)
            elif textmodel == 'hashing':
                textmodel = None
                hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
                input.seek(0)
                X = hasher.transform(tokenize(line.strip()) for line in input)

        if textmodel:
            num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs', []))
            X = corpus2csc(textmodel[corpus], num_terms).transpose()

        if X_2 is not None:
            # print >> sys.stderr, "X SHAPE:", X.shape
            # print >> sys.stderr, "X_2 SHAPE:", X_2.shape
            X = sp.hstack([X, X_2], format='csr')
    elif X is not None:
        textmodel = None
        X = X.tocsr()

    print >> sys.stderr, "X SHAPE:", X.shape

    return X, y_labels, textmodel
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform 256 documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # 256 smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
def train(args, output_dir):
    """Build the corpus, train the DTM, and save the model to the output dir."""
    corpus = Corpus()

    # Create the dictionary.
    dictionary = Dictionary(corpus.debates.bag_of_words)
    dictionary.filter_extremes(no_below=100)

    # Save empirical term distribution within each time step.
    term_counts = corpus2csc(
        corpus.debates.groupby('year').agg({'bag_of_words': 'sum'}).bag_of_words.apply(dictionary.doc2bow))
    save_npz(os.path.join(output_dir, 'term_counts.npz'), term_counts)

    # Train and save dtm.
    time_slices = corpus.debates.groupby('year').size()
    dtm_corpus = corpus.debates.bag_of_words.apply(dictionary.doc2bow)
    model = Dtm(args.executable, corpus=dtm_corpus, id2word=dictionary,
                num_topics=args.num_topics, time_slices=time_slices.values,
                rng_seed=args.random_seed)
    model.save(os.path.join(output_dir, 'dtm.gensim'))
def _transform(self, corpus, source_dict=None):
    temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
    dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
    temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
    model = models.TfidfModel(temp_corpus, normalize=False,
                              wlocal=self.wlocals[self.wlocal],
                              wglobal=self.wglobals[self.wglobal])

    X = matutils.corpus2csc(model[temp_corpus], dtype=np.float, num_terms=len(dic)).T
    norm = self.norms[self.norm]
    if norm:
        X = norm(X)

    # set compute values
    shared_cv = SharedTransform(self, corpus.used_preprocessor, source_dict=dic)
    cv = [VectorizationComputeValue(shared_cv, dic[i]) for i in range(len(dic))]

    corpus = self.add_features(corpus, X, dic, cv, var_attrs={'bow-feature': True})
    return corpus
def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
    is_corpus, current_representation = utils.is_corpus(current_representation)
    if is_corpus:
        for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
            ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
            assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
            chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

            ln.debug("Chunk converted to csc, running through layer..")
            chunk_trans = layer.__getitem__(chunk_as_csc)

            ln.debug("Serializing hidden representation..")
            fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
            np.save(fname, chunk_trans)
            ln.debug("Finished serializing chunk. Processed %s documents so far." % (chunk_no * chunksize + len(chunk)))
    else:
        ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
        ln.debug("Type of current_representation is %s" % type(current_representation))
        for chunk_no, chunk in enumerate(current_representation):
            ln.debug("converting chunk (%s documents)..." % chunksize)
            chunk_trans = layer.__getitem__(chunk)
            ln.debug("Serializing hidden representation..")
            fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
            np.save(fname, chunk_trans)
            ln.debug("finished serializing chunk.")

    ln.info("Finished serializing all chunks.")
def _get_vectors(self):
    processed_corpus = [self._tokenize(doc["document"], self.phraser) for doc in self.docs]
    self.dictionary = corpora.Dictionary(processed_corpus)

    # ignore 20% most frequent words
    # num_unique_words = len(dictionary)
    # dictionary.filter_n_most_frequent(int(num_unique_words*0.2))

    # do some more filtering and keep only n most frequent specified with num_dims parameter
    self.dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=self.num_dims)
    bow_corpus = [self.dictionary.doc2bow(text) for text in processed_corpus]

    if self.vectorizer == "TfIdf Vectorizer":
        self.tfidf_model = models.TfidfModel(bow_corpus)
        transformed_corpus = self.tfidf_model[bow_corpus]
    elif self.vectorizer == "Count Vectorizer":
        transformed_corpus = bow_corpus

    if self.use_lsi:
        self.lsi_model = models.LsiModel(transformed_corpus, id2word=self.dictionary, num_topics=self.num_topics)
        transformed_corpus = self.lsi_model[transformed_corpus]

    matrix = corpus2csc(transformed_corpus, num_terms=len(self.dictionary.keys()), num_docs=self.dictionary.num_docs)
    return matrix.transpose()
def get_intracluster_similarity(self, new_documents=[], phraser=None):
    if len(new_documents) > 0:
        dictionary = self.models["dictionary"]
        for doc in new_documents:
            processed_text = Clustering._tokenize(doc["text"], phraser=phraser)
            doc_vec = [dictionary.doc2bow(processed_text)]
            if self.models["tfidf_model"] is not None:
                doc_vec = self.models["tfidf_model"][doc_vec]
            if self.models["lsi_model"] is not None:
                doc_vec = self.models["lsi_model"][doc_vec]
            full_vec = corpus2csc(doc_vec, num_terms=len(dictionary.keys()), num_docs=dictionary.num_docs)
            full_vec = full_vec.transpose()
            self.models["doc_vectors"][doc["id"]] = full_vec[0].toarray()[0]
        self._save_updated_models()

    if self.doc_ids:
        cluster_vectors = []
        for doc_id in self.doc_ids:
            cluster_vectors.append(self.models["doc_vectors"][doc_id])
        similarities = cosine_similarity(cluster_vectors)
        return np.mean(similarities)
    else:
        return 0
def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
def update(self, corpus):
    """Train the model with new documents.

    Parameters
    ----------
    corpus : iterable of list of (int, float), optional
        Training corpus.
        Can be either iterable of documents, which are lists of `(word_id, word_count)`,
        or a sparse csc matrix of BOWs for each document.
        If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).

    """
    if self._W is None:
        self._setup(corpus)

    chunk_idx = 1

    for _ in range(self.passes):
        if isinstance(corpus, scipy.sparse.csc.csc_matrix):
            grouper = (
                corpus[:, col_idx:col_idx + self.chunksize]
                for col_idx in range(0, corpus.shape[1], self.chunksize)
            )
        else:
            grouper = utils.grouper(corpus, self.chunksize)

        for chunk in grouper:
            if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                v = chunk[:, self.random_state.permutation(chunk.shape[1])]
            else:
                self.random_state.shuffle(chunk)

                v = matutils.corpus2csc(
                    chunk,
                    num_terms=self.num_tokens,
                )

            self._h = self._solveproj(v, self._W, h=self._h, v_max=self.v_max)
            h = self._h

            self.A *= chunk_idx - 1
            self.A += h.dot(h.T)
            self.A /= chunk_idx

            self.B *= chunk_idx - 1
            self.B += v.dot(h.T)
            self.B /= chunk_idx

            prev_w_error = self._w_error

            self._solve_w()

            if chunk_idx % self.eval_every == 0:
                logger.info("Loss: {}".format(self._w_error / prev_w_error))

            chunk_idx += 1

    logger.info("Loss: {}".format(self._w_error / prev_w_error))
def main():
    parser = argparse.ArgumentParser(
        description='calculates various stats of a given document-author-contribs file')
    parser.add_argument('--acc-contribs', type=argparse.FileType('r'),
                        help='path to input MatrixMarket acc contributions file (.mm/.mm.bz2)', required=True)
    parser.add_argument('--img-prefix', help='prefix of output generated img files', required=True)
    parser.add_argument('--quantile-order', type=float, help='quantile of histograms to consider', required=True)

    args = parser.parse_args()
    input_acc_contribs_dump_path = args.acc_contribs.name
    output_image_prefix = args.img_prefix
    quantile_order = args.quantile_order

    logger.info('running with:\n{}'.format(
        pformat({
            'input_acc_contribs_dump_path': input_acc_contribs_dump_path,
            'output_image_prefix': output_image_prefix,
            'quantile_order': quantile_order
        })))

    acc_contribs = MmCorpus(input_acc_contribs_dump_path)
    logger.info('reading corpus to sparse csr matrix')
    csr_corpus = corpus2csc(acc_contribs).T.tocsr()
    logger.info('generated sparse matrix of shape {}'.format(csr_corpus.shape))
    logger.debug('sparse matrix \n{}'.format(csr_corpus))

    logger.info('calculating authors-per-docs-distribution')
    num_authors_per_doc = sp.find((csr_corpus > 0).sum(1))[2]
    quantile = get_quantile(num_authors_per_doc, quantile_order)
    num_authors_per_doc = num_authors_per_doc[num_authors_per_doc <= quantile]
    num_authors_per_doc_imgfile = output_image_prefix + '-num-auths-per-doc-dist.pdf'
    xlabel = 'Autoren je Dokument'
    ylabel = 'Häufigkeit'
    num_authors, num_authors_counts = np.unique(num_authors_per_doc, return_counts=True)
    bar_plot(num_authors, num_authors_counts, num_authors_per_doc_imgfile, xlabel, ylabel)

    logger.info('calculating docs-per-authors-distribution')
    num_docs_per_author = sp.find((csr_corpus > 0).sum(0).T)[2]
    quantile = get_quantile(num_docs_per_author, quantile_order)
    num_docs_per_author = num_docs_per_author[num_docs_per_author <= quantile]
    num_docs_per_author_imgfile = output_image_prefix + '-num-docs-per-auth-dist.pdf'
    xlabel = 'Dokumente je Autor'
    ylabel = 'Häufigkeit'
    num_docs, num_docs_counts = np.unique(num_docs_per_author, return_counts=True)
    bar_plot(num_docs, num_docs_counts, num_docs_per_author_imgfile, xlabel, ylabel)
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), \
        'Term frequencies and dictionary have different shape {} != {}'.format(
            term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists
def transform(self, X):
    x_clean = tu.clean_and_tokenize(X, stoplist=self.stoplist)
    x_tfidf = self.tfidf[[self.dictionary.doc2bow(text) for text in x_clean]]
    x_data = matutils.corpus2csc(x_tfidf, num_terms=len(self.dictionary)).T
    # x_data = matutils.corpus2dense(x_tfidf, num_terms=len(self.dictionary)).T
    # logging.info("Returning data of shape %s " % (len(x_data)))
    # returning a csr matrix
    return x_data
def setup(files):
    # setup the output directory
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_dir = '../browser/json/' + base_model_name + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # load the topic model
    model = LdaModel.load(files.model)

    # load replacements used
    bug_to_id = json.loads(open(files.replacements).read())

    # invert to id<->bug map, ditching s. genus terms
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    # load the docsXwords and docsXtopics matrices (in sparse format)
    corpus = mmcorpus.MmCorpus(files.corpus)
    docsXwords_sparse = corpus2csc(corpus, num_terms=len(model.id2word.token2id)).T
    docsXtopics = mmcorpus.MmCorpus(files.docsXtopics)
    docsXtopics_sparse = corpus2csc(docsXtopics).T

    return docsXtopics_sparse, docsXwords_sparse, id_to_bug, model, output_dir
def hierarchical_clustering(corpus_fn, n_clusters=2, linkage='complete'):
    corpus = corpora.MmCorpus(corpus_fn)
    corpus = matutils.corpus2csc(corpus, num_terms=corpus.num_terms).transpose()
    svd = TruncatedSVD(n_components=100)
    new_corpus = svd.fit_transform(corpus)
    knn_graph = kneighbors_graph(new_corpus, 10, metric='euclidean')
    agg = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',
                                  linkage=linkage, connectivity=knn_graph)
    agg.fit(new_corpus)
    return corpus, agg.labels_
def convert_to_X_y(model_class, params, data, label):
    model = model_class(**params)
    mat = model[data]
    X = corpus2csc(mat)
    y = np.array(label)
    return X.T, y
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    # # convert input to dense, then do dense * dense multiplication
    # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
    # # but consumes more memory
    # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
    # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

    # # use np's advanced indexing to simulate sparse * dense
    # # ± same speed again
    # u = self.projection.u[:, :self.num_topics]
    # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
    # for vecno, vec in enumerate(bow):
    #     indices, data = zip(*vec) if vec else ([], [])
    #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

    if not is_corpus:
        # convert back from matrix into a 1d vec
        topic_dist = topic_dist.reshape(-1)

    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
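# A minimal, hypothetical sketch of how this folding-in is used from the outside: documents are
# passed through `lsi[...]` rather than by calling __getitem__ explicitly. Toy data only.
from gensim import corpora, models

texts = [["latent", "semantic", "indexing"], ["singular", "value", "decomposition"], ["latent", "topics"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

lsi = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=2)
single_doc = lsi[bow_corpus[0]]   # list of (topic_id, weight) for one document
whole_corpus = lsi[bow_corpus]    # transformed corpus, computed chunk by chunk
print(single_doc, list(whole_corpus))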
def readTest(self, test_deals_file):
    """
    read test data
    @param test_deals
    @return
    """
    corpus = [self.dict.doc2bow(line.split()) for line in open(test_deals_file, 'r')]
    self.X_test = matutils.corpus2csc(corpus, num_terms=len(self.dict)).T
    # joblib.dump(self.X_test, 'task3_X_test.pkl')
    print 'readTest Done!'
def load_sparse_dataset(colname, suffix):
    corpus = corpora.MmCorpus('vsm/{}.{}'.format(colname, suffix))
    # with documents as columns
    X = matutils.corpus2csc(corpus, printprogress=1)
    # transpose to make each document a row
    X = X.T
    y = pd.read_csv("vsm/{}_meta.csv".format(colname), index_col='id')
    y = y.iloc[:, 0]  # DataFrame to Series
    return X, y
def evaluate(corpus_fn, labels_fn):
    corpus = corpora.MmCorpus(corpus_fn)
    labels = pickle.load(open(labels_fn))
    corpus = matutils.corpus2csc(corpus, num_terms=corpus.num_terms).transpose()
    scores = []
    for i in range(16):
        score = metrics.silhouette_score(corpus, labels, metric='cosine', sample_size=5000)
        print score
        scores.append(score)
    sc = numpy.array(scores).mean()
    print 'mean', sc
def load_tfidf(tag):
    corpus = corpora.MmCorpus("{}.tfidf".format(tag))
    # with documents as columns
    X = matutils.corpus2csc(corpus, printprogress=1)
    # transpose to make each document a row
    X = X.T
    y = pd.read_csv("{}_labels.csv".format(tag), index_col="date")
    y = y.iloc[:, 0]  # DataFrame to Series
    return X, y
def key_words(keys, topk=18):
    vec_bow = dictionary.doc2bow(keys)
    tfidf_corpus = tfidf_model[vec_bow]
    num_terms = len(dictionary)
    test_sparse = matutils.corpus2csc([tfidf_corpus], num_terms).transpose(copy=False)
    result = sg_class.predict(test_sparse)
    words = [dictionary[d].decode("utf-8")
             for d, _ in sorted(list(tfidf_corpus), key=lambda item: -item[1])[0:topk]]
    classify = id2cls[result[0]]
    return (words, classify)
def transformed_corpus():
    for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
        ln.debug("Converting chunk %s to csc format.." % chunk_no)
        chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
        ln.debug("Computing hidden representation for chunk.. ")
        hidden = self._get_hidden_representations(chunk)
        ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                (chunk_no, chunk_no * chunksize + len(doc_chunk)))
        for column in hidden.T:
            yield matutils.dense2vec(column.T)
        ln.debug("Done yielding chunk %s" % chunk_no)

    ln.info("Finished computing representations for all chunks.")
def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
             extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
    """Construct the (U, S) projection from a corpus.

    Parameters
    ----------
    m : int
        Number of features (terms) in the corpus.
    k : int
        Desired rank of the decomposed matrix.
    docs : {iterable of list of (int, float), scipy.sparse.csc}
        Corpus in BoW format or as sparse matrix.
    use_svdlibc : bool, optional
        If True - will use `sparsesvd library <https://pypi.python.org/pypi/sparsesvd/>`_,
        otherwise - our own version will be used.
    power_iters: int, optional
        Number of power iteration steps to be used. Tune to improve accuracy.
    extra_dims : int, optional
        Extra samples to be used besides the rank `k`. Tune to improve accuracy.
    dtype : numpy.dtype, optional
        Enforces a type for elements of the decomposed matrix.

    """
    self.m, self.k = m, k
    self.power_iters = power_iters
    self.extra_dims = extra_dims
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition,
        # *in-core*.
        if not use_svdlibc:
            u, s = stochastic_svd(
                docs, k, chunksize=sys.maxsize, num_terms=m,
                power_iters=self.power_iters, extra_dims=self.extra_dims, dtype=dtype)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix", str(docs.shape))
            if not scipy.sparse.issparse(docs):
                docs = matutils.corpus2csc(docs)
            # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
            u = ut.T
            del ut, vt
        k = clip_spectrum(s ** 2, self.k)
        self.u = u[:, :k].copy()
        self.s = s[:k].copy()
    else:
        self.u, self.s = None, None
def main(model, dic, corpus, output):
    logging.basicConfig(level=logging.INFO)
    score = tfidf.scorer(model, dic)
    transforms = []
    with open(corpus) as fp:
        n_sentences = sum(1 for line in fp)
    logging.info('Computing tf-idf vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()], maxval=n_sentences)
    with open(corpus) as fp:
        for sentence in bar(fp):
            transforms.append(score(sentence.split()))
    logging.info('Saving tf-idf information to %s', output)
    with open(output, 'w') as fp:
        cPickle.dump(corpus2csc(transforms), fp, protocol=cPickle.HIGHEST_PROTOCOL)
def add_documents(self, corpus):
    """Extend the index with new documents.

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Corpus in BoW format.

    Notes
    -----
    Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
    (or when a query is issued).

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath, get_tmpfile
        >>> from gensim.similarities import Similarity
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.mm'))
        >>> index_temp = get_tmpfile("index")
        >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
        >>>
        >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
        >>> index.add_documents(one_more_corpus)  # add more documents in corpus

    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
def get_document_topics(self, bow, minimum_probability=None, normalize=None):
    """Get the topic distribution for the given document.

    Parameters
    ----------
    bow : list of (int, float)
        The document in BOW format.
    minimum_probability : float
        If `normalize` is True, topics with smaller probabilities are filtered out.
        If `normalize` is False, topics with smaller factors are filtered out.
        If set to None, a value of 1e-8 is used to prevent 0s.
    normalize: bool or None, optional
        Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c.

    Returns
    -------
    list of (int, float)
        Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
        the probability that was assigned to it.

    """
    if minimum_probability is None:
        minimum_probability = self.minimum_probability
    minimum_probability = max(minimum_probability, 1e-8)

    # if the input vector is a corpus, return a transformed corpus
    is_corpus, corpus = utils.is_corpus(bow)

    if is_corpus:
        kwargs = dict(minimum_probability=minimum_probability)
        return self._apply(corpus, **kwargs)

    v = matutils.corpus2csc([bow], self.num_tokens)
    h = self._solveproj(v, self._W, v_max=np.inf)

    if normalize is None:
        normalize = self.normalize
    if normalize:
        the_sum = h.sum()
        if the_sum:
            h /= the_sum

    return [
        (idx, proba)
        for idx, proba in enumerate(h[:, 0])
        if not minimum_probability or proba > minimum_probability
    ]
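# A minimal, hypothetical usage sketch for the method above, assuming gensim's online NMF
# implementation (gensim.models.nmf.Nmf); the toy corpus is made up for illustration.
from gensim import corpora
from gensim.models.nmf import Nmf

texts = [["sparse", "topic", "model"], ["nonnegative", "matrix", "factorization"], ["topic", "factorization"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

nmf = Nmf(corpus=bow_corpus, id2word=dictionary, num_topics=2, random_state=42)
print(nmf.get_document_topics(bow_corpus[0]))  # [(topic_id, weight), ...] for a single document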
def __init__(self, m, k, docs = None, algo = 'onepass', chunks = None):
    """
    Construct the (U, S) projection from a corpus `docs`.

    This is the class taking care of 'core math'; interfacing with corpora,
    chunking large corpora etc. is done through the LsiModel class.

    `algo` is currently one of:

      * 'onepass'; only a single pass over `docs` is needed
      * 'twopass'; multiple passes over the input allowed => can use a faster algorithm.
    """
    self.m, self.k = m, k
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition in-core
        # results of several base case decompositions can be merged via `self.merge()`
        if algo == 'twopass':
            self.u, self.s = stochasticSvd(docs, k, chunks = chunks, num_terms = m)
        elif algo == 'onepass':
            if not scipy.sparse.issparse(docs):
                docs = matutils.corpus2csc(docs, num_terms = m)
            if docs.shape[1] <= max(k, 100):
                # For sufficiently small chunk size, update directly like `svd(now, docs)`
                # instead of `svd(now, svd(docs))`.
                # This improves accuracy and is also faster for small chunks, because
                # we need to perform one less svd.
                # On larger chunks this doesn't work because we quickly run out of memory.
                self.u = docs
                self.s = None
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)  # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clipSpectrum(s ** 2, self.k)
                self.u, self.s = u[:, :k], s[:k]
        else:
            raise NotImplementedError("unknown decomposition algorithm: '%s'" % algo)
    else:
        self.u, self.s = None, None