Exemple #1
def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001, n_iter=50, fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "train completely"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print ("f1-score:   %0.3f" % score)
Exemple #2
def evaluate_improved_cllsi(x_train1_in, x_test1_in, x_train2_in, x_test2_in,
                            dimensions, evaluation_function):
    scores = []

    for k in dimensions:
        x_train1, x_test1 = tfidf(data=(x_train1_in, x_test1_in))
        x_train2, x_test2 = tfidf(data=(x_train2_in, x_test2_in))

        n_train, n_test = len(x_train1), len(x_test1)

        X1 = matutils.corpus2csc(list(x_train1) + list(x_test1))
        X2 = matutils.corpus2csc(list(x_train2) + list(x_test2))

        x_train1, x_train2 = X1[:, :n_train], X2[:, :n_train]
        x_test1, x_test2 = X1[:, n_train:], X2[:, n_train:]

        x = sp.sparse.vstack([x_train1, x_train2])
        x = matutils.Sparse2Corpus(x)

        lsa = models.LsiModel(x, num_topics=k)
        n = x_train1.shape[0]
        U = lsa.projection.u
        U1, U2 = U[:n, :], U[n:, :]
        p1, p2 = sp.sparse.csr_matrix(
            np.linalg.pinv(U1)), sp.sparse.csr_matrix(np.linalg.pinv(U2))
        a1, a2 = np.dot(x_test1.T, p1.T).todense(), np.dot(x_test2.T,

        score = evaluation_function(a1, a2)
    return scores
Exemple #3
    def get_similarities(self, query):
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
            if scipy.sparse.issparse(query):
                query = query.T # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc() # N x T * T x C = N x C
        if result.shape[1] == 1:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
Exemple #4
def load_data():
    """Build and pickle bag-of-words and TF-IDF matrices for all questions.

    Reads ``../data/train_clean2.csv`` and ``../data/test_clean2.csv``,
    stacks the de-duplicated questions of both files (training questions
    first, test questions numbered after them), builds a gensim dictionary
    over the tokenized questions and writes three pickles into the current
    working directory:

    * ``count_corpus_brown.pkl`` -- the bag-of-words corpus,
    * ``count_mat_brown.pkl`` -- the transposed ``corpus2csc`` count matrix,
    * ``tfidf_mat_brown.pkl`` -- the transposed ``corpus2csc`` matrix of
      un-normalized TF-IDF weights.

    Returns:
        int: always ``0``; the useful output is the set of files on disk.
    """
    df = pandas.read_csv('../data/train_clean2.csv')

    # Stack both sides of every training pair into one (qid, question) table.
    # ``rename`` returns a new frame, so the column slices are never mutated.
    df1 = df[['qid1', 'question1']].rename(
        columns={'qid1': 'qid', 'question1': 'question'})
    df2 = df[['qid2', 'question2']].rename(
        columns={'qid2': 'qid', 'question2': 'question'})

    df_que = pandas.concat([df1, df2], ignore_index=True)
    df_que = df_que.drop_duplicates().fillna('').sort_values('qid')
    logger.info('df_que {}'.format(df_que.shape))

    # The test file carries no question ids; ids are assigned below so that
    # they continue after the number of distinct training rows.
    df = pandas.read_csv('../data/test_clean2.csv')
    df1 = df[['question1']].rename(columns={'question1': 'question'})
    df2 = df[['question2']].rename(columns={'question2': 'question'})
    df_que2 = pandas.concat([df1, df2], ignore_index=True)
    df_que2 = df_que2.drop_duplicates().fillna('')
    logger.info('df_que2 {}'.format(df_que2.shape))
    df_que2['qid'] = numpy.arange(df_que2.shape[0]) + df_que.shape[0]

    df_que = pandas.concat([df_que, df_que2], ignore_index=True)

    sentences = corpus_to_sentences(df_que['question'])
    dictionary = corpora.Dictionary(sentences)
    dictionary.filter_extremes(no_below=2, no_above=1., keep_n=2000000)

    # Convert documents to bag-of-words in parallel.  The pool is always
    # closed and joined so worker processes do not outlive this call, even
    # when the mapping raises.
    p = Pool()
    try:
        id_corpus = p.map(dictionary.doc2bow, sentences)
    finally:
        p.close()
        p.join()

    with open('count_corpus_brown.pkl', 'wb') as f:
        pickle.dump(id_corpus, f, -1)

    count_mat = corpus2csc(id_corpus).T
    logger.info('count_mat {}'.format(count_mat.shape))
    with open('count_mat_brown.pkl', 'wb') as f:
        pickle.dump(count_mat, f, -1)

    tfidf_instance = models.TfidfModel(id_corpus, normalize=False)
    tfidf_corpus = tfidf_instance[id_corpus]
    tfidf_mat = corpus2csc(tfidf_corpus).T
    logger.info('tfidf_mat {}'.format(tfidf_mat.shape))
    with open('tfidf_mat_brown.pkl', 'wb') as f:
        pickle.dump(tfidf_mat, f, -1)

    logger.info('df_que {}'.format(df_que.shape))

    logger.info('end load')
    return 0
Exemple #5
def get_tfidf_scores(kwargs):
    """Evaluate one TF-IDF configuration with a logistic-regression classifier.

    A ``TfIdfTransformer`` built from ``kwargs`` is fitted on the module-level
    ``train_corpus``; the train and test corpora are then weighted, converted
    to transposed ``corpus2csc`` matrices and used to fit and score a
    ``LogisticRegression``.

    :param kwargs: dict of keyword arguments forwarded to ``TfIdfTransformer``.
    :return: tuple ``(model_accuracy, doc_scores)`` -- the classifier's
        ``score`` on the test matrix and its ``decision_function`` output.
    """
    fitted_transformer = TfIdfTransformer(**kwargs).fit(train_corpus)

    def as_matrix(corpus):
        # Weight the corpus, then transpose what corpus2csc returns.
        weighted = fitted_transformer.transform(corpus)
        return corpus2csc(weighted, num_terms=len(id2word)).T

    train_features = as_matrix(train_corpus)
    test_features = as_matrix(test_corpus)

    classifier = LogisticRegression().fit(train_features, y_train)

    accuracy = classifier.score(test_features, y_test)
    decision_values = classifier.decision_function(test_features)
    return accuracy, decision_values
Exemple #6
 def sim_rank(self, query, doc_corpus, query_norm=True, doc_norm=True, res_sort=True):
     """Score every document in `doc_corpus` against `query`.

     Each document and the query are vectorized with ``self.text2vec``; the
     scores are the products of the document matrix with the query column.

     :param query: the query, in whatever form ``self.text2vec`` accepts.
     :param doc_corpus: non-empty list/tuple of documents; a single document
         (first element is not itself a list/tuple) is wrapped into a
         one-document corpus.
     :param query_norm: forwarded as ``norm`` when vectorizing the query.
     :param doc_norm: forwarded as ``norm`` when vectorizing the documents.
     :param res_sort: when true, return ``(doc_index, score)`` pairs sorted
         by descending score; otherwise return the plain list of scores.
     """
     assert isinstance(doc_corpus, (tuple, list)) and len(doc_corpus) > 0
     first = doc_corpus[0]
     if not isinstance(first, (tuple, list)):
         doc_corpus = [doc_corpus]

     doc_vectors = []
     for document in doc_corpus:
         doc_vectors.append(self.text2vec(document, norm=doc_norm))
     doc_matrix = matutils.corpus2csc(
         doc_vectors, num_terms=self.dict_size, dtype=np.float32
     ).T.tocsr()

     query_vector = self.text2vec(query, norm=query_norm)
     query_column = matutils.corpus2csc(
         [query_vector], num_terms=self.dict_size, dtype=doc_matrix.dtype
     ).tocsr()

     # One product column per query; take the only column as a flat list.
     scores = (doc_matrix * query_column).toarray()[:, 0].tolist()
     if not res_sort:
         return scores
     return sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
Exemple #7
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Do not use this function directly; use the `self[query]` syntax instead.

        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

            Similarity matrix (if maintain_sparsity=False) **OR**

        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query,
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query,
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query],

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
Exemple #8
def Kmeans(n_clusters, args):
    corpus = corpora.MmCorpus(args[1])
    tfidf = models.tfidfmodel.TfidfModel.load(args[2])
    logging.info('finished loading corpus and tfidf models')

    # load the dictionary and corpus
    dictionary = corpora.Dictionary.load(args[0])
    tfidfValue = tfidf[corpus]
    logging.info('finished tfidfValue = tfidf[corpus]')

    fitdata = matutils.corpus2csc(tfidfValue,
        'finished transfer tfidf vector to sparse vector and transpose sparse vector'

    logging.info('begin kmeans fit')
    #TODO refactor parameters
    km = KMeans(n_clusters=n_clusters,
    logging.info('finished kmeans fit')
    # l = open('./TFIDF_KmeansTrend.txt','w')
    # for label in km.labels_:
    # 	l.write(str(label)+'\n')
    # l.close()
    return km.labels_
def get_labelled_csc_corpus():
    lsi_300_corpus = gensim.corpora.MmCorpus(
    labelled_df, labelled_indices, y_matrix = get_labelled_stories()
    labelled_lsi_corpus = lsi_300_corpus[labelled_indices]
    labelled_lsi_csc_corpus = corpus2csc(has_tags_lsi_corpus)
    return labelled_lsi_csc_corpus, y_matrix
Exemple #10
def reduce_nlp_data(vectorizer, data, n_components, reducer):

    transformed_data = vectorizer.fit_transform(data)
    id2word = {
        identifier: word
        for word, identifier in vectorizer.vocabulary_.items()

    if reducer == 'lda':
        corpus = matutils.Sparse2Corpus(transformed_data.transpose())
        lda = models.LdaModel(corpus=corpus,
        lda_corpus = lda[corpus]
        return lda, matutils.corpus2csc(lda_corpus).toarray().transpose()
    elif reducer == 'svd':
        SVD = TruncatedSVD(n_components, n_iter=10, random_state=42)
        svd_data = SVD.fit_transform(transformed_data)
        get_eigenvectors(SVD, id2word)
        return SVD, svd_data
    elif reducer == 'nmf':
        nmf = NMF(n_components, random_state=42)
        nmf_data = nmf.fit_transform(transformed_data)
        get_eigenvectors(nmf, id2word)
        return nmf, nmf_data

        return None, None
Exemple #11
    def fit(self, corpus):

        self.N = len(corpus)

        tokens = self.preprocessor.transform(corpus)
        self.observed_tokens = tokens.apply(len).sum()

        vocab = Dictionary(tokens)

        self.vocab = vocab
        self.corpus_as_tokens = tokens
        self.corpus_as_bow = [self.vocab.doc2bow(doc) for doc in tokens]
        self.corpus_as_csr = corpus2csc(self.corpus_as_bow,

        self.lengths = [len(d) for d in self.corpus_as_bow]
        self.num_empty_docs = self.lengths.count(0)

        time_now = time.localtime()
        self.created_on = time.strftime("%d %b %Y %H:%M:%S", time_now)

        return self
Exemple #12
def process_data(X):
    :param X: X is a pandas DataFrame of tweets (with no labels)
    :return: The feature matrix where each row is a tweet and each column is a feature. Ready to train/predict.
    tweets = process_texts(X)
    dictionary = corpora.Dictionary.load('dictionary.dict')
    lsi_model = models.LsiModel.load('lsi.model')

    # Transform each tweet (a string) to a row of bag of words vector in the corpus matrix.
    corpus = [dictionary.doc2bow(tweet.split()) for tweet in tweets]

    # transform the bag of words corpus matrix to LSI matrix of features.
    # To read more about LSI - https://en.wikipedia.org/wiki/Latent_semantic_analysis
    feature_mat = corpus2csc(lsi_model[corpus]).T.toarray()

    # Added features
    # feature_mat = count_occ('!', X)
    feature_mat = np.hstack((feature_mat, count_occ('!', X)))
    feature_mat = np.hstack((feature_mat, count_occ('@', X)))
    feature_mat = np.hstack((feature_mat, count_occ('?', X)))
    feature_mat = np.hstack((feature_mat, count_occ('❤', X)))
    feature_mat = np.hstack((feature_mat, count_occ('#', X)))
    feature_mat = np.hstack((feature_mat, count_occ('/https:', X)))
    feature_mat = np.hstack((feature_mat, chars_per_tweet(X)))
    feature_mat = np.hstack((feature_mat, words_per_tweet(X)))
    # feature_mat = zscore(feature_mat, axis=1)
    return feature_mat, feature_mat.shape[1]
Exemple #13
def condensify(train):
    Takes input either a string or a list of string
    Returns a list of all summaries;
    For a string returns a list with singleton document
    summ_list = []
    if isinstance(train,string):
        train = [train]
    for t in train:
        #corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary = corpora.Dictionary([w for w in reuters.sents(t)])
        corpus = [dictionary.doc2bow(w) for w in reuters.sents(t)]
        matrix = matutils.corpus2csc(corpus)
        #print matrix
        u,sigma,vt = sparse.linalg.svds(matrix)
        (k,l)= vt.shape
        while k>=1:
            if reuters.sents(t)[vt[k-1].argmax()] not in summ:
        for s in summ:
            v.append(" ".join(s))
        summ = "".join(v)
    return (summ_list)
Exemple #14
def TF_IDF_Reg(corpus_list):
	"""Return the normalized TF-IDF weights of `corpus_list` as a sparse matrix.

	A ``TfidfModel`` is trained on `corpus_list` itself (``normalize=True``),
	applied back to it, and the weighted corpus is converted with
	``matutils.corpus2csc`` (returned untransposed).
	"""
	weighting = models.TfidfModel(corpus_list, normalize=True)
	weighted_corpus = weighting[corpus_list]
	return matutils.corpus2csc(weighted_corpus)
Exemple #15
    def add_documents(self, corpus):
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (fewer than min_ratio * shardsize documents); load it back and add the documents there, don't start a new shard
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(
                        matutils.corpus2csc([doc], self.num_features).T)
                    doc = matutils.unitvec(
                        matutils.sparse2full(doc, self.num_features))
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i" %
def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    """Round-trip a dense and a sparse matrix through gensim corpus wrappers.

    The converted matrices are bound to local names only; nothing is returned.
    NOTE(review): this looks like a demonstration snippet -- confirm whether
    callers expect a return value.
    """
    # numpy array -> streamed corpus -> numpy array
    dense_corpus = matutils.Dense2Corpus(numpy_matrix)
    numpy_matrix = matutils.corpus2dense(
        dense_corpus, num_terms=number_of_corpus_features
    )

    # scipy sparse matrix -> streamed corpus -> scipy CSC matrix
    sparse_corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(sparse_corpus)
Exemple #17
    def _setup(self, corpus):
        """Infer info from the first document and initialize matrices.

        corpus : iterable of list of (int, float), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).

        self._h = None

        if isinstance(corpus, scipy.sparse.csc.csc_matrix):
            first_doc = corpus.getcol(0)
            first_doc_it = itertools.tee(corpus, 1)
            first_doc = next(first_doc_it[0])
            first_doc = matutils.corpus2csc([first_doc], len(self.id2word))
        self.w_std = np.sqrt(first_doc.mean() / (self.num_tokens * self.num_topics))

        self._W = np.abs(
            * halfnorm.rvs(
                size=(self.num_tokens, self.num_topics), random_state=self.random_state

        self.A = np.zeros((self.num_topics, self.num_topics))
        self.B = np.zeros((self.num_tokens, self.num_topics))
 def get_message_lsi_embedding_vector(self, message):
     """Embed a single message with the dictionary -> TF-IDF -> LSI pipeline.

     :param message: string; it is tokenized by splitting on whitespace.
     :return: dense array with a single row (the sparse LSI output is
         densified, reshaped to a column and transposed).
     """
     tokens = message.split()
     single_doc_corpus = [self.dictionary.doc2bow(tokens)]
     lsi_corpus = self.lsi[self.tfidf[single_doc_corpus]]
     dense = matutils.corpus2csc(lsi_corpus).toarray()
     # Flatten to one column, then flip so the message is a single row.
     return dense.reshape(-1, 1).T
Exemple #19
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Exemple #20
    def __init__(self, corpus, num_best=None, chunks=500, dtype=numpy.float32,
                 num_terms=None, num_docs=None, num_nnz=None):
        self.num_best = num_best
        self.normalize = True
        self.chunks = chunks

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
            self.index = matutils.corpus2csc((matutils.unitvec(vector) for vector in corpus),
                                              num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                                              dtype=numpy.float32, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR
            logger.info("created %r" % self.index)
Exemple #21
    def _setup(self, corpus):
        """Infer info from the first document and initialize matrices.

        corpus : iterable of list(int, float)
            Training corpus.

        self._h, self._r = None, None
        first_doc_it = itertools.tee(corpus, 1)
        first_doc = next(first_doc_it[0])
        first_doc = matutils.corpus2csc([first_doc], len(self.id2word))
        self.w_std = np.sqrt(first_doc.mean() / (self.num_tokens * self.num_topics))

        self._W = np.abs(
            * halfnorm.rvs(
                size=(self.num_tokens, self.num_topics), random_state=self.random_state

        is_great_enough = self._W > self.w_std * self.sparse_coef

        self._W *= is_great_enough | ~is_great_enough.all(axis=0)

        self._W = scipy.sparse.csc_matrix(self._W)

        self.A = scipy.sparse.csr_matrix((self.num_topics, self.num_topics))
        self.B = scipy.sparse.csc_matrix((self.num_tokens, self.num_topics))
 def getLDaTopics(self,
                  gram=(1, 2),
     dictionary, doc_term_matrix = self.prepare_corpus(doc, gram, option)
     # generate LDA model
     lda = models.LdaModel(corpus=doc_term_matrix,
                           random_state=1)  # train model
     corpus_transformed = lda[doc_term_matrix]
     all_topics_csr = matutils.corpus2csc(corpus_transformed)
     all_topics_numpy = all_topics_csr.T.toarray()
     Lda_Topic = pd.DataFrame(all_topics_numpy, doc)
     print('shape ', Lda_Topic.shape)
     return Lda_Topic
    def __init__(self, m, k, docs = None):
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        `docs` is either a sparse matrix or a corpus which, when converted to a
        sparse matrix, must fit comfortably into main memory.

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
            self.u, self.s = None, None
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
        Construct the (U, S) projection from a corpus `docs`. The projection can
        be later updated by merging it with another Projection via `self.merge()`.

        This is the class taking care of the 'core math'; interfacing with corpora,
        splitting large corpora into chunks and merging them etc. is done through
        the higher-level `LsiModel` class.
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clip_spectrum(s**2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
            self.u, self.s = None, None
Exemple #25
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
        Construct the (U, S) projection from a corpus `docs`. The projection can
        be later updated by merging it with another Projection via `self.merge()`.

        This is the class taking care of the 'core math'; interfacing with corpora,
        splitting large corpora into chunks and merging them etc. is done through
        the higher-level `LsiModel` class.
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clip_spectrum(s**2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
            self.u, self.s = None, None
Exemple #26
def _doc_doc_mtx(table, model, input_col, result_type='sparse'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:

    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.array([1 for _ in range(len(csr_matrix.data))])
    doc_doc = (csr_matrix @ (csr_matrix.T)).tocoo()

    if result_type == 'sparse':
        doc_doc = sparse.triu(doc_doc, k=1)
        out_table = pd.DataFrame(doc_doc.row, columns=['1st_document_idx'])
        out_table['2nd_document_idx'] = doc_doc.col
        out_table['number_of_common_terms'] = doc_doc.data
    elif result_type == 'dense':
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(doc_doc.todense())
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append("", doc_idx)
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_doc_mtx')
    model['input_col'] = input_col
    model['doc_doc_mtx'] = doc_doc
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
    def __init__(self, m, k, docs = None):
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        `docs` is either a sparse matrix or a corpus which, when converted to a
        sparse matrix, must fit comfortably into main memory.

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
            self.u, self.s = None, None
Exemple #28
def classify_content(content):
    num_terms = len(dictionary)
    test_corpus = tfidf_model[dictionary.doc2bow(list(Tokenize(content)))]
    test_sparse = matutils.corpus2csc([test_corpus],
    result = sg_class.predict(test_sparse)
    return id2cls[result[0]]
Exemple #29
def train():

    with open('count_corpus.pkl', 'rb') as f:
        id_corpus = pickle.load(f)

    #lda = models.ldamulticore.LdaMulticore(corpus=id_corpus, num_topics=50)

    # lda.save('lda.model')

    p = Pool()
    aaa = p.map(pred, id_corpus)
    result = numpy.asarray(corpus2csc(aaa).T.todense())
    map_train, map_test, train_num = make_idmap()

    df = pandas.read_csv('../data/train_clean.csv')[['question1', 'question2'
    df_train = pandas.DataFrame(_train(result[:train_num], df, map_train))
    df_train.to_csv('lda_train.csv', index=False)

    df = pandas.read_csv('../data/test_clean.csv')[['question1', 'question2'
    df_test = pandas.DataFrame(_train(result[train_num:], df, map_test))
    df_test.to_csv('lda_test.csv', index=False)
Exemple #30
def tfidf(counts_mat):
    """Re-weight a sparse count matrix with gensim's log-entropy model.

    Despite the name, the weighting applied is ``LogEntropyModel``, not
    ``TfidfModel``.

    :param counts_mat: two-dimensional sparse count matrix, wrapped with
        ``matutils.Sparse2Corpus`` using its default orientation.
    :return: the weighted corpus converted back with ``matutils.corpus2csc``.
    """
    # Unused, but unpacking still rejects input whose shape is not 2-D.
    _n_rows, _n_cols = counts_mat.shape
    # utilize gensim
    count_corpus = matutils.Sparse2Corpus(counts_mat)
    weighting = models.logentropy_model.LogEntropyModel(count_corpus)
    return matutils.corpus2csc(weighting[count_corpus])
Exemple #31
def process_records(records, fields, target, textmodel=None):
	"""Turn a collection of records into a feature matrix.

	For every record the values of `fields` are gathered; the collected
	strings are tokenized and written, one line per record, to an in-memory
	buffer that is later vectorized according to `textmodel`, while numeric
	values are meant to fill the rows of a sparse matrix.

	NOTE(review): this listing looks truncated -- several branches appear to
	be missing an ``else:`` (and the line that fills ``nums``).  The inline
	notes below mark the suspicious spots; confirm against the full source.

	:param records: sized iterable of objects exposing ``.get(field)``.
	:param fields: names of the fields read from each record.
	:param target: not referenced anywhere in the visible body.
	:param textmodel: one of the strings 'lsi', 'tfidf' or 'hashing', an
		already built model that can be indexed with a corpus, or None.
	:return: tuple ``(X, y_labels, textmodel)``; in the visible code
		``y_labels`` is always the empty list.
	"""
	tokenize = CountVectorizer().build_analyzer()

	input = None  # lazily created text buffer, one line per record
	X = None  # lazily created numeric feature matrix
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []

		for field in fields:
			# NOTE(review): numeric values are appended to `strs` here and
			# `nums` is never filled -- presumably a `nums.append(...)` /
			# `else:` pair was lost; verify.
			if is_number(record.get(field)):
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			# Python 2 print-to-file: one tokenized, space-joined line per record.
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				# Width is fixed by the first record that has numeric values.
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		if X is not None:
			X_2 = X.tocsr()
			# NOTE(review): as written this immediately discards the CSR copy
			# and leaves X_2 unbound when X is None -- an `else:` seems to be
			# missing before this line.
			X_2 = None

		# A string selects which text model to build from the buffered lines.
		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				# Hashing needs no fitted model; X is produced directly.
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				X = hasher.transform(tokenize(line.strip()) for line in input)
		if textmodel:
			# NOTE(review): `corpus` is only bound in the 'lsi'/'tfidf'
			# branches above; a pre-built model passed in reaches this line
			# without it.
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			# Append the numeric columns to the right of the text features.
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		# Numeric features only: there is no text model to return.
		textmodel = None
		X = X.tocsr()

	# Diagnostic output on stderr (Python 2 print statement).
	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel
Exemple #32
    def __getitem__(self, bow, scaled=False, chunksize=512):
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Exemple #33
def condensify(train):
    """
    Build one extractive summary per input document.

    Takes input either a string or a list of string
    (each string is passed to `reuters.sents`, so presumably a Reuters file id).
    Returns a list of all summaries;
    For a string returns a list with singleton document
    """
    # NOTE: on Python 2 a `unicode` argument is not caught by this check.
    if isinstance(train, str):
        train = [train]

    summ_list = []
    for t in train:
        # Fetch the sentences once; each sentence acts as one "document".
        sents = list(reuters.sents(t))
        dictionary = corpora.Dictionary(sents)
        corpus = [dictionary.doc2bow(sent) for sent in sents]
        # terms x sentences matrix
        matrix = matutils.corpus2csc(corpus)
        _, _, vt = sparse.linalg.svds(matrix)

        # Walk the right singular vectors from the last row to the first and
        # pick, for each, the sentence with the largest component.
        summ = []
        for row in vt[::-1]:
            best = sents[row.argmax()]
            if best not in summ:
                summ.append(best)

        # Words are joined with spaces, sentences are concatenated as-is.
        summ_list.append("".join(" ".join(s) for s in summ))
    return summ_list
Exemple #34
# Builds a dictionary from the debates' bags of words, stores per-time-step
# term counts and a trained DTM model under `output_dir`.
#
# NOTE(review): this snippet looks truncated by extraction -- the docstring is
# never closed, and the `corpus2csc(` and `Dtm(` calls have lost most of their
# arguments and their closing parentheses. Restore the missing lines from the
# original project before running it.
def train(args, output_dir):
    """Build the corpus, trains the DTM, and saves the model to the output
    corpus = Corpus()

    # Create the dictionary.
    dictionary = Dictionary(corpus.debates.bag_of_words)

    # Save empirical term distribution within each time step.
    # NOTE(review): the arguments of this call are incomplete.
    term_counts = corpus2csc(
            'bag_of_words': 'sum'
    save_npz(os.path.join(output_dir, 'term_counts.npz'), term_counts)

    # Train and save dtm.
    # One entry per 'year' group, holding the number of debates in that group.
    time_slices = corpus.debates.groupby('year').size()
    dtm_corpus = corpus.debates.bag_of_words.apply(dictionary.doc2bow)
    # NOTE(review): the remaining `Dtm(...)` arguments are missing.
    model = Dtm(args.executable,
    model.save(os.path.join(output_dir, 'dtm.gensim'))
Exemple #35
    # Vectorizes `corpus` with TF-IDF over its n-grams and attaches the result
    # to the corpus through `self.add_features`.
    #
    # NOTE(review): this snippet looks truncated by extraction -- the
    # `models.TfidfModel(`, `matutils.corpus2csc(`, `SharedTransform(` and
    # `self.add_features(` calls are missing arguments/closing parentheses and
    # the `cv = [` list is never closed. Restore from the original source.
    def _transform(self, corpus, source_dict=None):
        temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
        # Reuse the caller's dictionary when given, otherwise build an unpruned one.
        dic = corpora.Dictionary(
            temp_corpus, prune_at=None) if not source_dict else source_dict
        temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
        model = models.TfidfModel(temp_corpus,

        X = matutils.corpus2csc(model[temp_corpus],
        # `self.norms` maps the selected `self.norm` to a callable (or a falsy value for "no normalization").
        norm = self.norms[self.norm]
        if norm:
            X = norm(X)

        # set compute values
        shared_cv = SharedTransform(self,
        # One compute value per dictionary id, labelled with that id's token.
        cv = [
            VectorizationComputeValue(shared_cv, dic[i])
            for i in range(len(dic))

        corpus = self.add_features(corpus,
                                   var_attrs={'bow-feature': True})
        return corpus
Exemple #36
    def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
        """Run `current_representation` through `layer` chunk by chunk and save each result with `np.save`.

        Output files are named `<filename_prefix>_<chunk number zero-padded to 15 digits>`.

        If `current_representation` is a gensim corpus, it is grouped into chunks of
        `chunksize` documents and each chunk is converted to a sparse CSC matrix with
        `num_terms` rows before being passed to `layer`; `num_terms` is required in
        that case. Otherwise `current_representation` is assumed to already be an
        iterable of chunks, and each one is passed to `layer` unchanged.

        NOTE(review): the function takes no `self`; presumably it is a static
        method whose decorator lies outside this block.
        """
        is_corpus, current_representation = utils.is_corpus(current_representation)
        if is_corpus:
            for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
                ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
                assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
                chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

                ln.debug("Chunk converted to csc, running through layer..")
                chunk_trans = layer.__getitem__(chunk_as_csc)

                ln.debug("Serializing hidden representation..")
                fname = "%s_%015d" % (filename_prefix, chunk_no)
                np.save(fname, chunk_trans)
                ln.debug("Finished serializing chunk. Processed %s documents so far." %
                         (chunk_no * chunksize + len(chunk)))
        else:
            # Without this branch the input would be pushed through `layer` a second
            # time after the corpus loop, overwriting the files written above.
            ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
            ln.debug("Type of current_representation is %s" % type(current_representation))
            for chunk_no, chunk in enumerate(current_representation):
                ln.debug("converting chunk (%s documents)..." % chunksize)
                chunk_trans = layer.__getitem__(chunk)
                ln.debug("Serializing hidden representation..")
                fname = "%s_%015d" % (filename_prefix, chunk_no)
                np.save(fname, chunk_trans)
                ln.debug("finished serializing chunk.")

        ln.info("Finished serializing all chunks.")
Exemple #37
    def _get_vectors(self):
        """Vectorize `self.docs` and return a sparse matrix with one row per document.

        Side effects: sets `self.dictionary`, and depending on the configuration
        `self.tfidf_model` and `self.lsi_model`.
        """
        tokenized_docs = [self._tokenize(entry["document"], self.phraser) for entry in self.docs]
        self.dictionary = corpora.Dictionary(tokenized_docs)

        # Prune the vocabulary: drop tokens present in more than 80% of the documents
        # and keep at most `self.num_dims` of the remaining ones.
        self.dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=self.num_dims)

        vectors = [self.dictionary.doc2bow(tokens) for tokens in tokenized_docs]

        # Pick the term weighting; any other `self.vectorizer` value leaves
        # `weighted` unbound and fails further down.
        if self.vectorizer == "TfIdf Vectorizer":
            self.tfidf_model = models.TfidfModel(vectors)
            weighted = self.tfidf_model[vectors]
        elif self.vectorizer == "Count Vectorizer":
            weighted = vectors

        # Optional projection into LSI topic space.
        if self.use_lsi:
            self.lsi_model = models.LsiModel(weighted, id2word=self.dictionary, num_topics=self.num_topics)
            weighted = self.lsi_model[weighted]

        vocab_size = len(self.dictionary.keys())
        doc_count = self.dictionary.num_docs
        term_doc = corpus2csc(weighted, num_terms=vocab_size, num_docs=doc_count)
        # corpus2csc puts documents in columns; callers get documents in rows.
        return term_doc.transpose()
Exemple #38
    # Optionally vectorizes `new_documents` into self.models["doc_vectors"], then
    # returns the mean of the cosine-similarity matrix of `cluster_vectors`.
    #
    # NOTE(review): the snippet is damaged -- the `for doc_id in self.doc_ids:` loop
    # has no body (so `cluster_vectors` is never filled here) and the trailing
    # `return 0` has presumably lost its `else:`. Restore from the original source.
    # NOTE(review): `new_documents=[]` is a mutable default; it is only read here,
    # but `None` would be the safer default.
    def get_intracluster_similarity(self, new_documents=[], phraser=None):
        if len(new_documents) > 0:
            dictionary = self.models["dictionary"]

            for doc in new_documents:
                processed_text = Clustering._tokenize(doc["text"], phraser=phraser)
                # Wrapped in a list so the models below treat it as a one-document corpus.
                doc_vec = [dictionary.doc2bow(processed_text)]

                if self.models["tfidf_model"] is not None:
                    doc_vec = self.models["tfidf_model"][doc_vec]

                if self.models["lsi_model"] is not None:
                    doc_vec = self.models["lsi_model"][doc_vec]

                full_vec = corpus2csc(doc_vec, num_terms=len(dictionary.keys()), num_docs=dictionary.num_docs)
                full_vec = full_vec.transpose()
                # Store the first row as a dense 1d array under the document's id.
                self.models["doc_vectors"][doc["id"]] = full_vec[0].toarray()[0]


        if self.doc_ids:
            cluster_vectors = []
            for doc_id in self.doc_ids:
            similarities = cosine_similarity(cluster_vectors)
            return np.mean(similarities)
            return 0
Exemple #39
    # NOTE(review): this snippet is damaged by extraction -- the docstring below has
    # lost its quotes, the first `if` has no body, the third `doclen = len(doc)` has
    # presumably lost its `else:`, the second `matutils.unitvec` assignment has
    # presumably lost its `else:`, and `if len(self.fresh_docs) >= self.shardsize:`
    # has no body. Nothing visible here appends to `self.fresh_docs`. Restore the
    # missing lines from the original source before use.
    def add_documents(self, corpus):
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
        for doc in corpus:
            # `doclen` feeds the running `self.fresh_nnz` counter below.
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
                doclen = len(doc)
                # Documents with fewer than 30% of the features set are normalized as a
                # sparse row; the other call normalizes a dense vector.
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Exemple #40
    # NOTE(review): this snippet is damaged by extraction -- the docstring is never
    # closed, `if self._W is None:` has no body, the `grouper = (` generator
    # expression is never closed and has presumably lost an `else:` before
    # `utils.grouper(...)`, the `matutils.corpus2csc(` call has lost its arguments,
    # and nothing visible here updates `self._w_error` between reading
    # `prev_w_error` and logging the ratio. Restore from the original source.
    def update(self, corpus):
        """Train the model with new documents.

        corpus : iterable of list of (int, float), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).

        if self._W is None:

        # 1-based counter of processed chunks; it drives the running averages below.
        chunk_idx = 1

        for _ in range(self.passes):
            if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                # Slice the matrix into column blocks of `self.chunksize` columns.
                grouper = (
                    corpus[:, col_idx:col_idx + self.chunksize]
                    for col_idx
                    in range(0, corpus.shape[1], self.chunksize)
                grouper = utils.grouper(corpus, self.chunksize)

            for chunk in grouper:
                if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                    # Shuffle the columns of the chunk.
                    v = chunk[:, self.random_state.permutation(chunk.shape[1])]

                    v = matutils.corpus2csc(

                self._h = self._solveproj(v, self._W, h=self._h, v_max=self.v_max)
                h = self._h

                # Running mean of h * h^T over all chunks seen so far.
                self.A *= chunk_idx - 1
                self.A += h.dot(h.T)
                self.A /= chunk_idx

                # Running mean of v * h^T over all chunks seen so far.
                self.B *= chunk_idx - 1
                self.B += v.dot(h.T)
                self.B /= chunk_idx

                prev_w_error = self._w_error


                if chunk_idx % self.eval_every == 0:
                    logger.info("Loss: {}".format(self._w_error / prev_w_error))

                chunk_idx += 1

        # NOTE(review): `prev_w_error` is unbound here if no chunk was processed.
        logger.info("Loss: {}".format(self._w_error / prev_w_error))
# Reads a document/author contributions MatrixMarket file and plots two
# distributions: authors per document and documents per author.
#
# NOTE(review): this snippet is damaged by extraction -- the argparse setup, the
# `logger.info('running with:...` call, both `np.unique(` calls and the last
# `bar_plot(` call are missing arguments and closing parentheses. Restore the
# lost lines from the original source before running it.
def main():
    parser = argparse.ArgumentParser(
        'calculates various stats and of a given document-author-contribs file'
        help='path to input MatrixMarket acc contributions file (.mm/.mm.bz2)',
                        help='prefix of output generated img files',
                        help='quantile of histrograms to consider',

    args = parser.parse_args()
    input_acc_contribs_dump_path = args.acc_contribs.name
    output_image_prefix = args.img_prefix
    quantile_order = args.quantile_order

    logger.info('running with:\n{}'.format(
            'input_acc_contribs_dump_path': input_acc_contribs_dump_path,
            'output_image_prefix': output_image_prefix,
            'quantile_order': quantile_order

    acc_contribs = MmCorpus(input_acc_contribs_dump_path)
    logger.info('reading corpus to sparse csr matrix')
    # corpus2csc puts documents in columns; transpose so each document is a row.
    csr_corpus = corpus2csc(acc_contribs).T.tocsr()
    logger.info('generated sparse matrix of shape {}'.format(csr_corpus.shape))
    logger.debug('sparse matrix \n{}'.format(csr_corpus))

    logger.info('calculating authors-per-docs-distribution')
    # Count the positive entries of every row, then keep only the nonzero counts.
    num_authors_per_doc = sp.find((csr_corpus > 0).sum(1))[2]
    # Drop the values above the requested quantile before plotting.
    quantile = get_quantile(num_authors_per_doc, quantile_order)
    num_authors_per_doc = num_authors_per_doc[num_authors_per_doc <= quantile]
    num_authors_per_doc_imgfile = output_image_prefix + '-num-auths-per-doc-dist.pdf'
    xlabel = 'Autoren je Dokument'
    ylabel = 'Häufigkeit'
    num_authors, num_authors_counts = np.unique(num_authors_per_doc,
    bar_plot(num_authors, num_authors_counts, num_authors_per_doc_imgfile,
             xlabel, ylabel)

    logger.info('calculating docs-per-authors-distribution')
    # Same computation along the other axis: positive entries per column.
    num_docs_per_author = sp.find((csr_corpus > 0).sum(0).T)[2]
    quantile = get_quantile(num_docs_per_author, quantile_order)
    num_docs_per_author = num_docs_per_author[num_docs_per_author <= quantile]
    num_docs_per_author_imgfile = output_image_prefix + '-num-docs-per-auth-dist.pdf'
    xlabel = 'Dokumente je Autor'
    ylabel = 'Häufigkeit'
    num_docs, num_docs_counts = np.unique(num_docs_per_author,
    bar_plot(num_docs, num_docs_counts, num_docs_per_author_imgfile, xlabel,
# Derives term frequencies, document lengths, document-topic and topic-term
# distributions from a topic model and its corpus/dictionary.
#
# NOTE(review): this snippet is damaged by extraction -- several `else:` lines are
# presumably missing (after the first `corpus2csc`, after `len(topic_model.lda_alpha)`,
# between the two `topic_model.inference` calls, before `isinstance(doc_topic_dists, list)`
# and before `topic_model.state.get_lambda()`), and the `np.asarray(`, both `len(`
# asserts and `matutils.corpus2dense(` have lost arguments/closing parentheses.
# Restore the lost lines from the original source before use.
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    # Substitute for zero term frequencies (applied a few lines below).
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()),
    # Row sums give per-term totals; column sums give per-document lengths.
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(
    ), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(
    ), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
            gamma, _ = topic_model.inference(corpus)
        # Normalize every row of gamma so it sums to 1.
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists,
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[
        1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
        topic = topic_model.state.get_lambda()
    # Normalize every topic row, then reorder the columns by `fnames_argsort`.
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    # NOTE(review): only `doc_topic_dists` is returned; the other computed values are discarded.
    return doc_topic_dists
Exemple #43
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents. A dense :class:`numpy.ndarray`
            (one document per row, or a single 1d document) is accepted as well.

        Returns
        -------
        :class:`numpy.ndarray` or scipy sparse matrix
            A 1d array for a single-document query. Otherwise a 2d
            (#queries x #index) dense array, **OR** the same result kept sparse
            if `self.maintain_sparsity` is set.

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    # reshape instead of assigning to `.shape`, so the caller's array is left untouched
                    query = query.reshape(1, -1)
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
 def transform(self, X):
     """Clean, tokenize and TF-IDF-weight `X`; return a sparse matrix with one row per text."""
     tokenized = tu.clean_and_tokenize(X, stoplist=self.stoplist)
     bows = [self.dictionary.doc2bow(tokens) for tokens in tokenized]
     weighted = self.tfidf[bows]
     vocab_size = len(self.dictionary)
     # corpus2csc yields terms x documents; transpose so documents become rows
     term_doc = matutils.corpus2csc(weighted, num_terms=vocab_size)
     return term_doc.T
def setup(files):
    """Load the topic model and its matrices and prepare the output directory.

    `files` must provide the paths `model`, `replacements`, `corpus` and
    `docsXtopics`.

    Returns the tuple
    `(docsXtopics_sparse, docsXwords_sparse, id_to_bug, model, output_dir)`,
    where both matrices are sparse with one row per document.
    """
    # setup the output directory
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_dir = '../browser/json/' + base_model_name + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # load the topic model
    model = LdaModel.load(files.model)
    # load replacements used
    with open(files.replacements) as handle:
        bug_to_id = json.load(handle)
    # invert to id<->bug map, ditching s. genus terms
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}
    # load the docsXwords and docsXtopics matrices (in sparse format)
    corpus = mmcorpus.MmCorpus(files.corpus)
    docsXwords_sparse = corpus2csc(corpus, num_terms=len(model.id2word.token2id)).T
    docsXtopics = mmcorpus.MmCorpus(files.docsXtopics)
    docsXtopics_sparse = corpus2csc(docsXtopics).T
    return docsXtopics_sparse, docsXwords_sparse, id_to_bug, model, output_dir
def hierarchical_clustering(corpus_fn, n_clusters=2, linkage='complete'):
    """Cluster the documents of a MatrixMarket corpus agglomeratively.

    The corpus is loaded as a sparse documents x terms matrix, reduced to 100
    components with truncated SVD, and clustered with a 10-nearest-neighbour
    connectivity graph.

    Returns the tuple `(matrix, labels)`: the sparse documents x terms matrix
    and the cluster label assigned to each document.
    """
    mm = corpora.MmCorpus(corpus_fn)
    matrix = matutils.corpus2csc(mm, num_terms=mm.num_terms).transpose()
    svd = TruncatedSVD(n_components=100)
    reduced = svd.fit_transform(matrix)
    knn_graph = kneighbors_graph(reduced, 10, metric='euclidean')
    agg = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=linkage, connectivity=knn_graph)
    # `labels_` only exists on a fitted estimator, so the model must be fitted first.
    agg.fit(reduced)
    return matrix, agg.labels_
def convert_to_X_y(model_class, params, data, label):
	"""Transform `data` with `model_class(**params)` and return `(X, y)`.

	`X` is the transformed corpus as a sparse matrix with one row per
	document; `y` is `label` converted to a numpy array.
	"""
	transformer = model_class(**params)
	# corpus2csc stores documents as columns
	features = corpus2csc(transformer[data])
	targets = np.array(label)
	return features.T, targets
Exemple #48
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).

        `bow` is either a single bag-of-words document or a corpus of them. A
        corpus is handed to `self._apply` and transformed `chunksize` documents
        at a time; with a falsy `chunksize` it is transformed here in one go.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
        # but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            # scale every topic by its inverse singular value; the transposes line the 1d
            # factor up with the topic axis for a single document and for a chunk alike
            inv_s = 1.0 / self.projection.s[:self.num_topics]
            topic_dist = (topic_dist.T * inv_s).T  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Exemple #49
 def readTest(self, test_deals_file):
     """
     read test data

     Every line of the file is split on whitespace and converted to a
     bag-of-words with `self.dict`; the result is stored in `self.X_test`
     as a sparse matrix with one row per line.

     @param test_deals_file: path of the text file to read
     """
     # the context manager closes the file once all lines are consumed
     with open(test_deals_file, 'r') as deals:
         corpus = [self.dict.doc2bow(line.split()) for line in deals]
     self.X_test = matutils.corpus2csc(corpus, num_terms=len(self.dict)).T
     # parenthesized form prints the same text on Python 2 and Python 3
     print('readTest Done!')
Exemple #50
def load_sparse_dataset(colname, suffix):
    """Load `vsm/<colname>.<suffix>` as features and `vsm/<colname>_meta.csv` as labels.

    Returns `(X, y)`: a sparse matrix with one row per document, and the
    first column of the metadata table (indexed by its `id` column).
    """
    mm_path = 'vsm/{}.{}'.format(colname, suffix)
    # corpus2csc stores documents as columns; `.T` turns each document into a row
    doc_rows = matutils.corpus2csc(corpora.MmCorpus(mm_path), printprogress=1).T

    meta_path = "vsm/{}_meta.csv".format(colname)
    meta = pd.read_csv(meta_path, index_col='id')
    # first column only: DataFrame -> Series
    return doc_rows, meta.iloc[:, 0]
Exemple #51
def evaluate(corpus_fn, labels_fn):
    """Print and return the mean sampled silhouette score of a clustering.

    `corpus_fn` is a MatrixMarket corpus file and `labels_fn` a pickle file
    holding the cluster labels. The cosine silhouette score is computed 16
    times on samples of 5000 documents; every score and their mean are printed.

    NOTE: `pickle.load` can execute arbitrary code -- only pass label files
    from a trusted source.
    """
    mm = corpora.MmCorpus(corpus_fn)
    with open(labels_fn, 'rb') as handle:
        labels = pickle.load(handle)
    matrix = matutils.corpus2csc(mm, num_terms=mm.num_terms).transpose()
    scores = []
    for _ in range(16):
        score = metrics.silhouette_score(matrix, labels, metric='cosine', sample_size=5000)
        print(score)
        # collect the score; otherwise the mean below is taken over an empty list
        scores.append(score)
    sc = numpy.array(scores).mean()
    print('mean %s' % sc)
    return sc
Exemple #52
def load_tfidf(tag):
    """Load `<tag>.tfidf` as features and `<tag>_labels.csv` as labels.

    Returns `(X, y)`: a sparse matrix with one row per document, and the
    first column of the label table (indexed by its `date` column).
    """
    tfidf_docs = corpora.MmCorpus("{}.tfidf".format(tag))
    # corpus2csc stores documents as columns; `.T` turns each document into a row
    doc_rows = matutils.corpus2csc(tfidf_docs, printprogress=1).T

    label_table = pd.read_csv("{}_labels.csv".format(tag), index_col="date")
    # first column only: DataFrame -> Series
    return doc_rows, label_table.iloc[:, 0]
Exemple #53
def key_words(keys, topk=18):
    """Return `(words, classify)` for the token list `keys`.

    `words` holds the (at most) `topk` tokens with the highest TF-IDF weight,
    decoded from UTF-8; `classify` is the class name that `id2cls` maps the
    classifier's prediction to.
    """
    weighted_doc = tfidf_model[dictionary.doc2bow(keys)]

    vocab_size = len(dictionary)
    # one-document corpus, transposed so that the document is a row
    features = matutils.corpus2csc([weighted_doc], vocab_size).transpose(copy=False)
    predicted = sg_class.predict(features)

    # heaviest terms first
    by_weight = sorted(weighted_doc, key=lambda pair: -pair[1])
    words = [dictionary[term_id].decode("utf-8") for term_id, _ in by_weight[:topk]]

    return (words, id2cls[predicted[0]])
Exemple #54
        def transformed_corpus():
            """Yield the hidden representation of every document in `bow`, one chunk at a time."""
            batches = utils.grouper(bow, chunksize)
            for batch_idx, batch in enumerate(batches):
                ln.debug("Converting chunk %s to csc format.." % batch_idx)
                sparse_batch = matutils.corpus2csc(batch, self.input_dimensionality)

                ln.debug("Computing hidden representation for chunk.. ")
                hidden = self._get_hidden_representations(sparse_batch)

                docs_done = batch_idx * chunksize + len(batch)
                ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                        (batch_idx, docs_done))

                # iterate the rows of the transposed result, one yielded vector per row
                for column in hidden.T:
                    yield matutils.dense2vec(column.T)
                ln.debug("Done yielding chunk %s" % batch_idx)

            ln.info("Finished computing representations for all chunks.")
Exemple #55
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
                 extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
        """Construct the (U, S) projection from a corpus.

        Parameters
        ----------
        m : int
            Number of features (terms) in the corpus.
        k : int
            Desired rank of the decomposed matrix.
        docs : {iterable of list of (int, float), scipy.sparse.csc}
            Corpus in BoW format or as sparse matrix.
            If None, the projection is left empty (`self.u` and `self.s` are None).
        use_svdlibc : bool, optional
            If True - will use `sparsesvd library <https://pypi.python.org/pypi/sparsesvd/>`_,
            otherwise - our own version will be used.
        power_iters: int, optional
            Number of power iteration steps to be used. Tune to improve accuracy.
        extra_dims : int, optional
            Extra samples to be used besides the rank `k`. Tune to improve accuracy.
        dtype : numpy.dtype, optional
            Enforces a type for elements of the decomposed matrix.

        Raises
        ------
        ImportError
            If `use_svdlibc` is set but the `sparsesvd` module is not installed.

        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(
                    docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims, dtype=dtype)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                # log only after the conversion: a plain BoW corpus has no `.shape`
                logger.info("computing sparse SVD of %s matrix", str(docs.shape))
                # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
                u = ut.T
                del ut, vt
                k = clip_spectrum(s ** 2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
Exemple #56
# Counts the sentences in `corpus`, iterates over them with a progress bar and
# pickles `corpus2csc(transforms)` to `output`.
#
# NOTE(review): this snippet is damaged by extraction -- the body of
# `for sentence in bar(fp):` is missing, so nothing visible here uses `score`
# or fills `transforms`. Restore the lost lines from the original source.
# NOTE(review): the output file is opened in text mode ('w') while a binary
# pickle protocol is requested; 'wb' looks intended -- confirm.
def main(model, dic, corpus, output):
    score = tfidf.scorer(model, dic)
    transforms = []
    # First pass over the file: count the lines to size the progress bar.
    with open(corpus) as fp:
        n_sentences = sum(1 for line in fp)
    logging.info('Computing tf-idf vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()], maxval=n_sentences)
    with open(corpus) as fp:
        for sentence in bar(fp):
    logging.info('Saving tf-idf information to %s', output)
    with open(output, 'w') as fp:
        cPickle.dump(corpus2csc(transforms), fp, protocol=cPickle.HIGHEST_PROTOCOL)
Exemple #57
    # NOTE(review): this snippet is damaged by extraction -- the docstring is never
    # closed, the first `if` has no body, the third `doclen = len(doc)` has
    # presumably lost its `else:`, the second `matutils.unitvec` assignment has
    # presumably lost its `else:`, and `if len(self.fresh_docs) >= self.shardsize:`
    # has no body. Nothing visible here appends to `self.fresh_docs`. Restore the
    # missing lines from the original source before use.
    def add_documents(self, corpus):
        """Extend the index with new documents.

        corpus : iterable of list of (int, number)
            Corpus in BoW format.

        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
        (or when a query is issued).

        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>> corpus = TextCorpus(datapath('testcorpus.mm'))
            >>> index_temp = get_tmpfile("index")
            >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
            >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index.add_documents(one_more_corpus)  # add more documents in corpus

        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
        for doc in corpus:
            # `doclen` feeds the running `self.fresh_nnz` counter below.
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
                doclen = len(doc)
                # Documents with fewer than 30% of the features set are normalized as a
                # sparse row; the other call normalizes a dense vector.
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Exemple #58
    def get_document_topics(self, bow, minimum_probability=None,
                            normalize=None):
        """Get the topic distribution for the given document.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format. If a corpus is passed instead, a
            transformed corpus is returned via `self._apply`.
        minimum_probability : float
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, `self.minimum_probability` is used. Values below 1e-8
            are raised to 1e-8 to prevent 0s.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c.
            If None, `self.normalize` is used.

        Returns
        -------
        list of (int, float)
            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
            the probability that was assigned to it.

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)

        if is_corpus:
            kwargs = dict(minimum_probability=minimum_probability)
            return self._apply(corpus, **kwargs)

        # single document -> one-column sparse matrix
        v = matutils.corpus2csc([bow], self.num_tokens)
        h = self._solveproj(v, self._W, v_max=np.inf)

        if normalize is None:
            normalize = self.normalize
        if normalize:
            the_sum = h.sum()
            # skip the division for an all-zero vector
            if the_sum:
                h /= the_sum

        return [
            (idx, proba)
            for idx, proba in enumerate(h[:, 0])
            if not minimum_probability or proba > minimum_probability
        ]
 def __init__(self, m, k, docs = None, algo = 'onepass', chunks = None):
     """
     Construct the (U, S) projection from a corpus `docs`.

     This is the class taking care of 'core math'; interfacing with corpora,
     chunking large corpora etc. is done through the LsiModel class.

     `algo` is currently one of:

       * 'onepass'; only a single pass over `docs` is needed
       * 'twopass'; multiple passes over the input allowed => can use a
         faster algorithm.

     Any other `algo` raises NotImplementedError. With `docs` set to None the
     projection is left empty (`self.u` and `self.s` are None).
     """
     self.m, self.k = m, k
     if docs is not None:
         # base case decomposition: given a job `docs`, compute its decomposition in-core
         # results of several base case decompositions can be merged via `self.merge()`
         if algo == 'twopass':
             self.u, self.s = stochasticSvd(docs, k, chunks = chunks, num_terms = m)
         elif algo == 'onepass':
             if not scipy.sparse.issparse(docs):
                 docs = matutils.corpus2csc(docs, num_terms = m)
             if docs.shape[1] <= max(k, 100):
                 # For sufficiently small chunk size, update directly like `svd(now, docs)`
                 # instead of `svd(now, svd(docs))`.
                 # This improves accuracy and is also faster for small chunks, because
                 # we need to perform one less svd.
                 # On larger chunks this doesn't work because we quickly run out of memory.
                 self.u = docs
                 self.s = None
             else:
                 try:
                     import sparsesvd
                 except ImportError:
                     raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                 logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                 ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                 u = ut.T
                 del ut, vt
                 k = clipSpectrum(s ** 2, self.k)
                 self.u, self.s = u[:, :k], s[:k]
         else:
             raise NotImplementedError("unknown decomposition algorithm: '%s'" % algo)
     else:
         self.u, self.s = None, None