Example #1
def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001, n_iter=50, fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "train completely"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        X_test.append(obj.tokens)
        X_label.append(cls_ids[obj.classify])
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print ("f1-score:   %0.3f" % score)
Example #2
def evaluate_improved_cllsi(x_train1_in, x_test1_in, x_train2_in, x_test2_in,
                            dimensions, evaluation_function):
    scores = []

    for k in dimensions:
        x_train1, x_test1 = tfidf(data=(x_train1_in, x_test1_in))
        x_train2, x_test2 = tfidf(data=(x_train2_in, x_test2_in))

        n_train, n_test = len(x_train1), len(x_test1)

        X1 = matutils.corpus2csc(list(x_train1) + list(x_test1))
        X2 = matutils.corpus2csc(list(x_train2) + list(x_test2))

        x_train1, x_train2 = X1[:, :n_train], X2[:, :n_train]
        x_test1, x_test2 = X1[:, n_train:], X2[:, n_train:]

        x = sp.sparse.vstack([x_train1, x_train2])
        x = matutils.Sparse2Corpus(x)

        lsa = models.LsiModel(x, num_topics=k)
        n = x_train1.shape[0]
        U = lsa.projection.u
        U1, U2 = U[:n, :], U[n:, :]
        p1, p2 = sp.sparse.csr_matrix(
            np.linalg.pinv(U1)), sp.sparse.csr_matrix(np.linalg.pinv(U2))
        a1, a2 = np.dot(x_test1.T, p1.T).todense(), np.dot(x_test2.T,
                                                           p2.T).todense()

        score = evaluation_function(a1, a2)
        scores.append(score)
    return scores
Example #3
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc() # N x T * T x C = N x C
        if result.shape[1] == 1:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
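
The docstring above says to use the self[query] syntax rather than calling get_similarities() directly. A minimal sketch of that usage with a gensim SparseMatrixSimilarity index (the toy texts are illustrative, not from the original):

from gensim import corpora, similarities

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

index = similarities.SparseMatrixSimilarity(bow_corpus, num_features=len(dictionary))
sims = index[dictionary.doc2bow(["graph", "minors"])]  # 1d array: similarity to every indexed document
batch = index[bow_corpus]                              # 2d array: one row of similarities per query document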
Example #4
def load_data():

    df = pandas.read_csv('../data/train_clean2.csv')

    df1 = df[['qid1', 'question1']]
    df1.columns = ['qid', 'question']
    df2 = df[['qid2', 'question2']]
    df2.columns = ['qid', 'question']

    df_que = pandas.concat([df1, df2], ignore_index=True)
    df_que = df_que.drop_duplicates().fillna('').sort_values('qid')
    logger.info('df_que {}'.format(df_que.shape))
    train_num = df_que.shape[0]

    df = pandas.read_csv('../data/test_clean2.csv')
    df1 = df[['question1']]
    df1.columns = ['question']
    df2 = df[['question2']]
    df2.columns = ['question']
    df_que2 = pandas.concat([df1, df2], ignore_index=True)
    df_que2 = df_que2.drop_duplicates().fillna('')
    logger.info('df_que2 {}'.format(df_que2.shape))
    df_que2['qid'] = numpy.arange(df_que2.shape[0]) + df_que.shape[0]

    df_que = pandas.concat([df_que, df_que2], ignore_index=True)

    sentences = corpus_to_sentences(df_que['question'])
    logger.info('dict')
    dictionary = corpora.Dictionary(sentences)
    dictionary.save('./gensim.dict')
    dictionary.filter_extremes(no_below=2, no_above=1., keep_n=2000000)
    p = Pool()
    id_corpus = p.map(dictionary.doc2bow, sentences)
    p.close()
    p.join()
    with open('count_corpus_brown.pkl', 'wb') as f:
        pickle.dump(id_corpus, f, -1)

    count_mat = corpus2csc(id_corpus).T
    logger.info('count_mat {}'.format(count_mat.shape))
    with open('count_mat_brown.pkl', 'wb') as f:
        pickle.dump(count_mat, f, -1)

    tfidf_instance = models.TfidfModel(id_corpus, normalize=False)
    tfidf_corpus = tfidf_instance[id_corpus]
    tfidf_mat = corpus2csc(tfidf_corpus).T
    logger.info('tfidf_mat {}'.format(tfidf_mat.shape))
    with open('tfidf_mat_brown.pkl', 'wb') as f:
        pickle.dump(tfidf_mat, f, -1)

    logger.info('df_que {}'.format(df_que.shape))

    logger.info('end load')
    return 0
Example #5
def get_tfidf_scores(kwargs):
    tfidf_transformer = TfIdfTransformer(**kwargs).fit(train_corpus)

    X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T
    X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T

    clf = LogisticRegression().fit(X_train_tfidf, y_train)

    model_accuracy = clf.score(X_test_tfidf, y_test)
    doc_scores = clf.decision_function(X_test_tfidf)

    return model_accuracy, doc_scores
Example #6
 def sim_rank(self, query, doc_corpus, query_norm=True, doc_norm=True, res_sort=True):
     assert isinstance(doc_corpus, (tuple, list)) and len(doc_corpus) > 0
     if not isinstance(doc_corpus[0], (tuple, list)):
         doc_corpus = [doc_corpus]
     corpus_vec = [self.text2vec(c, norm=doc_norm) for c in doc_corpus]
     doc_index = matutils.corpus2csc(corpus_vec, num_terms=self.dict_size, dtype=np.float32).T.tocsr()
     query_vec = self.text2vec(query, norm=query_norm)
     query_index = matutils.corpus2csc([query_vec], num_terms=self.dict_size, dtype=doc_index.dtype).tocsr()
     sim_array = doc_index * query_index
     sims = sim_array.toarray().T[0].tolist()
     # sims = doc_index[query_vec]
     return sorted(list(enumerate(sims)), key=lambda p: p[1], reverse=True) if res_sort else sims
Example #7
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix (if maintain_sparsity=False) **OR**
        :class:`scipy.sparse.csc`
            otherwise

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query,
                                        self.index.shape[1],
                                        dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query,
                                                dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query],
                                            self.index.shape[1],
                                            dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
Example #8
def Kmeans(n_clusters, args):
    corpus = corpora.MmCorpus(args[1])
    tfidf = models.tfidfmodel.TfidfModel.load(args[2])
    logging.info('finished loading corpus and tfidf models')

    # load the dictionary and corpus
    dictionary = corpora.Dictionary.load(args[0])
    tfidfValue = tfidf[corpus]
    logging.info('finished tfidfValue = tfidf[corpus]')

    fitdata = matutils.corpus2csc(tfidfValue,
                                  num_terms=len(dictionary),
                                  dtype='float32',
                                  num_docs=len(tfidfValue),
                                  num_nnz=None,
                                  printprogress=0).transpose()
    logging.info(
        'finished transfer tfidf vector to sparse vector and transpose sparse vector'
    )

    logging.info('begin kmeans fit')
    #TODO refactor parameters
    km = KMeans(n_clusters=n_clusters,
                init='random',
                max_iter=100,
                n_init=5,
                verbose=1)
    km.fit(fitdata)
    logging.info('finished kmeans fit')
    # l = open('./TFIDF_KmeansTrend.txt','w')
    # for label in km.labels_:
    # 	l.write(str(label)+'\n')
    # l.close()
    return km.labels_
Example #9
def get_labelled_csc_corpus():
    lsi_300_corpus = gensim.corpora.MmCorpus(
        '../content-preprocessor/lsi_300_corpus.mm')
    labelled_df, labelled_indices, y_matrix = get_labelled_stories()
    labelled_lsi_corpus = lsi_300_corpus[labelled_indices]
    labelled_lsi_csc_corpus = corpus2csc(labelled_lsi_corpus)
    return labelled_lsi_csc_corpus, y_matrix
Example #10
def reduce_nlp_data(vectorizer, data, n_components, reducer):

    transformed_data = vectorizer.fit_transform(data)
    id2word = {
        identifier: word
        for word, identifier in vectorizer.vocabulary_.items()
    }

    if reducer == 'lda':
        corpus = matutils.Sparse2Corpus(transformed_data.transpose())
        lda = models.LdaModel(corpus=corpus,
                              num_topics=n_components,
                              minimum_probability=0.03,
                              id2word=id2word,
                              passes=10,
                              random_state=42)
        print(lda.print_topics())
        lda_corpus = lda[corpus]
        return lda, matutils.corpus2csc(lda_corpus).toarray().transpose()
    elif reducer == 'svd':
        SVD = TruncatedSVD(n_components, n_iter=10, random_state=42)
        svd_data = SVD.fit_transform(transformed_data)
        get_eigenvectors(SVD, id2word)
        return SVD, svd_data
    elif reducer == 'nmf':
        nmf = NMF(n_components, random_state=42)
        nmf_data = nmf.fit_transform(transformed_data)
        get_eigenvectors(nmf, id2word)
        return nmf, nmf_data

    else:
        return None, None
Example #11
    def fit(self, corpus):

        self._verify_corpus(corpus)
        self.N = len(corpus)

        tokens = self.preprocessor.transform(corpus)
        self.observed_tokens = tokens.apply(len).sum()

        vocab = Dictionary(tokens)
        vocab.filter_extremes(no_above=self.max_df,
                              no_below=self.min_df,
                              keep_n=self.vocab_size)
        vocab.compactify()

        self.vocab = vocab
        self.corpus_as_tokens = tokens
        self.corpus_as_bow = [self.vocab.doc2bow(doc) for doc in tokens]
        self.corpus_as_csr = corpus2csc(self.corpus_as_bow,
                                        num_terms=len(self.vocab)).T

        self.lengths = [len(d) for d in self.corpus_as_bow]
        self.num_empty_docs = self.lengths.count(0)

        time_now = time.localtime()
        self.created_on = time.strftime("%d %b %Y %H:%M:%S", time_now)

        return self
Example #12
def process_data(X):
    """
    :param X: X is a pandas DataFrame of tweets (with no labels)
    :return: The feature matrix where each row is a tweet and each column is a feature. Ready to train/predict.
    """
    tweets = process_texts(X)
    dictionary = corpora.Dictionary.load('dictionary.dict')
    lsi_model = models.LsiModel.load('lsi.model')

    # Transform each tweet (a string) to a row of bag of words vector in the corpus matrix.
    corpus = [dictionary.doc2bow(tweet.split()) for tweet in tweets]

    # transform the bag of words corpus matrix to LSI matrix of features.
    # To read more about LSI - https://en.wikipedia.org/wiki/Latent_semantic_analysis
    feature_mat = corpus2csc(lsi_model[corpus]).T.toarray()

    # Added features
    # feature_mat = count_occ('!', X)
    feature_mat = np.hstack((feature_mat, count_occ('!', X)))
    feature_mat = np.hstack((feature_mat, count_occ('@', X)))
    feature_mat = np.hstack((feature_mat, count_occ('?', X)))
    feature_mat = np.hstack((feature_mat, count_occ('❤', X)))
    feature_mat = np.hstack((feature_mat, count_occ('#', X)))
    feature_mat = np.hstack((feature_mat, count_occ('/https:', X)))
    feature_mat = np.hstack((feature_mat, chars_per_tweet(X)))
    feature_mat = np.hstack((feature_mat, words_per_tweet(X)))
    # feature_mat = zscore(feature_mat, axis=1)
    return feature_mat, feature_mat.shape[1]
Example #13
def condensify(train):
    """
    Takes input either a string or a list of string
    Returns a list of all summaries;
    For a string returns a list with singleton document
    """
    summ_list = []
    if isinstance(train, str):
        train = [train]
    for t in train:
        summ=[]
        k=0
        #corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary = corpora.Dictionary([w for w in reuters.sents(t)])
        corpus = [dictionary.doc2bow(w) for w in reuters.sents(t)]
        matrix = matutils.corpus2csc(corpus)
        #print matrix
        u,sigma,vt = sparse.linalg.svds(matrix)
        (k,l)= vt.shape
        while k>=1:
            if reuters.sents(t)[vt[k-1].argmax()] not in summ:
                summ.append(reuters.sents(t)[vt[k-1].argmax()])
            k-=1
        v=[]
        for s in summ:
            v.append(" ".join(s))
        summ = "".join(v)
        summ_list.append(summ)
    return (summ_list)
Example #14
def TF_IDF_Reg(corpus_list):

	tfidf_model = models.TfidfModel(corpus_list, normalize=True)
	courpus_list_tfidf = tfidf_model[corpus_list]
	word_matrix = matutils.corpus2csc(courpus_list_tfidf)

	return word_matrix
Example #15
    def add_documents(self, corpus):
        """
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (< min_ratio * shardsize); load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(
                        matutils.corpus2csc([doc], self.num_features).T)
                else:
                    doc = matutils.unitvec(
                        matutils.sparse2full(doc, self.num_features))
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i" %
                            len(self.fresh_docs))
Example #16
def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    corpus = matutils.Dense2Corpus(numpy_matrix)

    numpy_matrix = matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(corpus)
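
Nearly every example in this listing transposes the result of corpus2csc, because corpus2csc builds a terms x documents CSC matrix while scikit-learn estimators expect documents x terms. A minimal sketch with a toy bag-of-words corpus (the ids and counts are illustrative):

from gensim import matutils

bow_corpus = [[(0, 1.0), (2, 3.0)],   # document 0
              [(1, 2.0), (3, 1.0)]]   # document 1

csc = matutils.corpus2csc(bow_corpus, num_terms=4)
print(csc.shape)    # (4, 2) -- terms x documents
X = csc.T.tocsr()   # documents x terms, ready for scikit-learn
print(X.shape)      # (2, 4)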
Example #17
    def _setup(self, corpus):
        """Infer info from the first document and initialize matrices.

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).

        """
        self._h = None

        if isinstance(corpus, scipy.sparse.csc.csc_matrix):
            first_doc = corpus.getcol(0)
        else:
            first_doc_it = itertools.tee(corpus, 1)
            first_doc = next(first_doc_it[0])
            first_doc = matutils.corpus2csc([first_doc], len(self.id2word))
        self.w_std = np.sqrt(first_doc.mean() / (self.num_tokens * self.num_topics))

        self._W = np.abs(
            self.w_std
            * halfnorm.rvs(
                size=(self.num_tokens, self.num_topics), random_state=self.random_state
            )
        )

        self.A = np.zeros((self.num_topics, self.num_topics))
        self.B = np.zeros((self.num_tokens, self.num_topics))
Example #18
 def get_message_lsi_embedding_vector(self, message):
     test_corpus = [self.dictionary.doc2bow(message.split())]
     test_corpus_tfidf = self.tfidf[test_corpus]
     test_lsi = self.lsi[test_corpus_tfidf]
     test_vector = matutils.corpus2csc(test_lsi)
     message_array = test_vector.toarray().reshape(-1, 1).T
     return message_array
Example #19
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example #20
    def __init__(self, corpus, num_best=None, chunks=500, dtype=numpy.float32,
                 num_terms=None, num_docs=None, num_nnz=None):
        self.num_best = num_best
        self.normalize = True
        self.chunks = chunks

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            self.index = matutils.corpus2csc((matutils.unitvec(vector) for vector in corpus),
                                              num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                                              dtype=numpy.float32, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR
            logger.info("created %r" % self.index)
Example #21
    def _setup(self, corpus):
        """Infer info from the first document and initialize matrices.

        Parameters
        ----------
        corpus : iterable of list(int, float)
            Training corpus.

        """
        self._h, self._r = None, None
        first_doc_it = itertools.tee(corpus, 1)
        first_doc = next(first_doc_it[0])
        first_doc = matutils.corpus2csc([first_doc], len(self.id2word))
        self.w_std = np.sqrt(first_doc.mean() / (self.num_tokens * self.num_topics))

        self._W = np.abs(
            self.w_std
            * halfnorm.rvs(
                size=(self.num_tokens, self.num_topics), random_state=self.random_state
            )
        )

        is_great_enough = self._W > self.w_std * self.sparse_coef

        self._W *= is_great_enough | ~is_great_enough.all(axis=0)

        self._W = scipy.sparse.csc_matrix(self._W)

        self.A = scipy.sparse.csr_matrix((self.num_topics, self.num_topics))
        self.B = scipy.sparse.csc_matrix((self.num_tokens, self.num_topics))
Example #22
 def getLDaTopics(self,
                  doc,
                  number_of_topics,
                  passe=20,
                  iters=100,
                  chunk=2000,
                  gram=(1, 2),
                  option='c'):
     dictionary, doc_term_matrix = self.prepare_corpus(doc, gram, option)
     # generate LDA model
     lda = models.LdaModel(corpus=doc_term_matrix,
                           id2word=dictionary,
                           num_topics=number_of_topics,
                           iterations=iters,
                           passes=passe,
                           chunksize=chunk,
                           random_state=1)  # train model
     #print(ldamodel.print_topics(),'\n')
     display(lda.print_topics())
     corpus_transformed = lda[doc_term_matrix]
     all_topics_csr = matutils.corpus2csc(corpus_transformed)
     all_topics_numpy = all_topics_csr.T.toarray()
     #Lda_Topic=pd.DataFrame(all_topics_numpy)
     Lda_Topic = pd.DataFrame(all_topics_numpy, doc)
     display(Lda_Topic.head(5))
     print('shape ', Lda_Topic.shape)
     return Lda_Topic
Example #23
    def __init__(self, m, k, docs = None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
Example #24
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
        """
        Construct the (U, S) projection from a corpus `docs`. The projection can
        be later updated by merging it with another Projection via `self.merge()`.

        This is the class taking care of the 'core math'; interfacing with corpora,
        splitting large corpora into chunks and merging them etc. is done through
        the higher-level `LsiModel` class.
        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clip_spectrum(s**2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
Example #25
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
        """
        Construct the (U, S) projection from a corpus `docs`. The projection can
        be later updated by merging it with another Projection via `self.merge()`.

        This is the class taking care of the 'core math'; interfacing with corpora,
        splitting large corpora into chunks and merging them etc. is done through
        the higher-level `LsiModel` class.
        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut, vt
                k = clip_spectrum(s**2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
Example #26
def _doc_doc_mtx(table, model, input_col, result_type='sparse'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.array([1 for _ in range(len(csr_matrix.data))])
    doc_doc = (csr_matrix @ (csr_matrix.T)).tocoo()

    if result_type == 'sparse':
        doc_doc = sparse.triu(doc_doc, k=1)
        out_table = pd.DataFrame(doc_doc.row, columns=['1st_document_idx'])
        out_table['2nd_document_idx'] = doc_doc.col
        out_table['number_of_common_terms'] = doc_doc.data
    elif result_type == 'dense':
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(doc_doc.todense())
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append("", doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_doc_mtx')
    model['input_col'] = input_col
    model['doc_doc_mtx'] = doc_doc
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
Example #27
    def __init__(self, m, k, docs = None):
        """
        Store (U, S) projection itself. This is the class taking care of 'core math';
        interfacing with corpora, training etc is done through class LsiModel.
        
        `docs` is either a sparse matrix or a corpus which, when converted to a
        sparse matrix, must fit comfortably into main memory.
        """

        self.m, self.k = m, k
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition 
            # in core, algorithm 1
            if utils.isCorpus(docs):
                docs = matutils.corpus2csc(m, docs)
            if m * k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK svd on them instead
                docs = docs.todense()
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs, full_matrices = False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                u = ut.T
                del ut
            del vt
            k = clipSpectrum(s, self.k)
            self.u, self.s = u[:, :k], s[:k]
        else:
            self.u, self.s = None, None
Example #28
def classify_content(content):
    num_terms = len(dictionary)
    test_corpus = tfidf_model[dictionary.doc2bow(list(Tokenize(content)))]
    test_sparse = matutils.corpus2csc([test_corpus],
                                      num_terms).transpose(copy=False)
    result = sg_class.predict(test_sparse)
    return id2cls[result[0]]
Example #29
def train():

    with open('count_corpus.pkl', 'rb') as f:
        id_corpus = pickle.load(f)

    #lda = models.ldamulticore.LdaMulticore(corpus=id_corpus, num_topics=50)

    # lda.save('lda.model')

    p = Pool()
    aaa = p.map(pred, id_corpus)
    result = numpy.asarray(corpus2csc(aaa).T.todense())
    p.close()
    p.join()
    map_train, map_test, train_num = make_idmap()

    df = pandas.read_csv('../data/train_clean.csv')[['question1', 'question2'
                                                     ]].fillna('').values
    df_train = pandas.DataFrame(_train(result[:train_num], df, map_train))
    df_train.to_csv('lda_train.csv', index=False)

    df = pandas.read_csv('../data/test_clean.csv')[['question1', 'question2'
                                                    ]].fillna('').values
    df_test = pandas.DataFrame(_train(result[train_num:], df, map_test))
    df_test.to_csv('lda_test.csv', index=False)
Example #30
def tfidf(counts_mat):
    m,n = counts_mat.shape
    # utilize gensim
    corpus = matutils.Sparse2Corpus(counts_mat)
    tfidf = models.logentropy_model.LogEntropyModel(corpus)
    c_tfidf = tfidf[corpus]
    return matutils.corpus2csc(c_tfidf)
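
Despite its name, the helper above applies gensim's LogEntropyModel rather than TF-IDF weighting. A usage sketch, assuming the tfidf() function above, the example's gensim imports, and a toy terms x documents count matrix (the data is illustrative):

import numpy as np
from scipy import sparse

# rows = terms, columns = documents; Sparse2Corpus treats columns as documents by default
counts = sparse.csc_matrix(np.array([[1, 0, 2],
                                     [0, 3, 1],
                                     [4, 0, 0]], dtype=float))
weighted = tfidf(counts)
print(weighted.shape)   # (3, 3), still terms x documents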
Example #31
def process_records(records, fields, target, textmodel=None):
	tokenize = CountVectorizer().build_analyzer()

	input = None
	X = None
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []
		y_labels.append(record.get(target))

		for field in fields:
			if is_number(record.get(field)):
				nums.append(record[field])
			else:
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		if X is not None:
			X_2 = X.tocsr()
		else:
			X_2 = None

		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				input.seek(0)
				X = hasher.transform(tokenize(line.strip()) for line in input)
		if textmodel:
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		textmodel = None
		X = X.tocsr()

	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel
Example #32
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
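
The __getitem__ above folds documents into the latent topic space; in user code this is simply lsi[bow]. A minimal sketch with an illustrative toy corpus (not from the original):

from gensim import corpora, models

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

lsi = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=2)
single = lsi[bow_corpus[0]]   # list of (topic_id, weight) for one document
folded = lsi[bow_corpus]      # transformed corpus: one latent vector per document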
Example #33
def condensify(train):
    """
    Takes input either a string or a list of string
    Returns a list of all summaries;
    For a string returns a list with singleton document
    """
    summ_list = []
    if isinstance(train, str):
        train = [train]
    for t in train:
        summ = []
        k = 0
        #corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary = corpora.Dictionary([w for w in reuters.sents(t)])
        corpus = [dictionary.doc2bow(w) for w in reuters.sents(t)]
        matrix = matutils.corpus2csc(corpus)
        #print matrix
        u, sigma, vt = sparse.linalg.svds(matrix)
        (k, l) = vt.shape
        while k >= 1:
            if reuters.sents(t)[vt[k - 1].argmax()] not in summ:
                summ.append(reuters.sents(t)[vt[k - 1].argmax()])
            k -= 1
        v = []
        for s in summ:
            v.append(" ".join(s))
        summ = "".join(v)
        summ_list.append(summ)
    return (summ_list)
Example #34
def train(args, output_dir):
    """Build the corpus, trains the DTM, and saves the model to the output
    dir."""
    corpus = Corpus()

    # Create the dictionary.
    dictionary = Dictionary(corpus.debates.bag_of_words)
    dictionary.filter_extremes(no_below=100)

    # Save empirical term distribution within each time step.
    term_counts = corpus2csc(
        corpus.debates.groupby('year').agg({
            'bag_of_words': 'sum'
        }).bag_of_words.apply(dictionary.doc2bow))
    save_npz(os.path.join(output_dir, 'term_counts.npz'), term_counts)

    # Train and save dtm.
    time_slices = corpus.debates.groupby('year').size()
    dtm_corpus = corpus.debates.bag_of_words.apply(dictionary.doc2bow)
    model = Dtm(args.executable,
                corpus=dtm_corpus,
                id2word=dictionary,
                num_topics=args.num_topics,
                time_slices=time_slices.values,
                rng_seed=args.random_seed)
    model.save(os.path.join(output_dir, 'dtm.gensim'))
Example #35
    def _transform(self, corpus, source_dict=None):
        temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
        dic = corpora.Dictionary(
            temp_corpus, prune_at=None) if not source_dict else source_dict
        temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
        model = models.TfidfModel(temp_corpus,
                                  normalize=False,
                                  wlocal=self.wlocals[self.wlocal],
                                  wglobal=self.wglobals[self.wglobal])

        X = matutils.corpus2csc(model[temp_corpus],
                                dtype=float,
                                num_terms=len(dic)).T
        norm = self.norms[self.norm]
        if norm:
            X = norm(X)

        # set compute values
        shared_cv = SharedTransform(self,
                                    corpus.used_preprocessor,
                                    source_dict=dic)
        cv = [
            VectorizationComputeValue(shared_cv, dic[i])
            for i in range(len(dic))
        ]

        corpus = self.add_features(corpus,
                                   X,
                                   dic,
                                   cv,
                                   var_attrs={'bow-feature': True})
        return corpus
Example #36
    def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
        is_corpus, current_representation = utils.is_corpus(current_representation)
        if is_corpus:
            for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
                ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
                assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
                chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

                ln.debug("Chunk converted to csc, running through layer..")
                chunk_trans = layer.__getitem__(chunk_as_csc)

                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("Finished serializing chunk. Processed %s documents so far." %
                         (chunk_no * chunksize + len(chunk)))
        else:
            ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
            ln.debug("Type of current_representation is %s" % type(current_representation))
            for chunk_no, chunk in enumerate(current_representation):
                ln.debug("converting chunk (%s documents)..." % chunksize)
                chunk_trans = layer.__getitem__(chunk)
                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("finished serializing chunk.")

        ln.info("Finished serializing all chunks.")
Example #37
    def _get_vectors(self):
        processed_corpus = [self._tokenize(doc["document"], self.phraser) for doc in self.docs]
        self.dictionary = corpora.Dictionary(processed_corpus)

        # ignore 20% most frequent words
        # num_unique_words = len(dictionary)
        # dictionary.filter_n_most_frequent(int(num_unique_words*0.2))

        # do some more filtering and keep only n most frequent specified with num_dims parameter
        self.dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=self.num_dims)

        bow_corpus = [self.dictionary.doc2bow(text) for text in processed_corpus]

        if self.vectorizer == "TfIdf Vectorizer":
            self.tfidf_model = models.TfidfModel(bow_corpus)
            transformed_corpus = self.tfidf_model[bow_corpus]
        elif self.vectorizer == "Count Vectorizer":
            transformed_corpus = bow_corpus

        if self.use_lsi:
            self.lsi_model = models.LsiModel(transformed_corpus, id2word=self.dictionary, num_topics=self.num_topics)
            transformed_corpus = self.lsi_model[transformed_corpus]

        matrix = corpus2csc(transformed_corpus, num_terms=len(self.dictionary.keys()), num_docs=self.dictionary.num_docs)
        return matrix.transpose()
Example #38
    def get_intracluster_similarity(self, new_documents=[], phraser=None):
        if len(new_documents) > 0:
            dictionary = self.models["dictionary"]

            for doc in new_documents:
                processed_text = Clustering._tokenize(doc["text"], phraser=phraser)
                doc_vec = [dictionary.doc2bow(processed_text)]

                if self.models["tfidf_model"] is not None:
                    doc_vec = self.models["tfidf_model"][doc_vec]

                if self.models["lsi_model"] is not None:
                    doc_vec = self.models["lsi_model"][doc_vec]

                full_vec = corpus2csc(doc_vec, num_terms=len(dictionary.keys()), num_docs=dictionary.num_docs)
                full_vec = full_vec.transpose()
                self.models["doc_vectors"][doc["id"]] = full_vec[0].toarray()[0]

            self._save_updated_models()

        if self.doc_ids:
            cluster_vectors = []
            for doc_id in self.doc_ids:
                cluster_vectors.append(self.models["doc_vectors"][doc_id])
            similarities = cosine_similarity(cluster_vectors)
            return np.mean(similarities)
        else:
            return 0
Example #39
    def add_documents(self, corpus):
        """
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (< min_ratio * shardsize); load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Example #40
    def update(self, corpus):
        """Train the model with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).

        """
        if self._W is None:
            self._setup(corpus)

        chunk_idx = 1

        for _ in range(self.passes):
            if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                grouper = (
                    corpus[:, col_idx:col_idx + self.chunksize]
                    for col_idx
                    in range(0, corpus.shape[1], self.chunksize)
                )
            else:
                grouper = utils.grouper(corpus, self.chunksize)

            for chunk in grouper:
                if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                    v = chunk[:, self.random_state.permutation(chunk.shape[1])]
                else:
                    self.random_state.shuffle(chunk)

                    v = matutils.corpus2csc(
                        chunk,
                        num_terms=self.num_tokens,
                    )

                self._h = self._solveproj(v, self._W, h=self._h, v_max=self.v_max)
                h = self._h

                self.A *= chunk_idx - 1
                self.A += h.dot(h.T)
                self.A /= chunk_idx

                self.B *= chunk_idx - 1
                self.B += v.dot(h.T)
                self.B /= chunk_idx

                prev_w_error = self._w_error

                self._solve_w()

                if chunk_idx % self.eval_every == 0:
                    logger.info("Loss: {}".format(self._w_error / prev_w_error))

                chunk_idx += 1

        logger.info("Loss: {}".format(self._w_error / prev_w_error))
Example #41
def main():
    parser = argparse.ArgumentParser(
        description='calculates various stats of a given document-author-contribs file'
    )
    parser.add_argument(
        '--acc-contribs',
        type=argparse.FileType('r'),
        help='path to input MatrixMarket acc contributions file (.mm/.mm.bz2)',
        required=True)
    parser.add_argument('--img-prefix',
                        help='prefix of output generated img files',
                        required=True)
    parser.add_argument('--quantile-order',
                        type=float,
                        help='quantile of histograms to consider',
                        required=True)

    args = parser.parse_args()
    input_acc_contribs_dump_path = args.acc_contribs.name
    output_image_prefix = args.img_prefix
    quantile_order = args.quantile_order

    logger.info('running with:\n{}'.format(
        pformat({
            'input_acc_contribs_dump_path': input_acc_contribs_dump_path,
            'output_image_prefix': output_image_prefix,
            'quantile_order': quantile_order
        })))

    acc_contribs = MmCorpus(input_acc_contribs_dump_path)
    logger.info('reading corpus to sparse csr matrix')
    csr_corpus = corpus2csc(acc_contribs).T.tocsr()
    logger.info('generated sparse matrix of shape {}'.format(csr_corpus.shape))
    logger.debug('sparse matrix \n{}'.format(csr_corpus))

    logger.info('calculating authors-per-docs-distribution')
    num_authors_per_doc = sp.find((csr_corpus > 0).sum(1))[2]
    quantile = get_quantile(num_authors_per_doc, quantile_order)
    num_authors_per_doc = num_authors_per_doc[num_authors_per_doc <= quantile]
    num_authors_per_doc_imgfile = output_image_prefix + '-num-auths-per-doc-dist.pdf'
    xlabel = 'Authors per document'
    ylabel = 'Frequency'
    num_authors, num_authors_counts = np.unique(num_authors_per_doc,
                                                return_counts=True)
    bar_plot(num_authors, num_authors_counts, num_authors_per_doc_imgfile,
             xlabel, ylabel)

    logger.info('calculating docs-per-authors-distribution')
    num_docs_per_author = sp.find((csr_corpus > 0).sum(0).T)[2]
    quantile = get_quantile(num_docs_per_author, quantile_order)
    num_docs_per_author = num_docs_per_author[num_docs_per_author <= quantile]
    num_docs_per_author_imgfile = output_image_prefix + '-num-docs-per-auth-dist.pdf'
    xlabel = 'Documents per author'
    ylabel = 'Frequency'
    num_docs, num_docs_counts = np.unique(num_docs_per_author,
                                          return_counts=True)
    bar_plot(num_docs, num_docs_counts, num_docs_per_author_imgfile, xlabel,
             ylabel)
Example #42
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()),
                                dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(
        dictionary
    ), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(
        corpus
    ), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists,
                                                    num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[
        1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists
Example #43
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix (if maintain_sparsity=False) **OR**
        :class:`scipy.sparse.csc`
            otherwise

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
Example #44
 def transform(self, X):
     x_clean = tu.clean_and_tokenize(X, stoplist=self.stoplist)
     x_tfidf = self.tfidf[[self.dictionary.doc2bow(text) for text in x_clean]]
     x_data = matutils.corpus2csc(x_tfidf, num_terms=len(self.dictionary)).T
     #x_data = matutils.corpus2dense(x_tfidf, num_terms=len(self.dictionary)).T
     #logging.info("Returning data of shape %s " % (len(x_data)))
     #returning a csr matrix
     return x_data
Example #45
def setup(files):
    # setup the output directory
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_dir = '../browser/json/' + base_model_name + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # load the topic model
    model = LdaModel.load(files.model)
    # load replacements used
    bug_to_id = json.loads(open(files.replacements).read())
    # invert to id<->bug map, ditching s. genus terms
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}
    # load the docsXwords and docsXtopics matrices (in sparse format)
    corpus = mmcorpus.MmCorpus(files.corpus)
    docsXwords_sparse = corpus2csc(corpus, num_terms=len(model.id2word.token2id)).T
    docsXtopics = mmcorpus.MmCorpus(files.docsXtopics)
    docsXtopics_sparse = corpus2csc(docsXtopics).T
    return docsXtopics_sparse, docsXwords_sparse, id_to_bug, model, output_dir
Example #46
0
def hierarchical_clustering(corpus_fn, n_clusters=2, linkage='complete'):
    corpus = corpora.MmCorpus(corpus_fn)
    corpus = matutils.corpus2csc(corpus, num_terms=corpus.num_terms).transpose()
    svd = TruncatedSVD(n_components=100)
    new_corpus = svd.fit_transform(corpus)
    knn_graph = kneighbors_graph(new_corpus, 10, metric='euclidean')
    agg = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=linkage, connectivity=knn_graph)
    agg.fit(new_corpus)
    return corpus, agg.labels_
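A hedged call sketch for the function above, assuming the scikit-learn and gensim imports it relies on are in scope and a Matrix Market corpus exists at the hypothetical path 'corpus.mm':

corpus_csc, labels = hierarchical_clustering('corpus.mm', n_clusters=5, linkage='average')
print('%d documents in %d clusters' % (corpus_csc.shape[0], len(set(labels))))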
Example #47
0
def convert_to_X_y(model_class, params, data, label):
	
	model = model_class(**params)
	mat = model[data]

	X = corpus2csc(mat)
	y = np.array(label)

	return X.T, y
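A hedged usage sketch; `dictionary`, `corpus_bow` and `labels` are hypothetical names for an existing gensim Dictionary, a BoW corpus and per-document labels:

from gensim.models import TfidfModel

X, y = convert_to_X_y(TfidfModel, {'dictionary': dictionary}, corpus_bow, labels)
# X: documents-as-rows sparse matrix ready for scikit-learn, y: numpy array of labels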
Example #48
0
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).

        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
        # # but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
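A hedged usage sketch of the fold-in behaviour described in the docstring above, using a tiny made-up corpus:

from gensim import corpora
from gensim.models import LsiModel

texts = [['human', 'computer', 'interface'], ['graph', 'minors', 'survey'], ['graph', 'trees']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
print(lsi[corpus[0]])        # single document -> list of (topic_id, weight) tuples
for vec in lsi[corpus]:      # whole corpus -> transformed corpus, processed chunk by chunk
    print(vec)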
Example #49
0
 def readTest(self, test_deals_file):
     """
     read test data
     @param test_deals
     @return
     """
     corpus = [self.dict.doc2bow(line.split()) for line in open(test_deals_file, 'r')]
     self.X_test = matutils.corpus2csc(corpus, num_terms=len(self.dict)).T
     #joblib.dump(self.X_test, 'task3_X_test.pkl')
     print 'readTest Done!'
Example #50
0
def load_sparse_dataset(colname,suffix):
    corpus = corpora.MmCorpus('vsm/{}.{}'.format(colname,suffix))
    # with documents as columns
    X = matutils.corpus2csc(corpus,printprogress=1)
    # transpose to make each document a row
    X = X.T

    y = pd.read_csv("vsm/{}_meta.csv".format(colname),index_col='id')
    y = y.iloc[:,0]# DataFrame to Series

    return X,y
Example #51
0
def evaluate(corpus_fn, labels_fn):
    corpus = corpora.MmCorpus(corpus_fn)
    labels = pickle.load(open(labels_fn))
    corpus = matutils.corpus2csc(corpus, num_terms=corpus.num_terms).transpose()
    scores = []
    for i in range(16):
        score = metrics.silhouette_score(corpus, labels, metric='cosine', sample_size=5000)
        print score
        scores.append(score)
    sc = numpy.array(scores).mean()
    print 'mean', sc
Example #52
0
def load_tfidf(tag):
    corpus = corpora.MmCorpus("{}.tfidf".format(tag))
    # with documents as columns
    X = matutils.corpus2csc(corpus, printprogress=1)
    # transpose to make each document a row
    X = X.T

    y = pd.read_csv("{}_labels.csv".format(tag), index_col="date")
    y = y.iloc[:, 0]  # DataFrame to Series

    return X, y
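A hedged usage sketch, assuming '{tag}.tfidf' and '{tag}_labels.csv' exist for a hypothetical tag 'news':

from sklearn.linear_model import LogisticRegression

X, y = load_tfidf('news')    # X: documents-as-rows sparse matrix, y: pandas Series of labels
clf = LogisticRegression(max_iter=1000).fit(X, y)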
Example #53
0
def key_words(keys, topk=18):
    vec_bow = dictionary.doc2bow(keys)
    tfidf_corpus = tfidf_model[vec_bow]

    num_terms = len(dictionary)
    test_sparse = matutils.corpus2csc([tfidf_corpus], num_terms).transpose(copy=False)
    result = sg_class.predict(test_sparse)

    words = [dictionary[d].decode("utf-8") for d, _ in sorted(list(tfidf_corpus), key=lambda item: -item[1])[0:topk]]
    classify = id2cls[result[0]]

    return (words, classify)
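A hedged call sketch; `dictionary`, `tfidf_model`, `sg_class` and `id2cls` are module-level objects in the original project, so this only illustrates the calling convention:

keys = ['basketball', 'game', 'score']        # already-tokenized keywords
words, classify = key_words(keys, topk=10)
print(classify, words)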
Example #54
0
        def transformed_corpus():
            for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
                ln.debug("Converting chunk %s to csc format.." % chunk_no)
                chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
                ln.debug("Computing hidden representation for chunk.. ")
                hidden = self._get_hidden_representations(chunk)
                ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                        (chunk_no, chunk_no * chunksize + len(doc_chunk)))
                for column in hidden.T:
                    yield matutils.dense2vec(column.T)
                ln.debug("Done yielding chunk %s" % chunk_no)

            ln.info("Finished computing representations for all chunks.")
Example #55
0
    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
                 extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
        """Construct the (U, S) projection from a corpus.

        Parameters
        ----------
        m : int
            Number of features (terms) in the corpus.
        k : int
            Desired rank of the decomposed matrix.
        docs : {iterable of list of (int, float), scipy.sparse.csc}
            Corpus in BoW format or as sparse matrix.
        use_svdlibc : bool, optional
            If True - will use `sparsesvd library <https://pypi.python.org/pypi/sparsesvd/>`_,
            otherwise - our own version will be used.
        power_iters: int, optional
            Number of power iteration steps to be used. Tune to improve accuracy.
        extra_dims : int, optional
            Extra samples to be used besides the rank `k`. Tune to improve accuracy.
        dtype : numpy.dtype, optional
            Enforces a type for elements of the decomposed matrix.

        """
        self.m, self.k = m, k
        self.power_iters = power_iters
        self.extra_dims = extra_dims
        if docs is not None:
            # base case decomposition: given a job `docs`, compute its decomposition,
            # *in-core*.
            if not use_svdlibc:
                u, s = stochastic_svd(
                    docs, k, chunksize=sys.maxsize,
                    num_terms=m, power_iters=self.power_iters,
                    extra_dims=self.extra_dims, dtype=dtype)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix", str(docs.shape))
                if not scipy.sparse.issparse(docs):
                    docs = matutils.corpus2csc(docs)
                # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
                u = ut.T
                del ut, vt
                k = clip_spectrum(s ** 2, self.k)
            self.u = u[:, :k].copy()
            self.s = s[:k].copy()
        else:
            self.u, self.s = None, None
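A hedged construction sketch for this Projection class, using a toy corpus and hypothetical variable names:

from gensim import corpora
from gensim.models.lsimodel import Projection

texts = [['user', 'interface', 'system'], ['graph', 'trees'], ['graph', 'minors', 'survey']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

p = Projection(m=len(dictionary), k=2, docs=corpus)   # in-core (U, S) decomposition
print(p.u.shape, p.s.shape)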
Example #56
0
def main(model, dic, corpus, output):
    logging.basicConfig(level=logging.INFO)
    score = tfidf.scorer(model, dic)
    transforms = []
    with open(corpus) as fp:
        n_sentences = sum(1 for line in fp)
    logging.info('Computing tf-idf vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()], maxval=n_sentences)
    with open(corpus) as fp:
        for sentence in bar(fp):
            transforms.append(score(sentence.split()))
    logging.info('Saving tf-idf information to %s', output)
    with open(output, 'w') as fp:
        cPickle.dump(corpus2csc(transforms), fp, protocol=cPickle.HIGHEST_PROTOCOL)
Example #57
0
    def add_documents(self, corpus):
        """Extend the index with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in BoW format.

        Notes
        -----
        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
        (or when a query is issued).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.mm'))
            >>> index_temp = get_tmpfile("index")
            >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
            >>>
            >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index.add_documents(one_more_corpus)  # add more documents in corpus

        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # the last shard was incomplete (< min_ratio * shardsize); load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Example #58
0
    def get_document_topics(self, bow, minimum_probability=None,
                            normalize=None):
        """Get the topic distribution for the given document.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format.
        minimum_probability : float
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c.

        Returns
        -------
        list of (int, float)
            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
            the probability that was assigned to it.

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)

        if is_corpus:
            kwargs = dict(minimum_probability=minimum_probability)
            return self._apply(corpus, **kwargs)

        v = matutils.corpus2csc([bow], self.num_tokens)
        h = self._solveproj(v, self._W, v_max=np.inf)

        if normalize is None:
            normalize = self.normalize
        if normalize:
            the_sum = h.sum()
            if the_sum:
                h /= the_sum

        return [
            (idx, proba)
            for idx, proba in enumerate(h[:, 0])
            if not minimum_probability or proba > minimum_probability
        ]
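A hedged usage sketch of this method on gensim's Nmf model (which it appears to come from), with a tiny made-up corpus:

from gensim import corpora
from gensim.models.nmf import Nmf

texts = [['sparse', 'topic', 'model'], ['topic', 'model', 'nmf'], ['sparse', 'matrix']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

nmf = Nmf(corpus=corpus, id2word=dictionary, num_topics=2, random_state=42)
print(nmf.get_document_topics(corpus[0], minimum_probability=0.01))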
Example #59
0
 def __init__(self, m, k, docs = None, algo = 'onepass', chunks = None):
     """
     Construct the (U, S) projection from a corpus `docs`. 
     
     This is the class taking care of 'core math'; interfacing with corpora, 
     chunking large corpora etc. is done through the LsiModel class.
     
     `algo` is currently one of:
     
       * 'onepass'; only a single pass over `docs` is needed
       * 'twopass'; multiple passes over the input allowed => can use a 
         faster algorithm.
     """
     self.m, self.k = m, k
     if docs is not None:
         # base case decomposition: given a job `docs`, compute its decomposition in-core
         # results of several base case decompositions can be merged via `self.merge()`
         if algo == 'twopass':
             self.u, self.s = stochasticSvd(docs, k, chunks = chunks, num_terms = m)
         elif algo == 'onepass':
             if not scipy.sparse.issparse(docs):
                 docs = matutils.corpus2csc(docs, num_terms = m)
             if docs.shape[1] <= max(k, 100):
                 # For sufficiently small chunk size, update directly like `svd(now, docs)` 
                 # instead of `svd(now, svd(docs))`.
                 # This improves accuracy and is also faster for small chunks, because
                 # we need to perform one less svd.
                 # On larger chunks this doesn't work because we quickly run out of memory.
                 self.u = docs
                 self.s = None
             else:
                 try:
                     import sparsesvd
                 except ImportError:
                     raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                 logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                 ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                 u = ut.T
                 del ut, vt
                 k = clipSpectrum(s ** 2, self.k)
                 self.u, self.s = u[:, :k], s[:k]
         else:
             raise NotImplementedError("unknown decomposition algorithm: '%s'" % algo)
     else:
         self.u, self.s = None, None