Example #1
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunks=256):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.info("scanning corpus to determine the number of features")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunks = chunks

        if corpus is not None:
            logger.info("creating matrix for %s documents and %i features" %
                         (len(corpus), num_features))
            self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno, len(corpus))
                self.index[docno] = matutils.unitvec(matutils.sparse2full(vector, num_features))
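A minimal usage sketch of the class this constructor belongs to (assuming gensim's MatrixSimilarity; the toy bag-of-words corpus is made up for illustration):

    from gensim import similarities

    # three documents as sparse (feature_id, weight) pairs
    corpus = [[(0, 1.0), (1, 2.0)], [(1, 1.0), (2, 1.0)], [(0, 0.5)]]
    index = similarities.MatrixSimilarity(corpus, num_features=3)

    # querying returns one cosine similarity per indexed document
    query = [(0, 1.0), (2, 1.0)]
    print(list(index[query]))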
Example #2
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        num_features : int
            Size of the dictionary (number of features).
        corpus_len : int, optional
            Number of documents in `corpus`. If not specified, will scan the corpus to determine the matrix size.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Datatype to store the internal matrix in.

        """
        if num_features is None:
            logger.warning(
                "scanning corpus to determine the number of features (consider setting `num_features` explicitly)"
            )
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        if corpus_len is None:
            corpus_len = len(corpus)

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError(
                    "cannot index a corpus with zero features (you must specify either `num_features` "
                    "or a non-empty corpus in the constructor)"
                )
            logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
            self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
                # individual documents may in fact be in numpy or scipy.sparse format as well.
                # it's not documented because otherwise it's not fully supported throughout.
                # the user had better know what they're doing (no normalization, must
                # explicitly supply num_features, etc.).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector
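The fallback branch above relies on two real gensim helpers; a small sketch of what they do to a bag-of-words vector:

    import numpy
    from gensim import matutils

    bow = [(0, 3.0), (2, 4.0)]            # sparse (id, weight) pairs
    dense = matutils.sparse2full(bow, 4)  # densify: [3., 0., 4., 0.]
    unit = matutils.unitvec(dense)        # L2-normalize: [0.6, 0., 0.8, 0.]
    print(numpy.linalg.norm(unit))        # 1.0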
Example #3
    def __init__(self,
                 corpus,
                 num_best=None,
                 dtype=numpy.float32,
                 num_features=None,
                 chunksize=256,
                 corpus_len=None):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.warning(
                "scanning corpus to determine the number of features (consider setting `num_features` explicitly)"
            )
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        if corpus_len is None:
            corpus_len = len(corpus)

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError(
                    "cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)"
                )
            logger.info("creating matrix with %i documents and %i features",
                        corpus_len, num_features)
            self.index = numpy.empty(shape=(corpus_len, num_features),
                                     dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno,
                                 corpus_len)
                # individual documents may in fact be in numpy or scipy.sparse format as well.
                # it's not documented because otherwise it's not fully supported throughout.
                # the user had better know what they're doing (no normalization, must
                # explicitly supply num_features, etc.).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(
                        matutils.sparse2full(vector, num_features))
                self.index[docno] = vector
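The `corpus_len` parameter matters when the corpus is a generator, which has no len(); note that stock gensim's MatrixSimilarity does not take this parameter, so this sketch assumes the variant shown here:

    def stream_corpus():
        yield [(0, 1.0)]
        yield [(1, 2.0), (2, 1.0)]

    # the matrix size must be supplied up front for a one-pass stream
    index = MatrixSimilarity(stream_corpus(), num_features=3, corpus_len=2)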
Example #4
    def __init__(self, corpus, document_titles, num_clusters=None,
                 num_features=None):
        """
        Computes the interpreter matrix by calculating the TF-IDF value of each
        token in each concept (doc) in corpus.
        
        document_titles give the names of each concept (doc) in corpus.
        num_features gives the number of features of corpus
        
        If num_clusters == None all documents are used as concepts.
        """

        if not num_clusters:
            self.num_clusters = len(document_titles)
        else:
            self.num_clusters = num_clusters

        if num_features is None:
            logger.info("scanning corpus to determine the number of features")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features

        # reduce column count by k-medoid clustering and using the medoid of each cluster
        # TODO: skip clustering when num_clusters is None
        clusterer = KMedoids(corpus=corpus,
                             num_features=self.num_features,
                             num_clusters=self.num_clusters,
                             max_iterations=10)
        clusters = clusterer.cluster()

        # set the corpus to the medoids.
        # the corpus is our interpreter matrix. It is not sparse;
        # each column is a doc and is seen as a concept.
        self.corpus = clusterer.get_medoids().T

        # reduce document titles
        self.document_titles = DocumentTitles()
        for cluster_id in clusters:
            self.document_titles.append(document_titles[cluster_id] or "no title")

        # log clusters with their members
        for cluster_id, members in clusters.items():
            cluster_title = document_titles[cluster_id]
            member_titles = ", ".join(document_titles[member_id]
                                      for member_id
                                      in members)
            logger.debug("%s: %s", cluster_title, member_titles)
Example #5
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        if corpus_len is None:
            corpus_len = len(corpus)

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError(
                    "cannot index a corpus with zero features (you must specify either `num_features` "
                    "or a non-empty corpus in the constructor)"
                )
            logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
            self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
                # individual documents may in fact be in numpy or scipy.sparse format as well.
                # it's not documented because otherwise it's not fully supported throughout.
                # the user had better know what they're doing (no normalization, must
                # explicitly supply num_features, etc.).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector
Example #6
    def __init__(self,
                 corpus,
                 document_titles,
                 test_corpus,
                 test_corpus_targets,
                 num_test_corpus,
                 num_best_features=1000,
                 num_features=None,
                 tmp_path='complete_similarity'):
        """
        The similarity between a document and each document of the corpus is
        the feature created.

        The corpus is filtered for significant features.

        Parameters
        ----------
        corpus : The corpus which contains all concepts. E.g. English Wikipedia
                 in TF-IDF space.
        test_corpus : The test corpus is used to select features.
                      All documents in this corpus should be classified.
        test_corpus_targets : The target classes of each document in the
                              test corpus.
        num_test_corpus : Number of documents in the test corpus.
        document_titles : The names of each concept (doc) in corpus.
        num_features : The number of features of corpus
        num_best_features : Number of features which should be selected for cESA model.
                            If one wants to use all concepts as features the she has
                            to set num_best_features to the size of corpus.
        """

        if num_features is None:
            logger.info("scanning corpus to determine the number of features")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features

        # create similarity index of complete corpus
        complete_similarity_index = Similarity(output_prefix=tmp_path,
                                               corpus=corpus,
                                               num_features=self.num_features)

        # reduce concept count by feature selection
        self.selector = iSelectKBest(if_classif, k=num_best_features)

        # transform each document of test_corpus
        logger.info("Test corpus of %d documents...", num_test_corpus)

        transformed_test_corpus = (complete_similarity_index[doc]
                                   for doc
                                   in test_corpus)

        logger.info("Select best features...")
        X_y = zip(transformed_test_corpus, test_corpus_targets)
        self.selector.fit(X_y, len(document_titles))

        logger.info("Done selecting.")

        # reduce similarity index
        selected_documents = [doc
                              for doc, mask
                              in zip(corpus, self.selector.get_support())
                              if mask]
        self.similarity_index = MatrixSimilarity(corpus=selected_documents,
                                                 num_features=self.num_features)

        # reduce document titles
        self.document_titles = DocumentTitles()
        for doc_title, mask in zip(document_titles,
                                   self.selector.get_support()):
            if mask:
                self.document_titles.append(doc_title)

        # log doc titles
        for title in self.document_titles:
            logger.debug("%s", title)
Example #7
    corpora.MmCorpus.serialize('corpus/'+doc_file.split('/')[-1]+str(num_topics_lsi)+\
            '.lda.mm._f'+str(freq_th), corpus_lda)
    return corpus_lda

if __name__ == '__main__':
    if create_dictionary:
        dictionary = create_dict(dict_file)
        print("***Dictionary is created")
    else:
        dictionary = corpora.Dictionary.load('dictionary/'+dict_file.split('/')[-1]+'.gensim')
        print("***Dictionary is loaded")

    if create_corpus_bow:
        corpus_bow = create_corp_bow(doc_file)
        print("***BOW corpus is created")
        num_features = 1 + utils.get_max_id(corpus_bow)
        index_bow = similarities.Similarity(output_prefix=None, corpus=corpus_bow, num_features=num_features)
        print("***BOW corpus index is created")
        index_bow.save('index/'+doc_file.split('/')[-1]+'.bow.index._f'+str(freq_th))
    else:
        #load BOW corpus
        corpus_bow = corpora.MmCorpus('corpus/'+doc_file.split('/')[-1]+'.bow.mm._f'+str(freq_th))
        print("***BOW corpus is loaded")
        index_bow = similarities.Similarity.load('index/'+doc_file.split('/')[-1]+'.bow.index._f'+\
                str(freq_th))
        print("***BOW corpus index is loaded")
        dictionary = corpora.Dictionary.load('dictionary/'+doc_file.split('/')[-1]+'.dict._f'+\
                str(freq_th))
        print("***Dictionary2 is loaded")

    if create_corpus_tfidf: