def __init__(self, corpus, numBest = None, dtype = numpy.float32):
    """
    Build a sparse matrix index over `corpus` for cosine-similarity queries.

    If numBest is left unspecified, similarity queries return a full list (one
    float for every document in the corpus, including the query document).

    If numBest is set, queries return numBest most similar documents, as a
    sorted list, eg. [(docIndex1, 1.0), (docIndex2, 0.95), ..., (docIndexnumBest, 0.45)].
    """
    logging.info("creating sparse matrix for %i documents" % len(corpus))
    self.numBest = numBest
    self.normalize = False

    # First pass: unit-normalize each document (so that cosine similarity
    # reduces to a plain dot product) and collect its term ids/values,
    # tracking the highest term id so the matrix can be allocated with its
    # final shape right away. This replaces the original approach of creating
    # a 1-column lil_matrix and later overwriting its private `_shape`
    # attribute, which relied on scipy internals; it also no longer raises
    # ValueError from max() over an empty corpus.
    rows, data = [], []
    highestTermId = -1
    for docNo, vector in enumerate(corpus):
        if docNo % 10000 == 0:
            logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
        vector = matutils.unitVec(vector)
        termIds = [termId for termId, _ in vector]
        rows.append(termIds)
        data.append([dtype(val) for _, val in vector])
        if termIds:
            highestTermId = max(highestTermId, max(termIds))
    numTerms = 1 + highestTermId  # no. of columns = highest term id + 1

    # allocate with the final shape, then fill the LIL rows directly
    self.corpus = scipy.sparse.lil_matrix((len(corpus), numTerms), dtype = dtype)
    for docNo in range(len(rows)):
        self.corpus.rows[docNo] = rows[docNo]
        self.corpus.data[docNo] = data[docNo]

    # convert to Compressed Sparse Row for efficient row slicing and multiplications
    self.corpus = self.corpus.tocsr()
    logging.info("created %s" % repr(self.corpus))
Example no. 2
0
    def __init__(self, corpus, numBest=None, dtype=numpy.float32):
        """
        Index `corpus` as a sparse matrix for cosine-similarity queries.

        Leaving numBest unset makes queries return one similarity per corpus
        document (query document included). With numBest set, queries return
        the numBest most similar documents as a sorted list of
        (docIndex, similarity) pairs.
        """
        logging.info("creating sparse matrix for %i documents" % len(corpus))
        self.numBest = numBest
        # the number of terms is unknown until the corpus has been scanned,
        # so start the matrix with a single column and widen it afterwards
        self.corpus = scipy.sparse.lil_matrix((len(corpus), 1), dtype=dtype)
        self.normalize = False

        # fill the sparse matrix row by row
        for docNo, document in enumerate(corpus):
            if docNo % 10000 == 0:
                logging.info("PROGRESS: at document #%i/%i" %
                             (docNo, len(corpus)))
            # unit-length vectors make cosine similarity a plain dot product
            unit = matutils.unitVec(document)
            ids, weights = [], []
            for termId, weight in unit:
                ids.append(termId)
                weights.append(dtype(weight))
            self.corpus.rows[docNo] = ids
            self.corpus.data[docNo] = weights

        # fix up the shape: no. of columns = highest term id in the corpus + 1
        # (the `+ [-1]` guards against max() over an empty row)
        numTerms = 1 + max(max(row + [-1]) for row in self.corpus.rows)
        self.corpus._shape = (len(corpus), numTerms)

        # CSR supports efficient row slicing and multiplications
        self.corpus = self.corpus.tocsr()
        logging.info("created %s" % repr(self.corpus))
 def __init__(self, corpus, numBest = None, dtype = numpy.float32, numFeatures = None):
     """
     Index `corpus` as a dense matrix for similarity queries.

     With `numBest` unset, queries return one similarity value per corpus
     document (the query document included). With `numBest` set, queries
     return the `numBest` most similar documents as a sorted list:

     >>> sms = MatrixSimilarity(corpus, numBest = 3)
     >>> sms[vec12]
     [(12, 1.0), (30, 0.95), (5, 0.45)]

     """
     if numFeatures is None:
         # a full pass over the corpus, just to find the largest feature id
         logging.info("scanning corpus of %i documents to determine the number of features" %
                      len(corpus))
         numFeatures = 1 + utils.getMaxId(corpus)

     logging.info("creating matrix for %i documents and %i features" %
                  (len(corpus), numFeatures))
     self.numFeatures = numFeatures
     self.numBest = numBest
     self.normalize = True
     # dense document matrix in column-major (Fortran) memory layout
     self.corpus = numpy.empty(shape = (len(corpus), numFeatures), dtype = dtype, order = 'F')

     # fill the matrix one document at a time
     for docNo, docVec in enumerate(corpus):
         if docNo % 1000 == 0:
             logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
         # densify, then scale to unit length so dot product == cosine similarity
         self.corpus[docNo] = matutils.unitVec(matutils.sparse2full(docVec, numFeatures))

     self.corpus = numpy.asmatrix(self.corpus)
 def __getitem__(self, bow):
     """
     Return tf-idf representation of the input vector.
     """
     # terms missing from self.idfs carry zero weight and are dropped entirely
     # (rather than receiving a huge weight, as a literal reading of the IDF
     # formula would suggest for unseen terms)
     result = []
     for termId, tf in bow:
         idf = self.idfs.get(termId, 0.0)
         if idf != 0.0:
             result.append((termId, tf * idf))
     if self.normalize:
         result = matutils.unitVec(result)
     return result
 def __getitem__(self, bow):
     """
     Return tf-idf representation of the input vector.
     """
     # terms without a (non-zero) idf weight are treated as unknown and
     # silently dropped -- NOT given a huge weight, as a strict reading of
     # the IDF formula would imply
     known = ((termId, tf, self.idfs[termId]) for termId, tf in bow if termId in self.idfs)
     vector = [(termId, tf * idf) for termId, tf, idf in known if idf != 0.0]
     if self.normalize:
         vector = matutils.unitVec(vector)
     return vector
 def __getitem__(self, doc):
     """
     Return similarities of `doc` to every document in the indexed corpus;
     either the full list, or the top `self.numBest` (docNo, sim) pairs.
     """
     if self.normalize:
         doc = matutils.unitVec(doc)
     allSims = self.getSimilarities(doc)

     # full result unless the constructor asked for only the best matches
     if self.numBest is None:
         return allSims

     # keep strictly positive similarities only, best (highest cosine) first
     positive = [(docNo, sim) for docNo, sim in enumerate(allSims) if sim > 0]
     positive.sort(key = lambda item: item[1], reverse = True)
     return positive[:self.numBest]
Example no. 7
0
    def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # a whole corpus on input => transform it document by document
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        # terms absent from self.entr are weighted zero (i.e. dropped), not
        # given an infinite/huge weight
        vector = []
        for term_id, tf in bow:
            if term_id in self.entr:
                vector.append((term_id, math.log(tf + 1) * self.entr.get(term_id)))
        if self.normalize:
            vector = matutils.unitVec(vector)
        return vector
 def __getitem__(self, bow):
     """
     Return tf-idf representation of the input vector and/or corpus.
     """
     # a whole corpus on input => transform its documents one by one
     if utils.isCorpus(bow):
         return self._apply(bow)

     # unknown (new) terms carry zero weight and are dropped outright, NOT
     # given the huge weight a strict application of the IDF formula would
     # suggest
     vector = []
     for termId, termFreq in bow:
         idfWeight = self.idfs.get(termId, 0.0)
         if idfWeight != 0.0:
             vector.append((termId, termFreq * idfWeight))
     return matutils.unitVec(vector) if self.normalize else vector
Example no. 9
0
    def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # corpora are transformed wholesale, one document at a time
        is_corpus, bow = utils.isCorpus(bow)
        if is_corpus:
            return self._apply(bow)

        # only terms with a known entropy weight survive; unseen terms are
        # implicitly weighted zero (never infinite/huge)
        known = [(term_id, tf) for term_id, tf in bow if term_id in self.entr]
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in known]
        return matutils.unitVec(vector) if self.normalize else vector
Example no. 10
0
    def __getitem__(self, doc):
        """
        Return similarities of `doc` to all indexed documents: the full list,
        or — when `self.numBest` is set — the top-`numBest` (docNo, sim) pairs.
        """
        if self.normalize:
            doc = matutils.unitVec(doc)
        sims = self.getSimilarities(doc)

        if self.numBest is None:
            return sims

        # drop non-positive similarities, then order best-first by cosine score
        ranked = sorted(((no, s) for no, s in enumerate(sims) if s > 0),
                        key=lambda pair: -pair[1])
        return ranked[:self.numBest]
Example no. 11
0
    def __init__(self,
                 corpus,
                 numBest=None,
                 dtype=numpy.float32,
                 numFeatures=None):
        """
        Index `corpus` as a dense document matrix for similarity queries.

        If `numBest` is left unspecified, similarity queries return a full list (one
        float for every document in the corpus, including the query document):

        If `numBest` is set, queries return `numBest` most similar documents, as a
        sorted list:

        >>> sms = MatrixSimilarity(corpus, numBest = 3)
        >>> sms[vec12]
        [(12, 1.0), (30, 0.95), (5, 0.45)]

        """
        # NOTE(review): the `if corpus is not None` guard below is ineffective
        # as written -- `len(corpus)` is already called unconditionally here
        # and in the numpy.empty() shape, so a None corpus would crash before
        # reaching the guard. Confirm whether corpus=None was ever a supported
        # input.
        if numFeatures is None:
            # one extra pass over the corpus, just to find the largest feature id
            logging.info(
                "scanning corpus of %i documents to determine the number of features"
                % len(corpus))
            numFeatures = 1 + utils.getMaxId(corpus)

        logging.info("creating matrix for %i documents and %i features" %
                     (len(corpus), numFeatures))
        self.numFeatures = numFeatures
        self.numBest = numBest
        # dense matrix; each row is filled below with one unit-length document vector
        self.corpus = numpy.empty(shape=(len(corpus), numFeatures),
                                  dtype=dtype)
        self.normalize = True

        if corpus is not None:
            # iterate over corpus, populating the numpy matrix
            for docNo, vector in enumerate(corpus):
                if docNo % 1000 == 0:
                    logging.info("PROGRESS: at document #%i/%i" %
                                 (docNo, len(corpus)))
                # densify the sparse document, then scale to unit length so a
                # dot product against it equals cosine similarity
                vector = matutils.unitVec(
                    matutils.sparse2full(vector, numFeatures))
                self.corpus[docNo] = vector

        # wrap as numpy.matrix -- presumably so `*` performs matrix
        # multiplication at query time; confirm at call sites
        self.corpus = numpy.asmatrix(self.corpus)