def __init__(self, corpus, numBest = None, dtype = numpy.float32):
    """
    Build a sparse term-document matrix over `corpus` for similarity queries.

    If `numBest` is left unspecified, similarity queries return a full list
    (one float for every document in the corpus, including the query document).

    If `numBest` is set, queries return the `numBest` most similar documents,
    as a sorted list, e.g. [(docIndex1, 1.0), (docIndex2, 0.95), ...,
    (docIndexNumBest, 0.45)].
    """
    logging.info("creating sparse matrix for %i documents" % len(corpus))
    self.numBest = numBest
    # the number of terms is unknown at this point, so start with one column;
    # the real shape is patched in after the corpus has been scanned
    self.corpus = scipy.sparse.lil_matrix((len(corpus), 1), dtype = dtype)
    self.normalize = False

    # populate the LIL matrix row by row; each document is scaled to unit
    # length so that a plain dot product equals cosine similarity
    for docNo, vector in enumerate(corpus):
        if docNo % 10000 == 0:
            logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
        vector = matutils.unitVec(vector)
        self.corpus.rows[docNo] = [termId for termId, _ in vector]
        self.corpus.data[docNo] = [dtype(val) for _, val in vector]

    # columns = highest term id in the corpus + 1; the extra [-1] guards
    # against calling max() on an empty row
    numTerms = 1 + max(max(row + [-1]) for row in self.corpus.rows)
    self.corpus._shape = (len(corpus), numTerms)

    # convert to CSR for efficient row slicing and matrix multiplications
    self.corpus = self.corpus.tocsr()
    logging.info("created %s" % repr(self.corpus))
def __init__(self, corpus, numBest=None, dtype=numpy.float32):
    """
    Construct a sparse index over `corpus` for cosine-similarity queries.

    If `numBest` is left unspecified, similarity queries return a full list
    (one float for every document in the corpus, including the query document).

    If `numBest` is set, queries return `numBest` most similar documents, as a
    sorted list, eg. [(docIndex1, 1.0), (docIndex2, 0.95), ...,
    (docIndexnumBest, 0.45)].
    """
    logging.info("creating sparse matrix for %i documents" % len(corpus))
    self.numBest = numBest
    self.normalize = False
    # one column for now; the true number of terms is discovered while scanning
    self.corpus = scipy.sparse.lil_matrix((len(corpus), 1), dtype=dtype)

    for docNo, doc in enumerate(corpus):
        if docNo % 10000 == 0:
            logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
        # unit-length rows turn cosine similarity into a simple dot product
        normalized = matutils.unitVec(doc)
        termIds = []
        weights = []
        for termId, weight in normalized:
            termIds.append(termId)
            weights.append(dtype(weight))
        self.corpus.rows[docNo] = termIds
        self.corpus.data[docNo] = weights

    # set the shape properly: columns = 1 + largest term id seen anywhere;
    # appending [-1] avoids max() blowing up on an empty row
    highestTermId = max(max(row + [-1]) for row in self.corpus.rows)
    self.corpus._shape = (len(corpus), 1 + highestTermId)

    self.corpus = self.corpus.tocsr()  # CSR: fast row slicing & multiplication
    logging.info("created %s" % repr(self.corpus))
def __init__(self, corpus, numBest = None, dtype = numpy.float32, numFeatures = None):
    """
    Build a dense document matrix over `corpus` for similarity queries.

    If `numBest` is left unspecified, similarity queries return a full list
    (one float for every document in the corpus, including the query document):

    If `numBest` is set, queries return `numBest` most similar documents,
    as a sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]
    """
    if numFeatures is None:
        # no explicit feature count supplied; derive it from the corpus itself
        logging.info("scanning corpus of %i documents to determine the number of features" % len(corpus))
        numFeatures = 1 + utils.getMaxId(corpus)
    logging.info("creating matrix for %i documents and %i features" % (len(corpus), numFeatures))

    self.numFeatures = numFeatures
    self.numBest = numBest
    self.normalize = True
    # stored in Fortran (column-major) order
    self.corpus = numpy.empty(shape = (len(corpus), numFeatures), dtype = dtype, order = 'F')

    # fill the matrix one document per row
    for docNo, sparseDoc in enumerate(corpus):
        if docNo % 1000 == 0:
            logging.info("PROGRESS: at document #%i/%i" % (docNo, len(corpus)))
        denseDoc = matutils.sparse2full(sparseDoc, numFeatures)
        self.corpus[docNo] = matutils.unitVec(denseDoc)  # unit length => dot product = cosine

    self.corpus = numpy.asmatrix(self.corpus)
def __getitem__(self, bow):
    """
    Return the tf-idf representation of the input vector `bow`
    (a list of `(termId, termFrequency)` 2-tuples).

    Unknown (new) terms are given zero weight (NOT the infinity/huge weight
    that a strict application of the IDF formula would suggest), which
    effectively drops them from the result.
    """
    vector = []
    for termId, tf in bow:
        # look the idf weight up once per term (the original looked it up
        # twice: once in the filter, once in the product)
        idf = self.idfs.get(termId, 0.0)
        if idf != 0.0:
            vector.append((termId, tf * idf))
    if self.normalize:
        vector = matutils.unitVec(vector)
    return vector
def __getitem__(self, doc):
    """
    Return similarities of `doc` to all documents in the corpus: either the
    full list of floats, or only the `self.numBest` best `(docNo, sim)` pairs,
    depending on how the index was constructed.
    """
    if self.normalize:
        doc = matutils.unitVec(doc)
    allSims = self.getSimilarities(doc)

    if self.numBest is None:
        # no cutoff requested: hand back one similarity per corpus document
        return allSims

    # keep only strictly positive similarities, best first, at most numBest
    positive = [(docNo, sim) for docNo, sim in enumerate(allSims) if sim > 0]
    positive.sort(key = lambda item: item[1], reverse = True)
    return positive[:self.numBest]
def __getitem__(self, bow):
    """
    Return the log-entropy representation of the input vector and/or corpus.

    If `bow` is in fact a whole corpus, return a lazily-transformed corpus
    instead of a single vector.

    Unknown (new) terms are given zero weight (NOT infinity/huge), i.e. they
    are simply dropped from the output vector.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.isCorpus(bow)
    if is_corpus:
        return self._apply(bow)

    vector = []
    for term_id, tf in bow:
        try:
            # single dict lookup per term (the original tested membership
            # with `in` and then fetched the same key again with .get)
            entropy = self.entr[term_id]
        except KeyError:
            continue  # unknown term => zero weight => skip
        vector.append((term_id, math.log(tf + 1) * entropy))
    if self.normalize:
        vector = matutils.unitVec(vector)
    return vector
def __getitem__(self, bow):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # NOTE(review): this treats utils.isCorpus() as returning a plain bool,
    # while other code unpacks the same helper as a (flag, corpus) 2-tuple;
    # a non-empty tuple is always truthy, so confirm which contract applies
    if utils.isCorpus(bow):
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as would the strict application of the IDF formula suggest)
    weighted = []
    for termId, tf in bow:
        idfWeight = self.idfs.get(termId, 0.0)
        if idfWeight != 0.0:
            weighted.append((termId, tf * idfWeight))
    if self.normalize:
        weighted = matutils.unitVec(weighted)
    return weighted
def __getitem__(self, doc):
    """
    Compute the similarity of `doc` against every document in the corpus.

    Returns either the complete list of similarities, or — when the index was
    built with `numBest` — only the top `numBest` `(docNo, sim)` pairs.
    """
    if self.normalize:
        doc = matutils.unitVec(doc)
    sims = self.getSimilarities(doc)
    if self.numBest is None:
        return sims
    # pair each document index with its similarity, drop non-positive scores,
    # and return the numBest highest-scoring pairs (highest cossim first)
    candidates = [(idx, value) for idx, value in enumerate(sims) if value > 0]
    best = sorted(candidates, key=lambda pair: -pair[1])
    return best[:self.numBest]
def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None):
    """
    Build a dense document matrix over `corpus` for similarity queries.

    If `numBest` is left unspecified, similarity queries return a full list
    (one float for every document in the corpus, including the query document):

    If `numBest` is set, queries return `numBest` most similar documents,
    as a sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]

    `corpus` may be None, in which case an empty (0 x numFeatures) matrix is
    created; `numFeatures` must then be given explicitly.
    """
    # Bug fix: the original guarded the fill loop with `if corpus is not None`
    # but had already called len(corpus) and numpy.empty(shape=(len(corpus), ..))
    # unconditionally, so corpus=None always raised TypeError before the guard
    # could take effect. numDocs makes the None case actually work.
    if numFeatures is None:
        logging.info(
            "scanning corpus of %i documents to determine the number of features"
            % len(corpus))
        numFeatures = 1 + utils.getMaxId(corpus)
    numDocs = len(corpus) if corpus is not None else 0
    logging.info("creating matrix for %i documents and %i features" %
                 (numDocs, numFeatures))
    self.numFeatures = numFeatures
    self.numBest = numBest
    self.corpus = numpy.empty(shape=(numDocs, numFeatures), dtype=dtype)
    self.normalize = True

    if corpus is not None:
        # iterate over the corpus, populating the numpy matrix row by row
        for docNo, vector in enumerate(corpus):
            if docNo % 1000 == 0:
                logging.info("PROGRESS: at document #%i/%i" % (docNo, numDocs))
            # densify and scale to unit length, so that dot products against
            # the matrix yield cosine similarities
            self.corpus[docNo] = matutils.unitVec(
                matutils.sparse2full(vector, numFeatures))

    self.corpus = numpy.asmatrix(self.corpus)