def getFull(self, length, idfs=None):
    """Return the document as a dense bag-of-words vector.

    The token ids of the document (``self.getTokenIds()``) are aggregated
    by ``utils_dml.vect2bow`` into ``tokenid -> frequency`` pairs, which
    are then scattered into a zero-initialised numpy array.

    Parameters:
        length: size passed to ``numpy.zeros``; every token id in the
            document must be a valid index into an array of that size
            (presumably the vocabulary size -- confirm with callers).
        idfs: optional weights indexable by token id. When given, each
            frequency is multiplied by ``idfs[tokenid]``.

    Returns:
        ``numpy.zeros(length)`` filled at the document's token ids: dtype
        ``int`` holding raw frequencies when ``idfs`` is None, otherwise
        dtype ``numpy.float32`` holding ``frequency * idfs[tokenid]``.
    """
    bow = utils_dml.vect2bow(self.getTokenIds())
    # Identity test on purpose: `idfs == None` is evaluated elementwise
    # when idfs is a numpy array and cannot be used as an `if` condition.
    if idfs is None:
        result = numpy.zeros(length, dtype=int)
        # items() instead of iteritems() so the loop runs on Python 2 and 3.
        for tokenid, freq in bow.items():
            result[tokenid] = freq
    else:
        result = numpy.zeros(length, dtype=numpy.float32)
        for tokenid, freq in bow.items():
            result[tokenid] = freq * idfs[tokenid]
    return result
def getFull(self, length, idfs=None):
    """Return the document as a dense bag-of-words vector.

    The token ids of the document (``self.getTokenIds()``) are aggregated
    by ``utils_dml.vect2bow`` into ``tokenid -> frequency`` pairs, which
    are then scattered into a zero-initialised numpy array.

    Parameters:
        length: size passed to ``numpy.zeros``; every token id in the
            document must be a valid index into an array of that size
            (presumably the vocabulary size -- confirm with callers).
        idfs: optional weights indexable by token id. When given, each
            frequency is multiplied by ``idfs[tokenid]``.

    Returns:
        ``numpy.zeros(length)`` filled at the document's token ids: dtype
        ``int`` holding raw frequencies when ``idfs`` is None, otherwise
        dtype ``numpy.float32`` holding ``frequency * idfs[tokenid]``.
    """
    bow = utils_dml.vect2bow(self.getTokenIds())
    # Identity test on purpose: `idfs == None` is evaluated elementwise
    # when idfs is a numpy array and cannot be used as an `if` condition.
    if idfs is None:
        result = numpy.zeros(length, dtype=int)
        # items() instead of iteritems() so the loop runs on Python 2 and 3.
        for tokenid, freq in bow.items():
            result[tokenid] = freq
    else:
        result = numpy.zeros(length, dtype=numpy.float32)
        for tokenid, freq in bow.items():
            result[tokenid] = freq * idfs[tokenid]
    return result
def getSparse(self, length, idfs=None):
    """Return the document as a sparse bag-of-words row vector.

    The result is a ``scipy.sparse.lil_matrix`` of shape ``(1, length)``
    in which ``result[0, tokenid]`` holds the (optionally weighted)
    frequency of ``tokenid`` in the document.

    Parameters:
        length: number of columns of the matrix; every token id in the
            document must be a valid column index (presumably the
            vocabulary size -- confirm with callers).
        idfs: optional weights indexable by token id. When given, each
            frequency is multiplied by ``idfs[tokenid]``.

    Returns:
        A 1 x ``length`` ``lil_matrix``: dtype ``int`` with raw
        frequencies when ``idfs`` is None, otherwise dtype
        ``numpy.float32`` with ``frequency * idfs[tokenid]``.
    """
    # Aggregate token counts first so that every matrix cell is written
    # exactly once; element access on a lil_matrix is not O(1).
    bow = utils_dml.vect2bow(self.getTokenIds())
    # Identity test on purpose: `idfs == None` is evaluated elementwise
    # when idfs is a numpy array and cannot be used as an `if` condition.
    if idfs is None:
        # The shape is passed positionally: the first constructor argument
        # is mandatory, so a keyword-only `shape=` call raises TypeError.
        result = scipy.sparse.lil_matrix((1, length), dtype=int)
        # items() instead of iteritems() so the loop runs on Python 2 and 3.
        for tokenid, freq in bow.items():
            result[0, tokenid] = freq
    else:
        result = scipy.sparse.lil_matrix((1, length), dtype=numpy.float32)
        for tokenid, freq in bow.items():
            result[0, tokenid] = freq * idfs[tokenid]
    return result
def getSparse(self, length, idfs=None):
    """Return the document as a sparse bag-of-words row vector.

    The result is a ``scipy.sparse.lil_matrix`` of shape ``(1, length)``
    in which ``result[0, tokenid]`` holds the (optionally weighted)
    frequency of ``tokenid`` in the document.

    Parameters:
        length: number of columns of the matrix; every token id in the
            document must be a valid column index (presumably the
            vocabulary size -- confirm with callers).
        idfs: optional weights indexable by token id. When given, each
            frequency is multiplied by ``idfs[tokenid]``.

    Returns:
        A 1 x ``length`` ``lil_matrix``: dtype ``int`` with raw
        frequencies when ``idfs`` is None, otherwise dtype
        ``numpy.float32`` with ``frequency * idfs[tokenid]``.
    """
    # Aggregate token counts first so that every matrix cell is written
    # exactly once; element access on a lil_matrix is not O(1).
    bow = utils_dml.vect2bow(self.getTokenIds())
    # Identity test on purpose: `idfs == None` is evaluated elementwise
    # when idfs is a numpy array and cannot be used as an `if` condition.
    if idfs is None:
        # The shape is passed positionally: the first constructor argument
        # is mandatory, so a keyword-only `shape=` call raises TypeError.
        result = scipy.sparse.lil_matrix((1, length), dtype=int)
        # items() instead of iteritems() so the loop runs on Python 2 and 3.
        for tokenid, freq in bow.items():
            result[0, tokenid] = freq
    else:
        result = scipy.sparse.lil_matrix((1, length), dtype=numpy.float32)
        for tokenid, freq in bow.items():
            result[0, tokenid] = freq * idfs[tokenid]
    return result