def get_datum_in_doc(self, doc, i):
    """Return datum ``i`` of document ``doc``, wrapped by ``asvector``.

    The most recently read document is cached on the instance
    (``current_doc``, ``current_base_index``, ``doc_data``), so repeated
    lookups in the same document do not call ``read_doc`` again.

    Args:
        doc: Document identifier; used as the row key into
            ``self.doc_index`` and passed to ``self.read_doc``.
        i: Index of the datum. ``self.doc_index[doc, 0]`` is subtracted
            from it to obtain the column inside the document's data.
            NOTE(review): presumably ``i`` is a corpus-wide index and
            ``doc_index[doc, 0]`` is the document's first one -- confirm.

    Returns:
        ``asvector(self.doc_data[:, i - self.current_base_index])``;
        the original comment describes this as a column vector.
    """
    if self.current_doc != doc:
        # Load into locals first and only then commit to the cache.  If the
        # index lookup or the read raises, the cache still describes the
        # previously loaded document rather than pairing the new ``doc``
        # with stale ``doc_data``.
        base_index = self.doc_index[doc, 0]
        doc_data = self.read_doc(doc)
        self.current_doc = doc
        self.current_base_index = base_index
        self.doc_data = doc_data
    return asvector(self.doc_data[:, i - self.current_base_index])
def corpus_mean(reader):
    """Compute the mean datum over every document exposed by ``reader``.

    Each document is fetched with ``reader.read_doc``, summed along
    axis 1, and accumulated into a float64 vector of length
    ``reader.dim``.  The accumulated total is divided by
    ``reader.num_data`` and passed through ``asvector``.

    Args:
        reader: Object providing ``dim``, ``num_docs``, ``num_data`` and
            ``read_doc(d)``.
            NOTE(review): presumably ``read_doc`` returns a
            ``(dim, n_items)`` array with one datum per column -- confirm.

    Returns:
        The result of ``asvector`` applied to the per-dimension mean.
    """
    running_total = np.zeros((reader.dim,), dtype='float64')
    for doc_id in range(reader.num_docs):
        doc_data = reader.read_doc(doc_id)
        # In-place accumulation keeps the float64 accumulator's dtype.
        running_total += doc_data.sum(axis=1)
    mean = running_total / reader.num_data
    return asvector(mean)