def _initialise(self, corpus):
    """Set up the Gibbs-sampling state for LDA from a doc-term matrix.

    Allocates the four count arrays (topic-word, topic totals, doc-topic,
    doc totals), flattens *corpus* into parallel per-token (doc, word)
    lists, then draws a uniformly random starting topic for every token
    and records it in the counts.
    """
    self.D, self.V = D, V = corpus.shape
    num_topics = self.num_topic
    # Count arrays: topic-word, topic totals, doc-topic, doc totals.
    self._n_zw = np.zeros((num_topics, V), dtype=np.intc)
    self._n_z = np.zeros((num_topics), dtype=np.intc)
    self._n_dz = np.zeros((D, num_topics), dtype=np.intc)
    self._n_d = np.zeros((D), dtype=np.intc)
    # Total token count across the corpus.
    self._n = total_tokens = int(corpus.sum())
    # One (doc, word) pair per token occurrence.
    self.d_list, self.w_list = util.array2list(corpus)
    self.z_list = []
    self.log_likelihood = np.zeros(self.num_iter, dtype=float)
    # Random initial topic assignment for every token, updating counts.
    for tok in range(total_tokens):
        doc = self.d_list[tok]
        word = self.w_list[tok]
        topic = np.random.randint(0, num_topics, dtype=np.intc)
        self.z_list.append(topic)
        self._n_zw[topic][word] += 1   # topic-word count
        self._n_z[topic] += 1          # topic-word sum
        self._n_dz[doc][topic] += 1    # doc-topic count
        self._n_d[doc] += 1            # doc-topic sum
def compress(self):
    """Flush the buffered rows into the running Gaussian projection
    (``A_curr``) and/or replace the QR buffer with the R factor.

    Fix: compare ``A_curr`` to None with ``is None`` — ``== None`` on a
    numpy matrix is an elementwise comparison that returns an array,
    which is ambiguous (and can raise) in boolean context.
    """
    # Nothing buffered, or width unknown: nothing to flush.
    if self.ncols is None or len(self.data) == 0:
        return
    if self.compute_GP:
        t0 = time.time()
        # Random Gaussian sketch of the buffered block.
        G = np.random.randn(self.projsize, len(self.data)) / 100.
        A_flush = G * np.mat(self.data)
        dt = time.time() - t0
        self.counters['numpy time (millisecs)'] += int(1000 * dt)
        # Add flushed update to local copy
        if self.A_curr is None:  # was: == None (elementwise on matrices)
            self.A_curr = A_flush
        else:
            self.A_curr += A_flush
    if self.compute_QR:
        t0 = time.time()
        R = self.QR()
        dt = time.time() - t0
        self.counters['numpy time (millisecs)'] += int(1000 * dt)
        # reset data and re-initialize to R
        self.qr_data = []
        for row in R:
            self.qr_data.append(util.array2list(row))
    # Buffered rows have been consumed by the step(s) above.
    self.data = []
def compress(self):
    """Collapse everything buffered so far into its R factor.

    Runs a QR factorization over the accumulated rows, records the numpy
    time, and restarts the buffers seeded with the rows of R.
    """
    start = time.time()
    R = self.QR()
    elapsed = time.time() - start
    self.counters['numpy time (millisecs)'] += int(1000 * elapsed)
    # Start over: the buffers now hold only the compressed factor.
    self.data = []
    self.A_data = []
    self.data.extend(util.array2list(r_row) for r_row in R)
def compress(self):
    """Compress buffered rows via QR once a full block has accumulated.

    Fix: test ``self.ncols`` against None with ``is None`` instead of
    ``== None`` — identity, not equality, is the correct None check.
    """
    # Not enough rows buffered yet (or width unknown): wait for more.
    if self.ncols is None or len(self.data) < self.ncols:
        return
    t0 = time.time()
    R = self.QR()
    dt = time.time() - t0
    # NOTE(review): dt is measured but never recorded; sibling versions add
    # it to counters['numpy time (millisecs)'] — confirm whether that was
    # intended here before wiring it up.
    # reset data and re-initialize to R
    self.data = []
    for row in R:
        self.data.append(util.array2list(row))
def compress(self):
    """QR-compress the accumulated rows, replacing them with R's rows."""
    # Need a known width and at least ncols buffered rows before a
    # factorization is worthwhile.
    if self.ncols is None or len(self.data) < self.ncols:
        return
    start = time.time()
    R = self.QR()
    self.counters['numpy time (millisecs)'] += int(1000 * (time.time() - start))
    # The buffer restarts holding only the compressed factor.
    self.data = []
    self.data.extend(util.array2list(r_row) for r_row in R)
def close(self):
    """Final flush at end of input: compress leftovers, then emit results.

    Generator yielding, depending on the configured modes: the Gaussian
    projection rows ``('GP', i)``, the QR rows under random keys
    ``('QR', key)``, and the column norms ``('colnorms', i)``.

    Fix: compare ``A_curr``/``colnorms`` to None with ``is not None`` —
    ``!= None`` on a numpy matrix/array is elementwise and raises
    "truth value is ambiguous" when used in an ``if``.
    """
    # Account for the rows processed since the last 50000-row tick.
    self.counters['rows processed'] += self.nrows % 50000
    self.compress()
    if self.compute_GP:
        if self.A_curr is not None:  # was: != None (elementwise on matrices)
            for ind, row in enumerate(self.A_curr.getA()):
                yield ('GP', ind), util.array2list(row)
    if self.compute_QR:
        for i, row in enumerate(self.qr_data):
            # Random key spreads QR rows across reducers.
            key = np.random.randint(0, 4000000000)
            yield ('QR', key), row
    if self.compute_colnorms and self.colnorms is not None:
        for ind, val in enumerate(self.colnorms):
            yield ('colnorms', ind), val
def output(self, final=False):
    """Emit (key, row) pairs of U once a full block has accumulated.

    Rows are flushed either on demand (*final*) or when the buffer holds
    ``blocksize * ncols`` rows; both buffers are reset afterwards.
    """
    ready = final or len(self.data) >= self.blocksize * self.ncols
    if not ready:
        return
    self.counters['Blocks Output'] += 1
    # compress the data
    if self.ncols is None:
        return
    start = time.time()
    block = numpy.array(self.data)
    U = self.compute_U(block)
    self.counters['numpy time (millisecs)'] += int(1000 * (time.time() - start))
    assert (U.shape[0] == len(self.keys))
    for key, u_row in zip(self.keys, U):
        yield key, util.array2list(u_row)
    self.data = []
    self.keys = []
def output(self, final=False):
    """Yield (key, row) pairs of U for a completed block, then reset.

    A block is complete when *final* is set or the buffer has reached
    ``blocksize * ncols`` rows.
    """
    if not final and len(self.data) < self.blocksize * self.ncols:
        return
    self.counters['Blocks Output'] += 1
    # compress the data
    if self.ncols is None:
        return
    t_start = time.time()
    U = self.compute_U(numpy.array(self.data))
    t_elapsed = time.time() - t_start
    self.counters['numpy time (millisecs)'] += int(1000 * t_elapsed)
    assert (U.shape[0] == len(self.keys))
    for idx in range(len(self.keys)):
        yield self.keys[idx], util.array2list(U[idx])
    self.data = []
    self.keys = []
def close(self):
    """Final flush: compress leftovers and emit the projection rows.

    Generator yielding ``(row_index, row_as_list)`` for each row of the
    accumulated projection ``A_curr`` (nothing if no rows were seen).
    """
    # Account for the rows processed since the last 50000-row tick.
    self.counters['rows processed'] += self.nrows % 50000
    self.compress()
    if self.A_curr is None:
        return
    for idx, vec in enumerate(self.A_curr.getA()):
        yield idx, util.array2list(vec)