def _initialize(self, dataE, dataW, dataToken):
    start = datetime.now()

    self.theta = probNormalize(np.random.random([self.E, self.K]))
    self.phi = probNormalize(np.random.random([self.K, self.V]))
    self.esp = []
    self.z = []
    z_dist = np.sum(self.theta, axis=0) / self.E
    for d in range(self.D):
        Nd = self.Nd[d]
        gamma = dataE[d]
        self.esp.append(multinomial(gamma, Nd))
        self.z.append(multinomial(z_dist, Nd))

    self.TE = np.zeros([self.K, self.E], dtype=np.int32)
    self.TV = np.zeros([self.K, self.V], dtype=np.int32)
    for d in range(self.D):
        docToken = dataToken[d]
        doc_z = self.z[d]
        doc_esp = self.esp[d]
        for n in range(self.Nd[d]):
            w = docToken[n]
            w_z = doc_z[n]
            w_esp = doc_esp[n]
            self.TE[w_z, w_esp] += 1
            self.TV[w_z, w] += 1
    self.TI = np.sum(self.TV, axis=1)
    self.IE = np.sum(self.TE, axis=0)

    duration = datetime.now() - start
    print("_initialize() takes %fs" % duration.total_seconds())
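# The initializers and samplers in this file lean on two small helpers,
# probNormalize and multinomial. The sketches below are inferred from the
# call sites only (single index for document-level draws, an index array for
# per-token draws); the real implementations in the utilities module may
# differ.

import numpy as np

def probNormalize(distributions):
    # Normalize along the last axis so each row sums to 1.
    distributions = np.asarray(distributions, dtype=np.float64)
    return distributions / np.sum(distributions, axis=-1, keepdims=True)

def multinomial(prob, size=None):
    # Draw category indices from `prob`: a single index when size is None
    # (document-level z), or an array of `size` indices (per-token z / y / esp).
    if size is None:
        return int(np.random.choice(len(prob), p=prob))
    return np.random.choice(len(prob), size=size, p=prob).astype(np.int8)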
def _initialize(self, dataDUE, dataW, dataToken):
    start = datetime.now()

    self.theta = probNormalize(np.random.random([self.K]))
    self.pi = probNormalize(np.random.random([2]))
    self.eta = probNormalize(np.random.random([self.K, self.G, self.E]) + 0.1)
    self.phiB = probNormalize(np.random.random([self.V]))
    self.phiT = probNormalize(np.random.random([self.K, self.V]))
    self.psi = probNormalize(np.random.random([self.U, self.G]))

    self.z = np.zeros([self.D], dtype=np.int8)
    self.y = []
    self.x = []
    for d in range(self.D):
        self.z[d] = multinomial(self.theta)
        self.y.append(multinomial(self.pi, self.Nd[d]))
        ## time consuming, replaced with below ##
        # doc_x = []
        # for m in range(self.Md[d]):
        #     u = np.random.randint(0, self.U)
        #     doc_x.append(multinomial(self.psi[u]))
        # self.x.append(np.array(doc_x, dtype=np.int8))
        self.x.append(multinomial(self.psi[0], self.Md[d]))

    duration = datetime.now() - start
    self._log("_initialize() takes %fs" % duration.total_seconds())
def _initialize(self, dataDUE):
    start = datetime.now()

    self.theta = probNormalize(np.random.random([self.D, self.K]))
    self.phi = probNormalize(np.random.random([self.K, self.V]))
    self.eta = np.random.random([self.K, self.E])

    self.z = []
    for d in range(self.D):
        z_dist = self.theta[d]
        Nd = self.Nd[d]
        self.z.append(multinomial(z_dist, Nd))
    self.eta_beta_inv = multivariateBeta_inv(self.eta)

    self.TI = np.zeros([self.D, self.K], dtype=np.int32)
    self.TV = np.zeros([self.K, self.V], dtype=np.int32)
    self.dataE_smoothed = {}
    for docdata in dataDUE.generate():
        d, docToken, [doc_u, doc_e] = docdata
        doc_z = self.z[d]
        for n in range(self.Nd[d]):
            w = docToken[n]
            w_z = doc_z[n]
            self.TI[d, w_z] += 1
            self.TV[w_z, w] += 1
        doc_E = np.sum(np.identity(self.E, dtype=np.float64)[:, doc_e], axis=1)
        docE = probNormalize(doc_E + SMOOTH_FACTOR)
        self.dataE_smoothed[d] = docE

    duration = datetime.now() - start
    self._log("_initialize() takes %fs" % duration.total_seconds())
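# _initialize() above also assumes a multivariateBeta_inv helper. From the way
# eta_beta_inv is later multiplied with prod(docE ** (eta - 1)), it appears to
# return, per row eta_k, the Dirichlet normalizing constant 1 / B(eta_k). A
# hedged sketch (the actual helper may work in log space or differ in detail):

import numpy as np
from scipy.special import gammaln

def multivariateBeta_inv(eta):
    eta = np.asarray(eta, dtype=np.float64)
    # log(1 / B(eta_k)) = lgamma(sum_e eta_ke) - sum_e lgamma(eta_ke), per row
    log_inv_beta = gammaln(np.sum(eta, axis=-1)) - np.sum(gammaln(eta), axis=-1)
    return np.exp(log_inv_beta)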
def _doc_z_update(self, d, doc_u, doc_e, docW, docToken):
    """ update document-level topic """
    doc_z = self.z[d]
    doc_XE = self.DXE[d]
    doc_Y1V = self.DY1V.getrow(d).tocsr()
    doc_Y1V_array = doc_Y1V.toarray().squeeze()

    # calculate leave-one-out statistics #
    TI_no_d, TXE_no_d, Y1TV_no_d = self.TI, self.TXE, self.Y1TV
    TI_no_d[doc_z] += -1
    TXE_no_d[doc_z, :, :] += -doc_XE
    Y1TV_no_d[doc_z, :] += -doc_Y1V_array
    # conditional probability #
    prob_doc_z = self._prob_doc_z(TI_no_d, TXE_no_d, Y1TV_no_d, doc_XE, doc_Y1V)
    # new sampled result #
    doc_z_new = int(multinomial(prob_doc_z))
    # update #
    self.z[d] = doc_z_new
    TI_no_d[doc_z_new] += 1
    TXE_no_d[doc_z_new, :, :] += doc_XE
    Y1TV_no_d[doc_z_new, :] += doc_Y1V_array
    self.TI, self.TXE, self.Y1TV = TI_no_d, TXE_no_d, Y1TV_no_d
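# Note on the leave-one-out pattern in _doc_z_update() and the samplers below:
# an assignment such as TI_no_d, TXE_no_d, Y1TV_no_d = self.TI, self.TXE, self.Y1TV
# binds new names to the *same* numpy arrays rather than copying them, so the
# in-place -=/+= edits already mutate self.TI etc., and the final reassignment
# back to the attributes is effectively a no-op. A tiny illustration:
#
#     >>> a = np.array([3, 3, 3])
#     >>> b = a            # alias, not a copy
#     >>> b[0] += -1
#     >>> a
#     array([2, 3, 3])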
def _GibbsSamplingLocal(self, dataE, dataW, dataToken, epoch):
    """ Gibbs sampling word-level topic """
    pbar = tqdm(range(self.D), total=self.D, desc='({0:^3})'.format(epoch))
    for d in pbar:  # sequentially sampling
        doc_Nd = self.Nd[d]
        docE = probNormalize(dataE[d] + SMOOTH_FACTOR)
        docToken = dataToken[d]
        for n in range(doc_Nd):
            w = docToken[n]
            w_z = self.z[d][n]

            ## sampling ##
            # calculate leave-one-out statistics #
            TI_no_dn, TV_no_dn = self.TI, self.TV
            TI_no_dn[d, w_z] += -1
            TV_no_dn[w_z, w] += -1
            # conditional probability #
            prob_pa = TI_no_dn[d] + self.alpha
            prob_pb = np.divide(TV_no_dn[:, w] + self.beta,
                                np.sum(TV_no_dn + self.beta, axis=1))
            prob_pc = np.multiply(self.eta_beta_inv,
                                  np.prod(np.power(docE, self.eta - 1), axis=1))
            prob_w_z = probNormalize(prob_pa * prob_pb * prob_pc)
            # new sampled result #
            w_z_new = multinomial(prob_w_z)
            # update #
            self.z[d][n] = w_z_new
            TI_no_dn[d, w_z_new] += 1
            TV_no_dn[w_z_new, w] += 1
            self.TI, self.TV = TI_no_dn, TV_no_dn
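# For context, a hedged sketch of how _initialize() and _GibbsSamplingLocal()
# are typically driven in a collapsed Gibbs sampler of this kind. The fit()
# signature and the _estimateGlobal name are hypothetical, not taken from this
# code; the real training loop may add burn-in, thinning, or checkpointing.

def fit(self, dataE, dataW, dataToken, n_epoch=100):
    self._initialize(dataE, dataW, dataToken)
    for epoch in range(n_epoch):
        self._GibbsSamplingLocal(dataE, dataW, dataToken, epoch)
        self._estimateGlobal()  # hypothetical: re-estimate theta/phi/eta from counts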
def _initialize(self, dataE, dataW, dataToken):
    start = datetime.now()

    self.theta = probNormalize(np.random.random([self.D, self.K]))
    self.phi = probNormalize(np.random.random([self.K, self.V]))
    self.eta = np.random.random([self.K, self.E])

    self.z = []
    for d in range(self.D):
        z_dist = self.theta[d]
        Nd = self.Nd[d]
        self.z.append(multinomial(z_dist, Nd))
    self.eta_beta_inv = multivariateBeta_inv(self.eta)

    self.TI = np.zeros([self.D, self.K], dtype=np.int32)
    self.TV = np.zeros([self.K, self.V], dtype=np.int32)
    for d in range(self.D):
        docToken = dataToken[d]
        doc_z = self.z[d]
        for n in range(self.Nd[d]):
            w = docToken[n]
            w_z = doc_z[n]
            self.TI[d, w_z] += 1
            self.TV[w_z, w] += 1

    duration = datetime.now() - start
    print("_initialize() takes %fs" % duration.total_seconds())
def _initialize(self, dataE, dataW, dataToken):
    start = datetime.now()

    self.theta = probNormalize(np.random.random([self.K]))
    self.pi = probNormalize(np.random.random([2]))
    self.eta = probNormalize(np.random.random([self.K, self.E]))
    self.phiB = probNormalize(np.random.random([self.V]))
    self.phiT = probNormalize(np.random.random([self.K, self.V]))

    self.z = np.zeros([self.D], dtype=np.int8)
    self.y = []
    for d in range(self.D):
        self.z[d] = multinomial(self.theta)
        Nd = self.Nd[d]
        self.y.append(multinomial(self.pi, Nd))

    duration = datetime.now() - start
    print("_initialize() takes %fs" % duration.total_seconds())
def _GibbsSamplingLocal(self, dataE, dataW, dataToken, epoch):
    """ Gibbs sampling word-level emotion and topic """
    pbar = tqdm(range(self.D), total=self.D, desc='({0:^3})'.format(epoch))
    for d in pbar:  # sequentially sampling
        doc_Nd = self.Nd[d]
        docE = dataE[d]
        docToken = dataToken[d]
        for n in range(doc_Nd):
            w = docToken[n]
            w_z = self.z[d][n]
            w_esp = self.esp[d][n]

            ## sampling ##
            # calculate leave-one-out statistics #
            TE_no_dn, TV_no_dn, TI_no_dn, IE_no_dn = self.TE, self.TV, self.TI, self.IE
            TE_no_dn[w_z, w_esp] += -1
            TV_no_dn[w_z, w] += -1
            TI_no_dn[w_z] += -1
            IE_no_dn[w_esp] += -1
            # conditional probability #
            prob_w_esp = np.divide(np.multiply((self.alpha + TE_no_dn[w_z]), docE),
                                   (self.K * self.alpha + IE_no_dn))
            prob_w_esp = probNormalize(prob_w_esp)
            prob_w_z = np.divide(np.multiply((self.alpha + TE_no_dn[:, w_esp]),
                                             (self.beta + TV_no_dn[:, w])),
                                 (self.V * self.beta + TI_no_dn))
            prob_w_z = probNormalize(prob_w_z)
            # new sampled result #
            w_esp_new = multinomial(prob_w_esp)
            w_z_new = multinomial(prob_w_z)
            # update #
            self.z[d][n] = w_z_new
            self.esp[d][n] = w_esp_new
            TE_no_dn[w_z_new, w_esp_new] += 1
            TV_no_dn[w_z_new, w] += 1
            TI_no_dn[w_z_new] += 1
            IE_no_dn[w_esp_new] += 1
            self.TE, self.TV, self.TI, self.IE = TE_no_dn, TV_no_dn, TI_no_dn, IE_no_dn
def _GibbsSamplingLocal(self, dataDUE, epoch):
    """ Gibbs sampling word-level topic """
    pbar = tqdm(dataDUE.generate(), total=self.D_train, desc='({0:^3})'.format(epoch))
    for docdata in pbar:  # sequentially sampling
        d, docToken, [doc_u, doc_e] = docdata
        doc_Nd = self.Nd[d]
        if d in self.dataE_smoothed:
            docE = self.dataE_smoothed[d]
        else:
            doc_E = np.sum(np.identity(self.E, dtype=np.float64)[:, doc_e], axis=1)
            docE = probNormalize(doc_E + SMOOTH_FACTOR)
            self.dataE_smoothed[d] = docE
        for n in range(doc_Nd):
            w = docToken[n]
            w_z = self.z[d][n]

            ## sampling ##
            # calculate leave-one-out statistics #
            TI_no_dn, TV_no_dn = self.TI, self.TV
            TI_no_dn[d, w_z] += -1
            TV_no_dn[w_z, w] += -1
            # conditional probability #
            prob_pa = TI_no_dn[d] + self.alpha
            prob_pb = np.divide(TV_no_dn[:, w] + self.beta,
                                np.sum(TV_no_dn + self.beta, axis=1))
            prob_pc = np.multiply(self.eta_beta_inv,
                                  np.prod(np.power(docE, self.eta - 1), axis=1))
            prob_w_z = probNormalize(prob_pa * prob_pb * prob_pc)
            # new sampled result #
            w_z_new = multinomial(prob_w_z)
            # update #
            self.z[d][n] = w_z_new
            TI_no_dn[d, w_z_new] += 1
            TV_no_dn[w_z_new, w] += 1
            self.TI, self.TV = TI_no_dn, TV_no_dn
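# The dataDUE-based methods above only require that dataDUE.generate() yield
# (d, docToken, [doc_u, doc_e]) per training document. A hedged stub of that
# interface for testing the sampler in isolation; the class name, constructor,
# and field names are illustrative, not the real data loader:

class DataDUEStub(object):
    def __init__(self, dataToken, dataU, dataELabels):
        # dataToken[d]: token ids of document d
        # dataU[d], dataELabels[d]: per-reaction user ids and emotion labels
        self.dataToken = dataToken
        self.dataU = dataU
        self.dataELabels = dataELabels

    def generate(self):
        for d in range(len(self.dataToken)):
            yield d, self.dataToken[d], [self.dataU[d], self.dataELabels[d]]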
def _GibbsSamplingLocal(self, dataE, dataW, dataToken, epoch):
    """ Gibbs sampling word-level background-vs-topic and document-level topic """
    pbar = tqdm(range(self.D), total=self.D, desc='({0:^3})'.format(epoch))
    for d in pbar:  # sequentially sampling
        doc_Nd = self.Nd[d]
        docE = dataE[d]
        docW = dataW[d]
        docToken = dataToken[d]
        doc_z = self.z[d]

        # intermediate parameters calculation #
        Y1T = np.sum(self.Y1TV, axis=1)

        for n in range(doc_Nd):
            w = docToken[n]
            w_y = self.y[d][n]

            ## sampling for y ##
            # calculate leave-one-out statistics #
            YI_no_dn_y, Y0V_no_dn_y, Y1TV_no_dn_y = self.YI, self.Y0V, self.Y1TV
            Y1T_no_dn_y = Y1T
            YI_no_dn_y[w_y] += -1
            if w_y == 0:
                Y0V_no_dn_y[w] += -1
            elif w_y == 1:
                Y1TV_no_dn_y[doc_z, w] += -1
                Y1T_no_dn_y[doc_z] += -1
                self.DY1V[d, w] += -1  # delete w_y == 1 word
            else:
                raise ValueError("w_y not 0 or 1")
            # conditional probability #
            prob_w_y_unnorm = np.zeros([2], dtype=np.float32)
            prob_w_y_unnorm[0] = (self.delta + YI_no_dn_y[0]) * (self.beta + Y0V_no_dn_y[w]) / \
                                 (self.V * self.beta + YI_no_dn_y[0])
            prob_w_y_unnorm[1] = (self.delta + YI_no_dn_y[1]) * (self.beta + Y1TV_no_dn_y[doc_z, w]) / \
                                 (self.V * self.beta + Y1T_no_dn_y[doc_z])
            prob_w_y = probNormalize(prob_w_y_unnorm)
            # new sampled result #
            try:
                w_y_new = multinomial(prob_w_y)
            except ValueError as e:
                print(prob_w_y_unnorm)
                print(prob_w_y)
                print(np.sum(prob_w_y), np.sum(prob_w_y) > 1.0)
                print(YI_no_dn_y, self.YI, Y0V_no_dn_y[w], Y1TV_no_dn_y[doc_z, w], Y1T_no_dn_y[doc_z])
                print(d)
                raise e
            # update #
            self.y[d][n] = w_y_new
            YI_no_dn_y[w_y_new] += 1
            if w_y_new == 0:
                Y0V_no_dn_y[w] += 1
            elif w_y_new == 1:
                Y1TV_no_dn_y[doc_z, w] += 1
                Y1T_no_dn_y[doc_z] += 1
                self.DY1V[d, w] += 1  # add back word with w_y_new == 1
            else:
                raise ValueError("w_y not 0 or 1")
            self.YI, self.Y0V, self.Y1TV = YI_no_dn_y, Y0V_no_dn_y, Y1TV_no_dn_y
            Y1T = Y1T_no_dn_y

        doc_Y1V = self.DY1V.getrow(d).tocsr()
        doc_Y1V_array = doc_Y1V.toarray().squeeze()

        ## sampling for z ##
        # calculate leave-one-out statistics #
        TE_no_d_z, Y1TV_no_d_z, TI_no_d_z = self.TE, self.Y1TV, self.TI
        TE_no_d_z[doc_z, :] += -docE
        Y1TV_no_d_z[doc_z, :] += -doc_Y1V_array
        TI_no_d_z[doc_z] += -1
        # conditional probability #
        prob_doc_z = self._prob_doc_z(TE_no_d_z, Y1TV_no_d_z, TI_no_d_z, docE, docW, doc_Y1V)
        # new sampled result #
        doc_z_new = multinomial(prob_doc_z)
        # update #
        self.z[d] = doc_z_new
        TE_no_d_z[doc_z_new, :] += docE
        Y1TV_no_d_z[doc_z_new, :] += doc_Y1V_array
        TI_no_d_z[doc_z_new] += 1
        self.TE, self.Y1TV, self.TI = TE_no_d_z, Y1TV_no_d_z, TI_no_d_z