def predict_similarity(self, sa, sb):
    # Build a single (EN, JP) pair with a dummy label and push it through the
    # shared similarity function.
    q = [[sa, sb, 0]]
    x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
    ls = []
    ls2 = []
    use_noise.set_value(0.)
    for j in range(0, len(x1)):
        ls.append(embed(x1[j], 'en', W=self.W))
        ls2.append(embed(x2[j], 'jp'))
    trconv = np.dstack(ls)
    trconv2 = np.dstack(ls2)
    emb2 = np.swapaxes(trconv2, 1, 2)
    emb1 = np.swapaxes(trconv, 1, 2)
    return self.f2sim(emb1, mas1, emb2, mas2)
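# A minimal usage sketch (hypothetical, not part of the class): assuming `model`
# is a trained instance and `sa` / `sb` are an English and a Japanese article in
# the same pre-tokenized form that prepare_data expects, a single pair can be
# scored like this:
#
#     sim = model.predict_similarity(sa, sb)
#     print "predicted similarity:", sim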
def evaluate(self, data):
    x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)
    use_noise.set_value(0.)
    n_samples = len(data)
    ls = []   # Embedding results of xa
    ls2 = []  # Embedding results of xb
    for j in range(0, n_samples):
        ls.append(embed(x1[j], 'en', W=self.W))
        ls2.append(embed(x2[j], 'jp'))
    # print "ls: (should be the same ref_embed)", ls
    rank_results = []
    for i in range(0, n_samples):
        # NOTE: mas1 and mas2 are column-oriented (vertical) mask matrices.
        # ref_ls holds n_samples (e.g. 999) copies of the English article ls[i],
        # so that this one English article can be compared against every
        # candidate Japanese article in ls2; the rank of its true counterpart
        # among all candidates is the ranking result for ls[i].
        ref_ls = [ls[i]] * n_samples
        # print "ref_embed", ref_embed
        ref_mas1 = np.array([mas1[:, i]] * n_samples).T
        # print "ref_mas", ref_mas
        # print "mas1", mas1
        # return mas1, ref_mas
        trconv = np.dstack(ref_ls)
        trconv2 = np.dstack(ls2)
        emb2 = np.swapaxes(trconv2, 1, 2)
        emb1 = np.swapaxes(trconv, 1, 2)
        pred = self.f2sim(emb1, ref_mas1, emb2, mas2)
        rank = pd.Series(pred).rank(ascending=False)[i]
        rank_results.append(rank)
        print "the round", i, "rank:", rank
    return rank_results
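# A hedged usage sketch: assuming `test_pairs` is a list of [english_doc,
# japanese_doc, score] triples (the same format prepare_data consumes), evaluate
# returns, for every English article, the rank of its true Japanese counterpart
# among all candidates; pandas can then summarize the ranking distribution:
#
#     ranks = model.evaluate(test_pairs)
#     print pd.Series(ranks).describe()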
def get_mse(self, data):
    x1, mas1, x2, mas2, y2 = prepare_data(data, self.maxlen)
    # print "Finish preparing the data!"
    use_noise.set_value(0.)
    n_samples = len(data)
    ls = []   # Embedding results of xa
    ls2 = []  # Embedding results of xb
    for j in range(0, n_samples):
        ls.append(embed(x1[j], 'en', W=self.W))
        ls2.append(embed(x2[j], 'jp'))
    # print "Finished embedding, start projecting..."
    # start_time = time.time()
    # for i in range(0, n_samples):
    #     print "conducting the", i, "projection"
    #     loop_time = time.time()
    trconv = np.dstack(ls)
    trconv2 = np.dstack(ls2)
    emb1 = np.swapaxes(trconv, 1, 2)
    emb2 = np.swapaxes(trconv2, 1, 2)
    # list saving the projection results (50 dim):
    # list_projection1 = self.f_proj11(emb1, mas1)
    # list_projection2 = self.f_proj11(emb2, mas2)
    c = self.f_cost(emb1, mas1, emb2, mas2, y2)
    # After projection, compare the distance for possible pairs
    # ## SKIP
    return c
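# Usage sketch (assumes the whole set can be embedded at once, since get_mse
# does not chunk the data the way chkterr2 does); `dev_pairs` is a hypothetical
# list of [english_doc, japanese_doc, score] triples:
#
#     cost = model.get_mse(dev_pairs)   # value of f_cost over the full set
#     print "cost:", cost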
def chkterr2(self, mydata):
    # Batched evaluation: returns (MSE, Pearson r, Spearman r) between the
    # predicted and the gold similarity scores over the whole data set.
    # count = []
    num = len(mydata)
    px = []
    yx = []
    use_noise.set_value(0.)
    for i in range(0, num, 256):
        q = []
        x = i + 256
        if x > num:
            x = num
        for j in range(i, x):
            q.append(mydata[j])
        x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
        ls = []
        ls2 = []
        for j in range(0, len(q)):
            ls.append(embed(x1[j], 'en', W=self.W))
            ls2.append(embed(x2[j], 'jp'))
        trconv = np.dstack(ls)
        trconv2 = np.dstack(ls2)
        emb2 = np.swapaxes(trconv2, 1, 2)
        emb1 = np.swapaxes(trconv, 1, 2)
        # Map the [0, 1] similarity onto the 1-5 score scale
        pred = (self.f2sim(emb1, mas1, emb2, mas2)) * 4.0 + 1.0
        # dm1 = np.ones(mas1.shape, dtype=np.float32)
        # dm2 = np.ones(mas2.shape, dtype=np.float32)
        # corr = f_cost(emb1, mas1, emb2, mas2, y2)
        for z in range(0, len(q)):
            yx.append(y2[z])
            px.append(pred[z])
            # count.append(corr)
    px = np.array(px)
    yx = np.array(yx)
    # print "average error= " + str(np.mean(acc))
    return np.mean(np.square(px - yx)), meas.pearsonr(px, yx)[0], meas.spearmanr(yx, px)[0]
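# Usage sketch: chkterr2 processes the data in chunks of 256 pairs, so it also
# works when the full set does not fit in memory at once. `dev_pairs` below is a
# hypothetical list of [english_doc, japanese_doc, score] triples:
#
#     mse, pearson, spearman = model.chkterr2(dev_pairs)
#     print "MSE:", mse, "Pearson:", pearson, "Spearman:", spearman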
def train_lstm(self, train, max_epochs, correct, test_correct, batchsize=32):
    print "Training"
    print "the length of the training data is ", len(train)
    # test = train
    print "Batchsize =", batchsize
    print "max_epochs =", max_epochs
    lrate = 0.0001    # Learning rate, passed to f_update in the inner loop
    freq = 0          # update counter (used together with dfreq below)
    batchsize = 64    # NOTE: overrides the batchsize argument above
    dfreq = 21        # display frequency
    self.mse = []        # MSE of train1 + train2
    self.rank = []
    self.tops = {}
    self.mse_test = []   # MSE of test1
    self.mse_train = []  # MSE of train1
    self.rank_test = []
    self.tops_test = {}
    self.top_keys = [1, 5, 10]
    print "Before training, the error is:"
    # print self.chkterr2(train)  # MSE check
    cst_all = self.chkterr2(train)[0] / 16
    self.mse.append(cst_all)
    cst_test = self.chkterr2(test_correct)[0] / 16
    self.mse_test.append(cst_test)
    cst_train = self.chkterr2(correct)[0] / 16
    self.mse_train.append(cst_train)
    # NOTE: use chkterr2 when memory is short (slower); with enough memory,
    #       self.get_mse(train) can be used instead.
    # NOTE: do not report the cst variable as the cost -- it is only the cost
    #       of the last batch, not of the whole training set.
    print "Training error:", cst_all  #, "==", self.get_mse(train)
    print "Training_correct error", cst_train
    print "Testing_correct error:", cst_test

    # Saving (initialization) the ranking and top-1/5/10 information (training data)
    rank_results_train, n_tops = self.evaluate2(correct, tops=self.top_keys)  # Similarity check
    # print "[debug]", n_tops
    for top_key in self.top_keys:
        # print "[debug]", n_tops[top_key]
        self.tops[top_key] = []
        self.tops[top_key].append(n_tops[top_key])
        print "top-", top_key, "=", self.tops[top_key], ":", n_tops[top_key]
    print "Description of evaluation (ranking) for training data:"
    print pd.Series(rank_results_train).describe()

    # Saving (initialization) the ranking and top-1/5/10 information (testing data)
    rank_results_test, n_tops_test = self.evaluate2(test_correct, tops=self.top_keys)  # Similarity check
    for top_key in self.top_keys:
        self.tops_test[top_key] = []
        self.tops_test[top_key].append(n_tops_test[top_key])
        print "top-", top_key, "=", self.tops_test[top_key], ":", n_tops_test[top_key]
    print "Description of evaluation (ranking) for testing data:"
    print pd.Series(rank_results_test).describe()

    # eidx -> index of epoch
    for eidx in xrange(0, max_epochs):
        sta = time.time()
        print ""
        print 'Epoch', eidx, '...'
        num = len(train)  # length of training data
        # --------------------- Shuffle the data ------------------------------#
        # Why not simply use random.shuffle here?
        # random.sample generates a permutation of xrange(num) with length num,
        # used to shuffle the training data at the start of each epoch,
        # e.g. [5, 2, 6, 11, ...] with length len(train).
        rnd = random.sample(xrange(num), num)
        # i steps through batch start indices (0, batchsize, 2*batchsize, ...)
        # Iterate over all batches
        for i in range(0, num, batchsize):
            q = []
            x = i + batchsize
            if x > num:
                x = num
            # Iterate over the samples inside each batch
            # i -> start index of the batch
            # x -> end index of the batch
            for z in range(i, x):
                # collect the shuffled training samples into the list q
                q.append(train[rnd[z]])
            # ---------------------------------------------------------------------#
            """
            The mask for the LSTM is prepared by the sentence module, e.g.:
            x1   = np.array([["我", "很", "好", ",", ...], [...], ...])
            len(x1) => total number of documents
            mas1 = np.array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [...], ...])
            """
            x1, mas1, x2, mas2, y2 = prepare_data(q, self.maxlen)
            ls = []
            ls2 = []
            freq += 1
            use_noise.set_value(1.)
            for j in range(0, len(x1)):
                ls.append(embed(x1[j], 'en', W=self.W))
                ls2.append(embed(x2[j], 'jp'))
            trconv = np.dstack(ls)
            trconv2 = np.dstack(ls2)
            emb2 = np.swapaxes(trconv2, 1, 2)
            emb1 = np.swapaxes(trconv, 1, 2)
            cst = self.f_grad_shared(emb2, mas2, emb1, mas1, y2)
            s = self.f_update(lrate)  # apply the parameter update (return value unused)
            if np.mod(freq, dfreq) == 0:
                print 'Epoch ', eidx, 'Update ', freq, 'Cost ', cst

        # Evaluation at the end of each epoch
        # print self.chkterr2(train)  # MSE check
        cst_all = self.chkterr2(train)[0] / 16
        self.mse.append(cst_all)
        cst_test = self.chkterr2(test_correct)[0] / 16
        self.mse_test.append(cst_test)
        cst_train = self.chkterr2(correct)[0] / 16
        self.mse_train.append(cst_train)
        # NOTE: use chkterr2 when memory is short (slower); with enough memory,
        #       self.get_mse(train) can be used instead.
        # NOTE: do not report the cst variable as the cost -- it is only the cost
        #       of the last batch, not of the whole training set. Wrong usage:
        #       print "Training error:", cst, "=", self.chkterr2(train)[0]/16, "==", self.get_mse(train)
        print "Training error:", cst_all  #, "==", self.get_mse(train)
        print "Training_correct error", cst_train
        print "Testing_correct error:", cst_test

        # Saving the ranking and top-1/5/10 information (training data)
        rank_results_train, n_tops = self.evaluate2(correct, tops=self.top_keys)  # Similarity check
        self.rank.append(rank_results_train)
        for top_key in self.top_keys:
            self.tops[top_key].append(n_tops[top_key])
            print "top-", top_key, "=", self.tops[top_key], ":", n_tops[top_key]
        print "Description of evaluation (ranking) for training data:"
        print pd.Series(rank_results_train).describe()

        # Saving the ranking and top-1/5/10 information (testing data)
        rank_results_test, n_tops_test = self.evaluate2(test_correct, tops=self.top_keys)  # Similarity check
        self.rank_test.append(rank_results_test)
        for top_key in self.top_keys:
            self.tops_test[top_key].append(n_tops_test[top_key])
            print "top-", top_key, "=", self.tops_test[top_key], ":", n_tops_test[top_key]
        print "Description of evaluation (ranking) for testing data:"
        print pd.Series(rank_results_test).describe()

        # Saving the present weights:
        self.save_model(name=self.model_name + "_" + str(eidx) + ".p")
        sto = time.time()
        self.time_saver = sto - sta
        print "epoch took:", self.time_saver
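# A hedged end-to-end sketch: `train_pairs` drives the gradient updates, while
# `correct_pairs` and `test_pairs` are the pair lists used for the per-epoch MSE
# and ranking reports (all hypothetical names, in the same triple format that
# prepare_data expects):
#
#     model.train_lstm(train_pairs, max_epochs=10,
#                      correct=correct_pairs, test_correct=test_pairs)
#     # learning curves: model.mse, model.mse_train, model.mse_test
#     # top-1/5/10 counts per epoch: model.tops, model.tops_test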