def train_model(self):
    for epoch in range(self.num_epochs):
        # Generate training instances
        user_input, item_input_pos, item_input_neg = data_gen._get_pairwise_all_data(
            self.dataset)
        total_loss = 0.0
        training_start_time = time()
        num_training_instances = len(user_input)
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            bat_users, bat_items_pos, bat_items_neg = \
                data_gen._get_pairwise_batch_data(user_input, item_input_pos,
                                                  item_input_neg, num_batch,
                                                  self.batch_size)
            feed_dict = {self.users: bat_users,
                         self.pos_items: bat_items_pos,
                         self.node_dropout: [0.1],
                         self.mess_dropout: [0.1],
                         self.neg_items: bat_items_neg}
            loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        training_start_time = time()
        print('solving for user vectors...')
        for userid in range(self.num_users):
            feed = {self.user_id: [userid],
                    self.Pu: self.Pui[userid].T.reshape([-1, 1]),
                    self.Cu: self.Cui[userid].T.reshape([-1, 1])}
            self.sess.run(self.update_user, feed_dict=feed)
        print('solving for item vectors...')
        for itemid in range(self.num_items):
            feed = {self.item_id: [itemid],
                    self.Pi: self.Pui[:, itemid].reshape([-1, 1]),
                    self.Ci: self.Cui[:, itemid].reshape([-1, 1])}
            self.sess.run(self.update_item, feed_dict=feed)
        print('iteration %i finished in %f seconds' % (epoch + 1, time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    gen_batch_index = np.arange(self.num_users)
    np.random.shuffle(gen_batch_index)
    dis_batch_index = np.arange(self.num_users)
    np.random.shuffle(dis_batch_index)
    totalEpochs = self.epochs
    totalEpochs = int(totalEpochs / self.step_G)
    for epoch in range(totalEpochs):
        train_matrix, ZR_matrix, PM_matrix = self.get_train_data()
        # training discriminator
        for d_epoch in range(self.step_D):
            for idx in np.arange(0, self.num_users, step=self.batchSize_D):
                idx = dis_batch_index[idx:idx + self.batchSize_D]
                train_data = train_matrix[idx].toarray()
                train_mask = PM_matrix[idx].toarray()
                feed = {self.realData: train_data,
                        self.mask: train_mask,
                        self.condition: train_data}
                self.sess.run(self.trainer_D, feed_dict=feed)
        # training generator
        for g_epoch in range(self.step_G):
            for idx in np.arange(0, self.num_users, step=self.batchSize_G):
                # generator batches use the generator permutation
                idx = gen_batch_index[idx:idx + self.batchSize_G]
                train_data = train_matrix[idx].toarray()
                train_z_mask = ZR_matrix[idx].toarray()
                train_p_mask = PM_matrix[idx].toarray()
                feed = {self.realData: train_data,
                        self.condition: train_data,
                        self.mask: train_p_mask,
                        self.G_ZR_dims: train_z_mask}
                self.sess.run(self.trainer_G, feed_dict=feed)
        self.eval_rating_matrix()
        Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        # Generate training instances
        user_input, item_input_pos, item_input_social, item_input_neg, suk_input = \
            self._get_pairwise_all_data()
        total_loss = 0.0
        training_start_time = time()
        num_training_instances = len(user_input)
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            id_start = num_batch * self.batch_size
            id_end = (num_batch + 1) * self.batch_size
            if id_end > num_training_instances:
                id_end = num_training_instances
            bat_users = user_input[id_start:id_end]
            bat_items_pos = item_input_pos[id_start:id_end]
            bat_items_social = item_input_social[id_start:id_end]
            bat_items_neg = item_input_neg[id_start:id_end]
            bat_suk_input = suk_input[id_start:id_end]
            feed_dict = {self.user_input: bat_users,
                         self.item_input: bat_items_pos,
                         self.item_input_social: bat_items_social,
                         self.item_input_neg: bat_items_neg,
                         self.suk: bat_suk_input}
            loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        # Generate training instances
        user_input, item_input, lables = self._get_input_all_data()
        total_loss = 0.0
        training_start_time = time()
        num_training_instances = len(user_input)
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            id_start = num_batch * self.batch_size
            id_end = (num_batch + 1) * self.batch_size
            if id_end > num_training_instances:
                id_end = num_training_instances
            bat_users = user_input[id_start:id_end].tolist()
            bat_items = item_input[id_start:id_end].tolist()
            bat_lables = np.array(lables[id_start:id_end])
            feed_dict = {self.one_hot_u: bat_users,
                         self.one_hot_v: bat_items,
                         self.lables: bat_lables}
            loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        # Generate training instances
        if self.ispairwise.lower() == "true":
            user_input, item_input_pos, item_input_neg = data_gen._get_pairwise_all_data(self.dataset)
        else:
            user_input, item_input, lables = data_gen._get_pointwise_all_data(self.dataset, self.num_negatives)
        total_loss = 0.0
        training_start_time = time()
        num_training_instances = len(user_input)
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            if self.ispairwise.lower() == "true":
                bat_users, bat_items_pos, bat_items_neg = \
                    data_gen._get_pairwise_batch_data(user_input, item_input_pos,
                                                      item_input_neg, num_batch,
                                                      self.batch_size)
                feed_dict = {self.user_input: bat_users,
                             self.item_input: bat_items_pos,
                             self.item_input_neg: bat_items_neg}
            else:
                bat_users, bat_items, bat_lables = \
                    data_gen._get_pointwise_batch_data(user_input, item_input,
                                                       lables, num_batch,
                                                       self.batch_size)
                feed_dict = {self.user_input: bat_users,
                             self.item_input: bat_items,
                             self.lables: bat_lables}
            loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        batches = self.shuffle()
        num_batch = len(batches[1])
        batch_index = np.arange(num_batch)
        training_start_time = time()
        total_loss = 0.0
        for index in batch_index:
            user_input, num_idx, item_input, labels = self.batch_gen(batches, index)
            feed_dict = {self.user_input: user_input,
                         self.num_idx: num_idx,
                         self.item_input: item_input,
                         self.labels: labels,
                         self.is_train_phase: True}
            loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_batch,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        # Generate training instances
        user_input, item_input, item_input_recents, lables = data_gen._get_pointwise_all_highorder_data(
            self.dataset, self.high_order, self.num_negatives)
        num_training_instances = len(user_input)
        total_loss = 0.0
        training_start_time = time()
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            bat_users, bat_items, bat_items_recents, bat_lables = \
                data_gen._get_pointwise_batch_seqdata(user_input, item_input,
                                                      item_input_recents, lables,
                                                      num_batch, self.batch_size)
            feed_dict = {self.user_input: bat_users,
                         self.item_input: bat_items,
                         self.item_input_recents: bat_items_recents,
                         self.lables: bat_lables}
            loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for _ in range(self.epochs):
        for _ in range(self.d_epoch):
            users_list, items_list, labels_list = self.get_train_data()
            self.training_discriminator(users_list, items_list, labels_list)
        for _ in range(self.g_epoch):
            self.training_generator()
        Evaluate.test_model(self, self.dataset)
def run(self, data_filename, do_eval=None, output_dir=None):
    """Run the two-step ExreadCluster algorithm (no edge information).

    Parameters:
        row_header (String[]): column names
        rows (String[[]]): array of string arrays.

    Output:
        data2rep (int array): cluster assignment for each data point
        r_emb (K * emb_size): representative embedding
        emb_tensor (N * emb_size): fine-tuned input embedding
        cls_loss (float): clustering loss
    """
    accuracies = []
    # Get data.
    row_header, rows = FileLoader(data_filename)
    target_rows, aux_rows, gold, spans = self.get_basics(row_header, rows)
    # Generate batches.
    row_iter, aux_c_sizes, aux_weights = DataLoader(target_rows, aux_rows,
                                                    self.batch_size, self.w2v_model)
    # Get the initial embedding tensor as the average w2v embedding.
    emb_tensor = self.w2v_model.get_emb_matrix(target_rows)
    if do_eval is not None:
        with open(os.path.join(output_dir, "emb_before"), "wb") as handle:
            pickle.dump((emb_tensor, spans, gold), handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Run the base cluster module.
    print("Run 1st module: clustering algorithm")
    c_emb, labels, cls_loss = self.cluster_module.run(emb_tensor)
    # Run the embedding fine-tuning module.
    print("Run 2nd module: refine embedding")
    enc, emb_loss, emb_labels = self.emb_module.run(aux_c_sizes, row_iter,
                                                    LabelLoader(labels, self.batch_size),
                                                    c_emb, spans, gold, output_dir,
                                                    aux_weights=aux_weights)
    print("****cluster loss: %f; emb loss: %f****" % (cls_loss, emb_loss))
    # Update the embedding tensor.
    emb_tensor = enc.data
    if do_eval is not None:
        accuracies.append(Evaluate(labels, gold, do_eval))
    print("Run 3rd module: refinement by clustering algorithm")
    # Final refinement.
    c_emb, labels, cls_loss = self.cluster_module.run(emb_tensor)
    labels = self.post_processing(row_header, rows, labels)
    if do_eval is not None:
        accuracies.append(Evaluate(labels, gold, do_eval))
        with open(os.path.join(output_dir, "emb_after"), "wb") as handle:
            pickle.dump((emb_tensor, spans, gold), handle, protocol=pickle.HIGHEST_PROTOCOL)
    return row_header, rows, labels, c_emb, emb_tensor, cls_loss, accuracies
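# A minimal usage sketch for the run() pipeline above (not part of the original
# code). The wrapper class name ExreadClusterPipeline, its constructor arguments,
# and the file paths are assumptions for illustration only; the signature and
# return values of run() come from the method above.
#
# pipeline = ExreadClusterPipeline(batch_size=64, w2v_model=w2v_model,
#                                  cluster_module=cluster_module,
#                                  emb_module=emb_module)
# header, rows, labels, c_emb, emb_tensor, cls_loss, accs = pipeline.run(
#     "data/table.tsv", do_eval="accuracy", output_dir="output")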
def train_model(self):
    update_count = 0.0  # the total number of gradient updates for annealing
    # largest annealing parameter
    for epoch in range(self.num_epochs):
        random_perm_doc_idx = np.random.permutation(self.num_users)
        self.total_batch = self.num_users
        total_loss = 0.0
        training_start_time = time()
        num_training_instances = self.num_users
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            if num_batch == self.total_batch - 1:
                batch_set_idx = random_perm_doc_idx[num_batch * self.batch_size:]
            elif num_batch < self.total_batch - 1:
                batch_set_idx = random_perm_doc_idx[num_batch * self.batch_size:
                                                    (num_batch + 1) * self.batch_size]
            batch_matrix = np.zeros((len(batch_set_idx), self.num_items))
            if self.total_anneal_steps > 0:
                anneal = min(self.anneal_cap, 1. * update_count / self.total_anneal_steps)
            else:
                anneal = self.anneal_cap
            batch_uid = 0
            trainDict = self.dataset.trainDict
            for userid in batch_set_idx:
                items_by_userid = trainDict[userid]
                for itemid in items_by_userid:
                    batch_matrix[batch_uid, itemid] = 1
                batch_uid = batch_uid + 1
            feed_dict = {self.input_ph: batch_matrix,
                         self.keep_prob_ph: 0.5,
                         self.anneal_ph: anneal,
                         self.is_training_ph: 1}
            _, loss = self.sess.run([self.optimizer, self.loss], feed_dict=feed_dict)
            total_loss += loss
            update_count += 1
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def evaluate(self, verbose=0):
    evalu = Evaluate(self.model_names, self.X_train, self.y_preds,
                     self.config['evaluate'], verbose=verbose)
    evalu.fit()
    self.metrics = evalu.metrics
    self.boundary_points = evalu.boundary_points
    # self.y_preds[model] = clustering(self.models[model], self.X_train,
    #                                  adjust_label=adjust_label, verbose=verbose)
def __init__(self, args, data, ckpt_path):
    # seq_len, xvocab_size, label_size, ckpt_path, pos_size, type_size, data
    self.opt = args
    self.num_steps = 120
    self.num_class = 2
    self.word_num = data.word_size
    self.ckpt_path = ckpt_path
    self.pos_size = data.pos_size
    self.type_size = data.type_size
    self.util = Util()
    sys.stdout.write('Building Graph ')
    self._build_model(args, embedding_matrix=data.pretrained)
    sys.stdout.write('graph built\n')
    self.eval = Evaluate()
def test(self, data, labels):
    predictions = []
    cnt_correct = 0
    for idx, doc in enumerate(data):
        _, prediction = self.predict(doc)
        predictions.append(prediction)
        true_label = labels[idx]
        if true_label == prediction:
            cnt_correct += 1
    eval = Evaluate(predictions, labels, self.num_of_cls)
    metrics = {'macro': eval.calc_macro_metrics(),
               'micro': eval.calc_micro_metrics()}
    return predictions, metrics
def make_table(ordered_algorithms, evaluation_functions):
    result = PrettyTable()
    result.add_column('Algorithm', [])
    for func_name in [Evaluate.str_mean(name, rank) for name, rank in evaluation_functions]:
        result.add_column(func_name, [])
    for info in ordered_algorithms:
        result.add_row([info[1].get_name()] + [x for x in info[0]])
    return result
def train_model(self):
    for epoch in range(self.num_epochs):
        # Generate training instances
        mask_corruption_np = np.random.binomial(1, 1 - self.corruption_level,
                                                (self.num_users, self.num_items))
        random_perm_doc_idx = np.random.permutation(self.num_users)
        self.total_batch = self.num_users
        total_loss = 0.0
        training_start_time = time()
        num_training_instances = self.num_users
        for num_batch in np.arange(int(num_training_instances / self.batch_size)):
            if num_batch == self.total_batch - 1:
                batch_set_idx = random_perm_doc_idx[num_batch * self.batch_size:]
            elif num_batch < self.total_batch - 1:
                batch_set_idx = random_perm_doc_idx[num_batch * self.batch_size:
                                                    (num_batch + 1) * self.batch_size]
            batch_matrix = np.zeros((len(batch_set_idx), self.num_items))
            batch_uid = 0
            trainDict = self.dataset.trainDict
            for userid in batch_set_idx:
                items_by_userid = trainDict[userid]
                for itemid in items_by_userid:
                    batch_matrix[batch_uid, itemid] = 1
                batch_uid = batch_uid + 1
            feed_dict = {self.mask_corruption: mask_corruption_np[batch_set_idx, :],
                         self.input_R: batch_matrix}
            _, loss = self.sess.run([self.optimizer, self.loss], feed_dict=feed_dict)
            total_loss += loss
        print("[iter %d : loss : %f, time: %f]" % (epoch + 1,
                                                   total_loss / num_training_instances,
                                                   time() - training_start_time))
        if epoch % self.verbose == 0:
            Evaluate.test_model(self, self.dataset)
def train_model(self):
    for epoch in range(self.num_epochs):
        random_row_idx = np.random.permutation(self.num_users)  # randomly permute the rows
        random_col_idx = np.random.permutation(self.num_items)  # randomly permute the cols
        training_start_time = time()
        total_loss = 0.0
        for i in range(self.num_batch_U):  # iterate each batch
            if i == self.num_batch_U - 1:
                row_idx = random_row_idx[i * self.batch_size:]
            else:
                row_idx = random_row_idx[(i * self.batch_size):((i + 1) * self.batch_size)]
            for j in range(self.num_batch_I):  # get the indices of the current batch
                if j == self.num_batch_I - 1:
                    col_idx = random_col_idx[j * self.batch_size:]
                else:
                    col_idx = random_col_idx[(j * self.batch_size):((j + 1) * self.batch_size)]
                p_input, n_input = self.pairwise_neg_sampling(row_idx, col_idx)
                input_tmp = self.train_R[row_idx, :]
                input_tmp = input_tmp[:, col_idx]
                input_R_U = self.train_R[row_idx, :]
                input_R_I = self.train_R[:, col_idx]
                _, loss = self.sess.run(  # do the optimization by the minibatch
                    [self.optimizer, self.cost],
                    feed_dict={self.input_R_U: input_R_U,
                               self.input_R_I: input_R_I,
                               self.input_OH_I: self.I_OH_mat[col_idx, :],
                               self.input_P_cor: p_input,
                               self.input_N_cor: n_input,
                               self.row_idx: np.reshape(row_idx, (len(row_idx), 1)),
                               self.col_idx: np.reshape(col_idx, (len(col_idx), 1))})
                total_loss += loss
        print("[iter %d : total_loss : %f, time: %f]" % (epoch + 1, total_loss,
                                                         time() - training_start_time))
        if epoch % self.verbose == 0:
            self.eval_rating_matrix()
            Evaluate.test_model(self, self.dataset)
def run_evaluation(dataset_path, predictors, additional_roots=None,
                   max_number_of_queries=None, folds_num=5,
                   evaluation_functions=(('precision', 1), ('precision', 3), ('precision', 5),
                                         ('ndcg', 1), ('ndcg', 3), ('ndcg', 5),
                                         ('dcg', 1), ('dcg', 3), ('dcg', 5))):
    evaluation_results = [np.zeros(len(evaluation_functions)) for i in range(len(predictors))]
    for fold in load.load_dataset(dataset_path, additional_roots, max_number_of_queries, folds_num):
        (x_train, y_train, id_train), (x_test, y_test, id_test) = fold
        for index_predictor, predictor in enumerate(predictors):
            # sys.stderr.write(predictor.get_name() + '\n')
            # sys.stderr.flush()
            y_pred = predictor.learn_predict(x_train, y_train, x_test)
            for index_function, (func_type, rank) in enumerate(evaluation_functions):
                evaluation_results[index_predictor][index_function] += Evaluate.mean(
                    func_type, rank, y_test, y_pred, id_test)
    evaluation_results = [result / folds_num for result in evaluation_results]
    return evaluation_results
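# A minimal usage sketch tying run_evaluation() and make_table() together (not
# part of the original code). MeanPredictor is a hypothetical baseline used only
# for illustration: run_evaluation() requires each predictor to expose
# learn_predict(x_train, y_train, x_test), and make_table() additionally calls
# get_name(). The dataset path below is a placeholder.
class MeanPredictor(object):
    def get_name(self):
        return 'mean-baseline'

    def learn_predict(self, x_train, y_train, x_test):
        # score every test example with the mean training relevance
        return np.full(len(x_test), np.mean(y_train))

# eval_funcs = (('precision', 1), ('precision', 3), ('precision', 5),
#               ('ndcg', 1), ('ndcg', 3), ('ndcg', 5),
#               ('dcg', 1), ('dcg', 3), ('dcg', 5))
# predictors = [MeanPredictor()]
# results = run_evaluation('path/to/dataset', predictors, folds_num=5,
#                          evaluation_functions=eval_funcs)
# ordered = sorted(zip(results, predictors), key=lambda pair: -pair[0][0])
# print(make_table(ordered, eval_funcs))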
def algorithm_evaluation():
    rd = ReadData(500000, 1000, 100)
    (sparse_ratings, books_used, deleted_users) = rd.load_ratings_data()

    # Obtain the filled matrix using iterative singular value thresholding and view mse per iteration
    a = Algorithms(sparse_ratings)
    (ratings_with_nan, filled_ratings_isvt) = a.isvt()
    e = Evaluate(5, 10)
    mse_isvt = e.performance_eval_isvt(ratings_with_nan, filled_ratings_isvt)

    # Obtain the filled matrix using non-negative matrix factorization and view mse per iteration
    filled_ratings_nmf = a.nmf()
    mse_nmf = e.performance_eval_nmf(sparse_ratings, filled_ratings_nmf)

    # Vary hold-out set and find average mse for each algorithm
    hos = [5, 10, 15, 20]
    e_isvt = []
    e_nmf = []
    for i in hos:
        e2 = Evaluate(5, i)
        mse_isvt = e2.performance_eval_isvt(ratings_with_nan, filled_ratings_isvt, plot=False)
        e_isvt.append(mse_isvt)
        mse_nmf = e2.performance_eval_nmf(sparse_ratings, filled_ratings_nmf, plot=False)
        e_nmf.append(mse_nmf)

    plt.plot(hos, e_isvt, label="Soft Impute")
    plt.plot(hos, e_nmf, label="NMF")
    plt.xlabel("Hold-Out Set %")
    plt.ylabel("Mean Squared Error")
    plt.legend()
    plt.show()
def clusteringDCT(pred_true_txt_ind_prevPreds, wordVectorsDic, batchDocs, maxPredLabel):
    print("#m-stream-cleaned")
    Evaluate(pred_true_txt_ind_prevPreds)

    pred_true_text_ind_prevPreds_to_cluster, pred_true_text_ind_prevPreds_to_not_cluster = extrcatLargeClusterItems(
        pred_true_txt_ind_prevPreds)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))

    '''minPredToC, maxPredToC, minTrueToC, maxTrueToC=findMinMaxLabel(pred_true_text_ind_prevPreds_to_cluster)
    print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_cluster)")
    print(minPredToC, maxPredToC, minTrueToC, maxTrueToC)
    minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC=findMinMaxLabel(pred_true_text_ind_prevPreds_to_not_cluster)
    print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_not_cluster)")
    print(minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC)'''

    all_pred_clusters = len(groupTxtByClass(pred_true_txt_ind_prevPreds, False))
    pred_clusters = len(groupTxtByClass(pred_true_text_ind_prevPreds_to_cluster, False))
    non_pred_clusters = len(groupTxtByClass(pred_true_text_ind_prevPreds_to_not_cluster, False))
    print("#clusters=" + str(pred_clusters))
    print("#not clusters=" + str(non_pred_clusters))
    print("this clustering with embedding DCT")

    pred_clusters = non_pred_clusters - pred_clusters
    print("#update clusters=" + str(pred_clusters))

    nparr = np.array(pred_true_text_ind_prevPreds_to_cluster)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    texts = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    prevPreds = list(nparr[:, 4])

    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)

    '''dicDocFreq=getDocFreq(texts)
    dctCoffs=1
    X=generate_sent_vecs_toktextdata_DCT(texts, wordVectorsDic, 300,dctCoffs)
    #vectorizer = TfidfVectorizer(tokenizer=stem_text,max_df=0.5,min_df=1)
    #vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english')
    #X = vectorizer.fit_transform(texts)'''

    '''svd = TruncatedSVD(50)
    #svd = PCA(n_components=50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    #X=X.toarray()
    X = lsa.fit_transform(X)'''

    '''km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100,random_state=0)
    km.fit(X)
    list_km_pred_true_text=combine_pred_true_txt_from_list(km.labels_, trues, texts)
    print("#k-means")
    Evaluate(list_km_pred_true_text)'''

    '''ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text=combine_pred_true_txt_from_list(ward.labels_, trues, texts)
    print("#hr-ward-DCT")
    print(min(ward.labels_), max(ward.labels_))
    pred_true_text_ind_prevPreds_to_not_cluster_hr=change_pred_label(pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters+1)
    Evaluate(list_hr_pred_true_text)
    Evaluate(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)'''

    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)

    ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
    list_hr_pred_true_text_ind_prevPred = np.column_stack(
        (ward.labels_, trues, texts, inds, prevPreds)).tolist()
    print("#hr-ward-AVG")
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_hr_pred_true_text_ind_prevPred)
    Evaluate(list_hr_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_hr)
    # print_by_group(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)

    print("#spectral-avg")
    clustering = SpectralClustering(n_clusters=pred_clusters, assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text_ind_prevPred = np.column_stack(
        (clustering.labels_, trues, texts, inds, prevPreds)).tolist()
    pred_true_text_ind_prevPreds_to_not_cluster_spec = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_sp_pred_true_text_ind_prevPred)
    Evaluate(list_sp_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_spec)
def randomization_test(self, labels, y1, y2, epoch=1000):
    import random
    e = Evaluate(labels, y1, self.num_of_cls)
    f_1 = e.calc_micro_metrics()['f1']
    e = Evaluate(labels, y2, self.num_of_cls)
    f_2 = e.calc_micro_metrics()['f1']
    s = abs(f_1 - f_2)
    cnt = 0
    for i in range(0, epoch):
        # randomly swap the two systems' predictions on each example
        temp_y1 = []
        temp_y2 = []
        for idx in range(len(labels)):
            if random.uniform(0, 1) > 0.5:
                temp_y1.append(y2[idx])
                temp_y2.append(y1[idx])
            else:
                temp_y1.append(y1[idx])
                temp_y2.append(y2[idx])
        e = Evaluate(labels, temp_y1, self.num_of_cls)
        f_1 = e.calc_micro_metrics()['f1']
        e = Evaluate(labels, temp_y2, self.num_of_cls)
        f_2 = e.calc_micro_metrics()['f1']
        s_prime = abs(f_1 - f_2)
        if s_prime > s:
            cnt += 1
    p_value = (cnt + 1) / (epoch + 1)
    return p_value
class BiLstm(object): def __init__(self,args,data,ckpt_path): #seq_len,xvocab_size, label_size,ckpt_path,pos_size,type_size,data self.opt = args self.num_steps = 120 self.num_class = 2 self.word_num = data.word_size self.ckpt_path=ckpt_path self.pos_size=data.pos_size self.type_size=data.type_size self.util= Util() sys.stdout.write('Building Graph ') self._build_model(args,embedding_matrix=data.pretrained) sys.stdout.write('graph built\n') self.eval=Evaluate() def _build_model(self,flags,embedding_matrix): tf.reset_default_graph() tf.set_random_seed(123) self.input=tf.placeholder(shape=[None,self.num_steps], dtype=tf.int64) self.length = tf.placeholder(shape=[None,], dtype=tf.int64) self.pos=tf.placeholder(shape=[None,self.num_steps], dtype=tf.int64) self.type=tf.placeholder(shape=[None,self.num_steps], dtype=tf.int64) self.target = [tf.placeholder(shape=[None, ], dtype=tf.int64, name='li_{}'.format(t)) for t in range(self.num_steps)] self.weight = [tf.placeholder(shape=[None, ], dtype=tf.float32, name='wi_{}'.format(t)) for t in range(self.num_steps)] self.keep_prob = tf.placeholder(tf.float32) # drop out if embedding_matrix is not None: self.embedding = tf.Variable(embedding_matrix, trainable=True, name="emb",dtype=tf.float32)# else: self.embedding = tf.get_variable("emb", [self.word_num, self.emb_dim]) self.inputs_emb = tf.nn.embedding_lookup(self.embedding, self.input) if flags.use_tree: pos_embedding = tf.get_variable('pos_embed', [self.pos_size, 40], dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=1e-4)) type_embedding = tf.get_variable('type_embed', [self.type_size, 40], dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=1e-4)) pos_inputs = tf.nn.embedding_lookup(pos_embedding, self.pos) type_inputs = tf.nn.embedding_lookup(type_embedding, self.type) self.inputs_emb = tf.concat(2, [self.inputs_emb, pos_inputs,type_inputs]) cell = tf.nn.rnn_cell.LSTMCell(num_units=flags.hidden_size, state_is_tuple=True) dropout_cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=self.keep_prob) stacked_cell= tf.nn.rnn_cell.MultiRNNCell([dropout_cell] * self.opt.num_layers, state_is_tuple=True) outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=stacked_cell,cell_bw=stacked_cell,dtype=tf.float32,sequence_length=self.length,inputs=self.inputs_emb) output_fw, output_bw = outputs output= tf.concat(2, [output_fw,output_bw]) soft_dim=self.opt.hidden_size*2 self.softmax_w = tf.get_variable("softmax_w", [soft_dim, self.num_class]) self.softmax_b = tf.get_variable("softmax_b", [self.num_class]) output=tf.reshape(output,[-1,soft_dim]) self.logits = tf.matmul(output, self.softmax_w) + self.softmax_b self.decode_outputs_test = tf.nn.softmax(self.logits) self.decode_outputs_test=tf.reshape(self.decode_outputs_test,[-1,self.num_steps,self.num_class]) #states_fw, states_bw = states self.classify_out=tf.reshape(self.logits,[-1,self.num_steps,self.num_class]) self.logits= tf.transpose(self.classify_out, [1, 0, 2]) self.logits=tf.unpack(self.logits,axis=0) self.loss = tf.nn.seq2seq.sequence_loss(self.logits, self.target, self.weight, self.num_class) self.train_op = tf.train.AdamOptimizer(learning_rate=self.opt.learn_rate).minimize(self.loss) '''Training and Evaluation''' def train(self, data, sess=None): saver = tf.train.Saver() if not sess: sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=2)) # create a session sess.run(tf.global_variables_initializer()) # init all variables sys.stdout.write('\n Training started ...\n') best_loss=100 best_epoch=0 
t1=time.time() for i in range(self.opt.epochs): try: loss,_=self.run_epoch(sess,data,data.train,True) val_loss,pred= self.run_epoch(sess, data,data.valid,False) t2=time.time() print('epoch:%2d \t time:%.2f\tloss:%f\tvalid_loss:%f'%(i,t2-t1,loss,val_loss)) t1=time.time() if val_loss<best_loss: saver.save(sess, self.ckpt_path + self.opt.model_name + '.ckpt') best_loss=val_loss best_epoch=i sys.stdout.flush() except KeyboardInterrupt: # this will most definitely happen, so handle it print('Interrupted by user at iteration {}'.format(i)) self.session = sess return sess print('best valid accuary:%f\tbest epoch:%d'%(best_loss,best_epoch)) # prediction def predict(self, data, sess): _, predicts = self.run_epoch(sess, data, data.test, False) if self.opt.use_ilp: pred = self.ilp_solution(predicts , data.test['weight'], data.test['length'], data.test['dfather'], data.test['dtype']) else: pred= np.argmax(predicts, axis=2) acc, f1, pratio, gratio=self.eval.values(pred,data.test['target'],data.test['weight']) print('accuary:%f,f1:%f,pratio:%f,gratio:%f' %(acc,f1,pratio,gratio)) def run_epoch(self, sess, data,data_type,is_train): losses = [] num_batch=data.gen_batch_num(data_type) predicts=None for i in range(num_batch): input, target, weight, length, pos, dtype,dfather,sent,compressed=data.gen_batch(data_type, i) if is_train: feed_dict = self.get_feed(input, target, weight, length, pos, dtype, keep_prob=0.8) _, loss_v, predict = sess.run([self.train_op, self.loss, self.decode_outputs_test], feed_dict) else: feed_dict = self.get_feed(input, target, weight, length, pos, dtype, keep_prob=1.) loss_v, predict= sess.run([self.loss, self.decode_outputs_test], feed_dict) losses.append(loss_v) if predicts is None: predicts = predict else: predicts = np.concatenate((predicts, predict)) return np.mean(losses),predicts def ilp_solution(self,predict,batchW,batchL,fathers,types): Myilp = Ilp() pred_label = predict[:, :, 1] pred = [] batchW_temp = np.array(batchW,copy=True) for j in range(pred_label.shape[0]): size = sum(batchW_temp[j] == 1) # curr_label = pred_label[j][:size] curr_fathers = [int(f) for f in fathers[j]] curr_fathers.insert(0, 0) dep_length = self.util.caculate_length(curr_fathers) dep_length=dep_length[1:] curr_types = types[j][:] saved_types = self.util.get_typelist(curr_fathers, curr_types) _, retained, values = Myilp.solve_ilp_problem(size, curr_label, dep_length=dep_length, parents=curr_fathers,saved=saved_types) values.extend([0] * (120 - len(values))) pred.append(values) pred = np.array(pred) return pred def restore_last_session(self): saver = tf.train.Saver() sess = tf.Session() # create a session saver.restore(sess, self.ckpt_path + self.opt.model_name + '.ckpt') print('model restored') return sess def get_feed(self, input, target, weight, length, pos, dtype, keep_prob): feed_dict={self.input:input} feed_dict.update({self.target[t]: target[t] for t in range(self.num_steps)}) feed_dict.update({self.weight[t]: weight[t] for t in range(self.num_steps)}) feed_dict[self.pos]=pos feed_dict[self.type]=dtype feed_dict[self.length]=length feed_dict[self.keep_prob] = keep_prob # dropout prob return feed_dict
def cluster_biterm(f, list_pred_true_words_index, c_bitermsFreqs={}, c_totalBiterms={}, c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={}, txtId_txt={}, last_txtId=0, max_c_id=0, wordVectorsDic={}, dic_clus__id={}, dic_biterm__clusterId_Freq={}, dic_biterm__allClusterFreq={}): print("cluster_bigram") current_txt_id = last_txtId eval_pred_treu_txt = [] line_count = 0 t11 = datetime.now() for item in list_pred_true_words_index: words = item[2] bi_terms = construct_biterms(words) current_txt_id += 1 line_count += 1 txtBitermsFreqs = Counter(bi_terms) bi_terms_len = len(bi_terms) txtWordsFreqs = Counter(words) words_len = len(words) #X=generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim) #text_Vec=X[0] text_Vec = [0] * embedDim clusterId = findCloseCluster(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec) max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)]) dic_clus__id[clusterId] = max_c_id txtId_txt[current_txt_id] = words c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq = populateClusterFeature( c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq) '''if line_count%1000==0: c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)''' eval_pred_treu_txt.append([clusterId, item[1], item[2]]) if ignoreMinusOne == True: if str(item[1]) != '-1': f.write( str(clusterId) + " " + str(item[1]) + " " + str(item[2]) + "\n") else: f.write( str(clusterId) + " " + str(item[1]) + " " + str(item[2]) + "\n") if line_count % 500 == 0: #print(dic_clus__id) print(len(dic_clus__id)) #delete old and small clusters, remove multi-cluster words from clusters list_c_sizes = [] list_c_ids = [] #list_size__cid={} for c_id, txtIds in c_txtIds.items(): list_c_sizes.append(len(txtIds)) list_c_ids.append(dic_clus__id[c_id]) #list_size__cid[len(txtIds)]=c_id mean_c_size = statistics.mean(list_c_sizes) std_c_size = statistics.stdev(list_c_sizes) mean_c_id = statistics.mean(list_c_ids) std_c_id = statistics.stdev(list_c_ids) print('preocess', line_count, 'texts', 'mean_c_size', mean_c_size, 'std_c_size', std_c_size) print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id, 'std_c_id', std_c_id) list_del_cids = [] del_count = 0 '''for c_id, txtIds in c_txtIds.items(): c_size= len(txtIds) ##print('c_id=', c_id, 'c_size=', c_size) #if c_size<=2 :#or del_count<15: # list_del_cids.append(c_id) # print('delete cluster=',c_id, '#size=', c_size) #del_count+=1 #if c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size)) or float(c_size)>=mean_c_size+std_c_size or float(c_size)>=mean_c_size: #if float(c_size)<float(abs(mean_c_size)): # list_del_cids.append(c_id) #print('delete cluster=',c_id, '#size=', c_size) #float(c_id)<=float(abs(mean_c_id-std_c_id)) if (c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size))) or float(c_size)>=mean_c_size: #and del_count<100: list_del_cids.append(c_id) del_count+=1 # print('delete cluster=',c_id, '#size=', c_size) #list_c_sizes.sort(reverse=True) #for c_size in list_c_sizes[0:20]: # 
list_del_cids.append(list_size__cid[c_size])''' for c_id, orderId in dic_clus__id.items(): if c_id not in c_txtIds: continue c_size = len(c_txtIds[c_id]) #if (float(c_id)<=float(abs(mean_c_id-std_c_id)) or float(orderId)<=float(abs(mean_c_id-std_c_id))): #if (c_size<=1 or float(c_size)<=float(abs(mean_c_size-std_c_size)) or float(c_size)>=mean_c_size+std_c_size*1): if (float(c_id) <= float(abs(mean_c_id - std_c_id)) or float(orderId) <= float(abs(mean_c_id - std_c_id)) ) and (c_size <= 1 or float(c_size) <= float( abs(mean_c_size - std_c_size)) or float(c_size) >= mean_c_size + std_c_size): list_del_cids.append(c_id) print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)', len(c_bitermsFreqs)) listTargetBiterms = [] for c_id in list_del_cids: del c_bitermsFreqs[c_id] del c_totalBiterms[c_id] del c_txtIds[c_id] del c_wordsFreqs[c_id] del c_totalWords[c_id] del dic_clus__id[c_id] #del c_clusterVecs[c_id] '''for biterm, dic_clusterId__Freq in dic_biterm__clusterId_Freq.items(): if c_id in dic_biterm__clusterId_Freq[biterm]: bitermClusterIdFreq=dic_biterm__clusterId_Freq[biterm][c_id] #dic_biterm__clusterId_Freq[biterm][c_id]=0 dic_biterm__allClusterFreq[biterm]-=bitermClusterIdFreq listTargetBiterms.append(biterm) del dic_biterm__clusterId_Freq[biterm][c_id]''' '''listTargetBiterms=set(listTargetBiterms) for biterm in listTargetBiterms: if dic_biterm__allClusterFreq[biterm]<=0: del dic_biterm__clusterId_Freq[biterm] del dic_biterm__allClusterFreq[biterm]''' #c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq) if line_count % 1000 == 0: print('#######-personal-eval_pred_treu_txt', len(eval_pred_treu_txt)) Evaluate(eval_pred_treu_txt, ignoreMinusOne) t12 = datetime.now() t_diff = t12 - t11 print("total time diff secs=", t_diff.seconds) last_txtId = current_txt_id return [ c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq ]
c_txtIds = {}
c_clusterVecs = {}
txtId_txt = {}
last_txtId = 0
max_c_id = 0
dic_clus__id = {}
dic_biterm__clusterId_Freq = {}
dic_biterm__allClusterFreq = {}

f = open(resultFile, 'w')
t11 = datetime.now()
c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, \
    txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterId_Freq, \
    dic_biterm__allClusterFreq = cluster_biterm(
        f, list_pred_true_words_index, c_bitermsFreqs, c_totalBiterms, c_wordsFreqs,
        c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, last_txtId, max_c_id,
        wordVectorsDic, dic_clus__id, dic_biterm__clusterId_Freq,
        dic_biterm__allClusterFreq)
t12 = datetime.now()
t_diff = t12 - t11
print("total time diff secs=", t_diff.seconds)
f.close()

listtuple_pred_true_text = ReadPredTrueText(resultFile, ignoreMinusOne)
print('result for', dataset)
Evaluate(listtuple_pred_true_text)
class Inference(object): def __init__(self, data, model, model_params, model_params_grad, savedir, num_obs_samples, num_future_steps, num_mc_samples, ppc_window, z_true=None, true_model_params=None, iters=1000): self.data = data self.dim = self.data[1].size(2) self.T = self.data[1].size(0) self.model_params = model_params self.train_data = self.data[0:2] self.y_future = self.data[4] self.x_future = self.data[5] self.y_complete = self.data[6] self.num_future_steps = self.y_future.shape[0] self.model = model self.savedir = savedir self.num_obs_samples = num_obs_samples self.num_future_steps = num_future_steps self.num_mc_samples = 1 self.model_params_grad = model_params_grad self.true_model_params = true_model_params self.vi = MeanFieldVI(self.model, self.savedir, self.num_mc_samples) self.ppc_window = ppc_window self.isPPC = False init = 'map' # 'true' self.init_z = self.map_estimate() if init == 'map': self.var_params = self.vi.init_var_params(self.T, self.dim, self.init_z, grad=True) elif init == 'true': self.var_params = self.vi.init_var_params(self.T, self.dim, z_true, grad=True) else: print 'specify valid init option.' self.iters = iters self.opt_params = { 'var_mu': self.var_params[0], 'var_log_scale': self.var_params[1] } for k, v in self.model_params_grad.items(): if v == True: self.opt_params[k] = self.model_params[k] # self.var_params_model = self.vi.init_var_params_model() # self.opt_params['model_mu'] = self.var_params_model[0] # self.opt_params['model_log_scale'] = self.var_params_model[1] self.test = self.data[2] if self.test is None: self.ev = None self.num_train = self.data[0].shape[0] else: self.ev = Evaluate(self.data, self.model, savedir='', num_obs_samples=self.num_obs_samples) self.num_test = self.data[2].shape[0] self.num_train = self.data[0].shape[0] - self.num_test def unpack_data(self, data): y = data[0] x = data[1] return y, x def map_estimate(self): # initialize to all ones = smooth. z = torch.tensor(torch.rand(self.T, self.dim, dtype=dtype, device=device), requires_grad=True, dtype=dtype, device=device) y, x = self.unpack_data(self.data) self.map_iters = 100 self.opt_params = [z] #self.map_optimizer = torch.optim.Adam(self.opt_params, lr=1e-3) self.map_optimizer = torch.optim.LBFGS(self.opt_params) lbfgs = True for t in range(self.map_iters): def closure(): self.map_optimizer.zero_grad() output = -self.model.log_joint(self.model_params, y, x, z) output.backward() return output if lbfgs: self.map_optimizer.step(closure) with torch.no_grad(): output = -self.model.log_joint(self.model_params, y, x, z) else: output = -self.model.log_joint(self.model_params, y, x, z) self.map_optimizer.zero_grad() output.backward() self.map_optimizer.step() if t % 5 == 0: print t, output.item() if t % 5 == 0: plt.cla() plt.plot(to_numpy(z)) figure = plt.gcf() # get current figure figure.set_size_inches(8, 6) plt.savefig(self.savedir + '/plots/curr_map_z.png') return self.opt_params[0].clone().detach() def run(self): self.optimizer = torch.optim.SGD(self.opt_params.values(), momentum=0.99, lr=1e-10) # .99, 1e-6 #self.optimizer = torch.optim.Adam(self.opt_params.values(), lr=1e-3) # .99, 1e-6 #return self.optimize(50000, False, 1000) return self.optimize(20000, False, 1000) def optimize(self, iters, lbfgs, print_every): y, x = self.train_data[0], self.train_data[1] print 'optimizing...' outputs = [] clip = 5. 
curr_model_params = {} for k, v in self.model_params_grad.items(): if v == True: curr_model_params[k] = [] self.iters = iters for t in range(self.iters): #torch.nn.utils.clip_grad_norm(self.opt_params.values(), clip) self.optimizer.zero_grad() output = -self.vi.forward(self.model_params, self.train_data, self.var_params, t) #/ float(self.num_train) #output = -self.vi.forward_with_model_param_post(self.model_params, self.train_data, self.opt_params, t) #/ float(self.num_train) outputs.append((output.item() / float(self.num_train))) output.backward() self.optimizer.step() for k, v in curr_model_params.items(): if k in self.opt_params: curr_model_params[k].append( [el.item() for el in self.opt_params[k].flatten()]) if t % print_every == 0: # printing ox = output.item() / float(self.num_train) print 'iter: ', t, 'loss: %.2f ' % ox, 'scale: ', if 'var_log_scale' in self.opt_params: print torch.mean(self.opt_params['var_log_scale'].clone(). detach()).item(), if 'model_mu' in self.opt_params: print self.opt_params['model_mu'].item( ), self.opt_params['model_log_scale'].item() for k, v in self.model_params_grad.items(): if v == True: if k in self.opt_params: for el in self.opt_params[k].flatten(): print k, '%.3f ' % el.item(), print '\n' if self.ev is not None: test_marginal = self.ev.valid_loss(self.opt_params) #y_future, future_trajectories, avg_future_marginal_lh = self.ev.sample_future_trajectory(self.opt_params, self.num_future_steps) train_acc, test_acc = self.ev.accuracy(self.opt_params) print 'train acc: %.3f ' % train_acc.item(), 'test acc: %.3f ' % test_acc.item(), \ 'test marginal likelihood: %.3f ' % test_marginal#, 'future marginal lh: %.3f' % avg_future_marginal_lh.item() # plotting plt.cla() plt.plot(outputs) figure = plt.gcf() # get current figure figure.set_size_inches(8, 6) plt.savefig(self.savedir + '/loss.png') plt.cla() for k, v in curr_model_params.items(): plt.cla() if k == 'beta': plt.plot(sigmoid(np.array(v))) else: plt.plot(v) # if self.true_model_params: # for el in self.true_model_params[k]: # plt.axhline(y=el, color='r', linestyle='-') figure = plt.gcf() # get current figure figure.set_size_inches(8, 6) plt.savefig(self.savedir + '/plots/' + k + '.png') zx = self.var_params[0] zx = to_numpy(zx) zx_scale = np.exp(to_numpy(self.var_params[1])) plt.cla() labels = [ 'Bias', 'X1', 'X2', 'Choice t-1', 'RW Side t-1', 'X1 t-1', 'X2 t-1' ] for j in range(zx_scale.shape[1]): #plt.plot(zx[:,j], label=labels[j], linewidth=.5) plt.plot(zx[:, j], linewidth=.5) # plt.fill_between(np.arange(zx.shape[0]), zx[:,j] - zx_scale[:,j], zx[:,j] + zx_scale[:,j]) figure = plt.gcf() # get current figure figure.set_size_inches(12, 8) # plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) plt.savefig(self.savedir + '/plots/curr_est_z.png') test_inds = self.data[-4].cpu().numpy() if self.ev is not None: zx_test = zx[test_inds] plt.cla() for j in range(zx_scale.shape[1]): #plt.plot(zx_test[:,j], label=labels[j], linewidth=.5) plt.plot(zx_test[:, j], linewidth=.5) plt.savefig(self.savedir + '/plots/curr_est_test_z.png') if self.ev is not None: test_marginal = self.ev.valid_loss(self.opt_params) np.savetxt(self.savedir + '/test_marginal.txt', np.array([test_marginal.item()])) print 'final test marginal: ', test_marginal.item() # detach and clone all params for k in self.opt_params.keys(): self.opt_params[k] = self.opt_params[k].clone().detach() # access learning and regularization components #learning, regularization = self.model.log_prior_relative_contrib(self.var_params[0], y, x) 
#torch.save(learning.clone().detach(), self.savedir+'/model_structs/learning_after_training.pth') #torch.save(regularization.clone().detach(), self.savedir+'/model_structs/regularization_after_training.pth') #plt.cla() #plt.plot(to_numpy(learning.clone().detach())) #plt.savefig(self.savedir+'/plots/learning_after_training.png') #plt.cla() #plt.plot(to_numpy(regularization.clone().detach())) #plt.savefig(self.savedir+'/plots/regularization_after_training.png') return self.opt_params
def main(train_file_to_use, test_file_to_use, comp_file_to_use, test_type,
         features_combination_list, number_of_iter, comp, train_index=None,
         test_index=None, best_weights_list=None):
    # start all combination of features
    for features_combination in features_combination_list:
        # Create features for train and test gold trees
        print('{}: Start creating parser model for features : {}'
              .format(time.asctime(time.localtime(time.time())), features_combination))
        logging.info('{}: Start creating parser model for features : {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination))
        train_start_time = time.time()
        parser_model_obj = ParserModel(directory, train_file_to_use, test_file_to_use,
                                       comp_file_to_use, features_combination,
                                       use_edges_existed_on_train,
                                       use_pos_edges_existed_on_train,
                                       train_index=train_index, test_index=test_index)
        model_finish_time = time.time()
        model_run_time = (model_finish_time - train_start_time) / 60.0
        print('{}: Finish creating parser model for features : {} in {} minutes'
              .format(time.asctime(time.localtime(time.time())), features_combination, model_run_time))
        logging.info('{}: Finish creating parser model for features : {} in {} minutes'
                     .format(time.asctime(time.localtime(time.time())), features_combination, model_run_time))

        # Run perceptron to learn the best weights
        print('{}: Start Perceptron for features : {} and number of iterations: {}'
              .format(time.asctime(time.localtime(time.time())), features_combination, number_of_iter))
        logging.info('{}: Start Perceptron for features : {} and number of iterations: {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination, number_of_iter))
        perceptron_obj = StructPerceptron(model=parser_model_obj, directory=directory,
                                          feature_combination=features_combination)
        weights = perceptron_obj.perceptron(num_of_iter=number_of_iter)
        # weights = None
        # old_loc = "C:\\Users\\RomG\\PycharmProjects\\NLP_HW2\\output\\stepwise_no_27_23_01_2018_14_06_52\\weights\\30_28_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_9_8_7_3_2_1"
        # with open(os.path.join(old_loc, "final_weight_vec_20.pkl"), 'rb') as f:
        #     weights = pickle.load(f)
        # perceptron_obj.directory = old_loc
        # perceptron_obj.inference_mode('test', weights)
        train_run_time = (time.time() - model_finish_time) / 60.0
        print('{}: Finish Perceptron for features : {} and num_of_iter: {}. run time: {} minutes'
              .format(time.asctime(time.localtime(time.time())), features_combination,
                      number_of_iter, train_run_time))
        logging.info('{}: Finish Perceptron for features : {} and num_of_iter: {}. run time: {} minutes'
                     .format(time.asctime(time.localtime(time.time())), features_combination,
                             number_of_iter, train_run_time))

        evaluate_obj = Evaluate(parser_model_obj, perceptron_obj, directory)
        best_weights_name = str()
        if test_type != 'comp':
            weights_directory = perceptron_obj.directory
            weight_file_names = [f for f in listdir(weights_directory)
                                 if isfile(join(weights_directory, f))]
            accuracy = dict()
            mistakes_dict_names = dict()
            for weights in weight_file_names:
                with open(os.path.join(weights_directory, weights), 'rb') as fp:
                    weight_vec = pickle.load(fp)
                weights = weights[:-4]
                if train_index is not None and weights != 'final_weight_vec_{}'.format(number_of_iter):
                    continue
                accuracy[weights], mistakes_dict_names[weights] = \
                    evaluate_obj.calculate_accuracy(weight_vec, weights, test_type)
                print('{}: The model hyper parameters and results are: \n num_of_iter: {} \n test file: {} \n'
                      'train file: {} \n test type: {} \n features combination list: {} \n accuracy: {:%} \n'
                      'mistakes dict name: {}'
                      .format(time.asctime(time.localtime(time.time())), number_of_iter,
                              test_file_to_use, train_file_to_use, test_type,
                              features_combination_list, accuracy[weights],
                              mistakes_dict_names[weights]))
                logging.info('{}: The model hyper parameters and results are: \n num_of_iter: {} \n test file: {}'
                             '\n train file: {} \n test type: {} \n features combination list: {} \n accuracy: {} \n'
                             'mistakes dict name: {}'
                             .format(time.asctime(time.localtime(time.time())), number_of_iter,
                                     test_file_to_use, train_file_to_use, test_type,
                                     features_combination_list, accuracy[weights],
                                     mistakes_dict_names[weights]))

            # get the weights that gave the best accuracy and save as best weights
            best_weights = max(accuracy, key=accuracy.get)
            with open(os.path.join(weights_directory, best_weights + '.pkl'), 'rb') as fp:
                best_weights_vec = pickle.load(fp)
            best_weights_name = os.path.join(weights_directory, "best_weights_" + best_weights + '.pkl')
            with open(best_weights_name, 'wb') as f:
                pickle.dump(best_weights_vec, f)
            if train_index is not None:
                # running CV
                return accuracy['final_weight_vec_{}'.format(number_of_iter)]
            logging.info('{}: best weights for {}, {}, {}, with accuracy {}, name is: {} '
                         .format(time.asctime(time.localtime(time.time())), num_of_iter, test_type,
                                 features_combination_list, accuracy[best_weights], best_weights_name))
            print('{}: best weights for {}, {}, {}, with accuracy {}, name is: {} '
                  .format(time.asctime(time.localtime(time.time())), num_of_iter, test_type,
                          features_combination_list, accuracy[best_weights], best_weights_name))
        if comp:
            for best_weights_vec_loaded in best_weights_list:
                inference_file_name = evaluate_obj.infer(best_weights_vec_loaded, test_type)
                print('{}: The inferred file name is: {} for weights: {} '
                      .format(time.asctime(time.localtime(time.time())),
                              inference_file_name, best_weights_vec_loaded))
                logging.info('{}: The inferred file name is: {} for weights: {} '
                             .format(time.asctime(time.localtime(time.time())),
                                     inference_file_name, best_weights_vec_loaded))
        logging.info('-----------------------------------------------------------------------------------')
    return
def cluster_biterm_framework(f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost,
                             wordVectorsDic, dic_clus__id, dic_bitermTag__clusterIds,
                             dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds,
                             dic_ngram__txtIds, min_gram, max_gram, oCSimilarityFlgas,
                             c_itemsCount):
    eval_pred_true_txt = []
    line_count = 0
    t11 = datetime.now()

    for oCPost in list_CPost:
        trueLabel = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        id = oCPost.id
        soPostId = oCPost.soPostId
        createtime = oCPost.createtime
        print('id', id, 'tagWords', tagWords, 'titleWords', titleWords, 'bodyWords', bodyWords)

        txtBitermsFreqs_Tag = None
        bi_terms_len_Tag = 0
        grams_Tag = None
        txtBitermsFreqs_Title = None
        bi_terms_len_Title = 0
        grams_Title = None
        txtBitermsFreqs_Body = None
        bi_terms_len_Body = 0
        grams_Body = None
        text_VecTag = None
        text_VecTitle = None
        text_VecBody = None
        targetClusterIds = []

        dic_txtId__CPost[id] = oCPost

        if oCSimilarityFlgas.isTagSim:
            bi_termsTag = construct_biterms(tagWords)
            grams_Tag = generateGramsConsucetive(tagWords, min_gram, max_gram)
            for gram in grams_Tag:
                if gram in dic_ngram__txtIds and len(set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Tag = Counter(bi_termsTag)
            bi_terms_len_Tag = len(bi_termsTag)
            tCIds = findTargetClusters(txtBitermsFreqs_Tag, dic_bitermTag__clusterIds)
            # print('dic_bitermTag__clusterIds', dic_bitermTag__clusterIds, 'txtBitermsFreqs_Tag', txtBitermsFreqs_Tag)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                X = generate_sent_vecs_toktextdata([tagWords], wordVectorsDic, embedDim)
                text_VecTag = X[0]

        if oCSimilarityFlgas.isTitleSim:
            bi_termsTitle = construct_biterms(titleWords)
            grams_Title = generateGramsConsucetive(titleWords, min_gram, max_gram)
            for gram in grams_Title:
                if gram in dic_ngram__txtIds and len(set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Title = Counter(bi_termsTitle)
            bi_terms_len_Title = len(bi_termsTitle)
            tCIds = findTargetClusters(txtBitermsFreqs_Title, dic_bitermTitle__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                X = generate_sent_vecs_toktextdata([titleWords], wordVectorsDic, embedDim)
                text_VecTitle = X[0]

        if oCSimilarityFlgas.isBodySim:
            bi_termsBody = construct_biterms(bodyWords)
            grams_Body = generateGramsConsucetive(bodyWords, min_gram, max_gram)
            for gram in grams_Body:
                if gram in dic_ngram__txtIds and len(set(dic_ngram__txtIds[gram])) > max_cposts:
                    continue
                dic_ngram__txtIds.setdefault(gram, []).append(id)
            txtBitermsFreqs_Body = Counter(bi_termsBody)
            bi_terms_len_Body = len(bi_termsBody)
            tCIds = findTargetClusters(txtBitermsFreqs_Body, dic_bitermBody__clusterIds)
            targetClusterIds.extend(tCIds)
            if isSemantic:
                X = generate_sent_vecs_toktextdata([bodyWords], wordVectorsDic, embedDim)
                text_VecBody = X[0]

        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag,
                                         txtBitermsFreqs_Title, bi_terms_len_Title,
                                         txtBitermsFreqs_Body, bi_terms_len_Body,
                                         text_VecTag, text_VecTitle, text_VecBody)

        targetClusterIds = set(targetClusterIds)
        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, oCPostProcessed, targetClusterIds, max_c_id, oCSimilarityFlgas)

        if ignoreMinusOne:
            if str(trueLabel) != '-1':
                f.write(str(clusterId) + " " + str(trueLabel) + " " +
                        ' '.join(tagWords) + " " + str(soPostId) + "\n")
        else:
            f.write(str(clusterId) + " " + str(trueLabel) + " " +
                    ' '.join(tagWords) + " " + str(soPostId) + "\n")

        eval_pred_true_txt.append([clusterId, trueLabel, tagWords])

        if clusterId not in c_itemsCount:
            c_itemsCount[clusterId] = 0
        c_itemsCount[clusterId] += 1

        max_c_id = max([max_c_id, clusterId, len(c_CFVector)])
        dic_clus__id[clusterId] = max_c_id
        # print('max_c_id, len(c_CFVector)', max_c_id, len(c_CFVector))

        c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds = \
            populateClusterFeature_framework(c_CFVector, oCPostProcessed,
                                             dic_bitermTag__clusterIds,
                                             dic_bitermTitle__clusterIds,
                                             dic_bitermBody__clusterIds,
                                             clusterId, id, oCSimilarityFlgas)

        del oCPostProcessed
        del oCPost

        line_count += 1
        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)

        if line_count % 1000 == 0:
            # print('c_itemsCount', c_itemsCount)
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
            dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
            dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount]
print(len(sub_list_pred_true_words_index))
# cluster_sd(sub_list_pred_true_words_index)

dic_bitri_keys_selectedClusters_seenBatch = cluster_gram_freq(
    sub_list_pred_true_words_index, batchNo,
    dic_bitri_keys_selectedClusters_seenBatch,
    list_pred_true_words_index[0:end])

predsSeen_list_pred_true_words_index = evaluateByGram(
    dic_bitri_keys_selectedClusters_seenBatch,
    list_pred_true_words_index[0:end])

not_clustered_inds_batch = extractSeenNotClustered(
    predsSeen_list_pred_true_words_index, sub_list_pred_true_words_index)
# not_clustered_inds_seen_batch.extend(not_clustered_inds_batch)
# not_clustered_inds_batch = assignToClusterSimDistribution(
#     not_clustered_inds_batch, dic_bitri_keys_selectedClusters_seenBatch,
#     list_pred_true_words_index[0:end], wordVectorsDic)

globalList_clustered.extend(predsSeen_list_pred_true_words_index)
globalList_not_clustered.extend(not_clustered_inds_batch)

Evaluate(predsSeen_list_pred_true_words_index)  # +not_clustered_inds_batch)
# texts in cluster + texts not in cluster should be = 2000
print("total texts=", len(predsSeen_list_pred_true_words_index) + len(not_clustered_inds_batch))

'''dictri_keys_selectedClusters_currentBatch, dicbi_keys_selectedClusters_currentBatch, not_clustered_inds_currentBatch, dic_combined_keys_selectedClusters, new_sub_list_pred_true_words_index=filterClusters(dictri_keys_selectedClusters_currentBatch, dicbi_keys_selectedClusters_currentBatch, sub_list_pred_true_words_index, list_pred_true_words_index[0:end])
not_clustered_inds_seen_batch.extend(not_clustered_inds_currentBatch)
appendResultFile(new_sub_list_pred_true_words_index, fileName)
if batchNo>=1: # and batchNo%2==0:
    dic_preds, new_not_clustered_inds_seen_batch=assignToClusterBySimilarity(not_clustered_inds_seen_batch, list_pred_true_words_index[0:end], dic_combined_keys_selectedClusters, wordVectorsDic)
    #appendResultFile(new_not_clustered_inds_seen_batch, fileName)
'''
def __init__(self, data, model, model_params, model_params_grad, savedir,
             num_obs_samples, num_future_steps, num_mc_samples, ppc_window,
             z_true=None, true_model_params=None, iters=1000):
    self.data = data
    self.dim = self.data[1].size(2)
    self.T = self.data[1].size(0)
    self.model_params = model_params
    self.train_data = self.data[0:2]
    self.y_future = self.data[4]
    self.x_future = self.data[5]
    self.y_complete = self.data[6]
    self.num_future_steps = self.y_future.shape[0]
    self.model = model
    self.savedir = savedir
    self.num_obs_samples = num_obs_samples
    self.num_future_steps = num_future_steps
    self.num_mc_samples = 1
    self.model_params_grad = model_params_grad
    self.true_model_params = true_model_params
    self.vi = MeanFieldVI(self.model, self.savedir, self.num_mc_samples)
    self.ppc_window = ppc_window
    self.isPPC = False
    init = 'map'  # 'true'
    self.init_z = self.map_estimate()
    if init == 'map':
        self.var_params = self.vi.init_var_params(self.T, self.dim, self.init_z, grad=True)
    elif init == 'true':
        self.var_params = self.vi.init_var_params(self.T, self.dim, z_true, grad=True)
    else:
        print 'specify valid init option.'
    self.iters = iters
    self.opt_params = {'var_mu': self.var_params[0],
                       'var_log_scale': self.var_params[1]}
    for k, v in self.model_params_grad.items():
        if v == True:
            self.opt_params[k] = self.model_params[k]
    # self.var_params_model = self.vi.init_var_params_model()
    # self.opt_params['model_mu'] = self.var_params_model[0]
    # self.opt_params['model_log_scale'] = self.var_params_model[1]
    self.test = self.data[2]
    if self.test is None:
        self.ev = None
        self.num_train = self.data[0].shape[0]
    else:
        self.ev = Evaluate(self.data, self.model, savedir='',
                           num_obs_samples=self.num_obs_samples)
        self.num_test = self.data[2].shape[0]
        self.num_train = self.data[0].shape[0] - self.num_test
def cluster_biterm(f, list_pred_true_words_index_postid_createtime, c_bitermsFreqs={}, c_totalBiterms={}, c_wordsFreqs={}, c_totalWords={}, c_txtIds={}, c_clusterVecs={}, txtId_txt={}, last_txtId=0, max_c_id=0, wordVectorsDic={}, dic_clus__id={}, dic_biterm__clusterId_Freq={}, dic_biterm__allClusterFreq={}, dic_biterm__clusterIds={}, c_textItems={}, dic_ngram__textItems={}, min_gram=1, max_gram=2, isTagSim=True, isTitleSim=False, isBodySim=False): print("cluster_bigram") # current_txt_id=last_txtId eval_pred_true_txt = [] line_count = 0 t11 = datetime.now() for item in list_pred_true_words_index_postid_createtime: words = item[2] current_txt_id = int(item[3]) postId = item[4] bi_terms = construct_biterms(words) grams = generateGramsConsucetive(words, min_gram, max_gram) # bi_terms=generateGramsConsucetive(words,minGSize, maxGSize) # print(words, bi_terms) for gram in grams: dic_ngram__textItems.setdefault(gram, []).append(item) line_count += 1 txtBitermsFreqs = Counter(bi_terms) bi_terms_len = len(bi_terms) txtWordsFreqs = Counter(words) words_len = len(words) text_Vec = [0] * embedDim if isSemantic == True: X = generate_sent_vecs_toktextdata([words], wordVectorsDic, embedDim) text_Vec = X[0] # clusterId=findCloseCluster(c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec, dic_biterm__clusterIds) targetClusterIds = findTargetClusters(txtBitermsFreqs, dic_biterm__clusterIds) clusterId = findCloseClusterByTargetClusters( c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, max_c_id, text_Vec, dic_biterm__clusterIds, targetClusterIds) c_textItems.setdefault(clusterId, []).append(item) max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)]) dic_clus__id[clusterId] = max_c_id txtId_txt[current_txt_id] = words c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds = populateClusterFeature( c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len, txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds) # c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq) # print('clusterId', clusterId, 'current_txt_id', current_txt_id, len(c_textItems), len(c_txtIds), words, len(targetClusterIds), len(dic_ngram__textItems)) eval_pred_true_txt.append([clusterId, item[1], item[2]]) if ignoreMinusOne == True: if str(item[1]) != '-1': f.write( str(clusterId) + " " + str(item[1]) + " " + str(' '.join(item[2])) + " " + postId + "\n") else: f.write( str(clusterId) + " " + str(item[1]) + " " + str(' '.join(item[2])) + " " + postId + "\n") if line_count % 500 == 0: # print(dic_clus__id) print(len(dic_clus__id)) # delete old and small clusters, remove multi-cluster words from clusters list_c_sizes = [] list_c_ids = [] # list_size__cid={} for c_id, txtIds in c_txtIds.items(): list_c_sizes.append(len(txtIds)) list_c_ids.append(dic_clus__id[c_id]) # list_size__cid[len(txtIds)]=c_id mean_c_size = 0 std_c_size = 0 if len(list_c_sizes) > 2: mean_c_size = statistics.mean(list_c_sizes) std_c_size = 
statistics.stdev(list_c_sizes) mean_c_id = 0 std_c_id = 0 if len(list_c_ids) > 2: mean_c_id = statistics.mean(list_c_ids) std_c_id = statistics.stdev(list_c_ids) print('process', line_count, 'texts', 'mean_c_size', mean_c_size, 'std_c_size', std_c_size) print('process', line_count, 'texts', 'mean_c_id', mean_c_id, 'std_c_id', std_c_id) list_del_cids = [] del_count = 0 for c_id, txtIds in c_txtIds.items(): c_size = len(txtIds) if ((c_size <= 1 or float(c_size) <= float(abs(mean_c_size - std_c_size))) or (float(c_size) >= mean_c_size + std_c_size)) or ( (float(c_id) <= float(abs(mean_c_id - std_c_id))) or (float(c_id) >= float(abs(mean_c_id + std_c_id)))): list_del_cids.append(c_id) list_del_cids = set(list_del_cids) print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)', len(c_bitermsFreqs)) listTargetBiterms = [] # need to uncomment for c_id in list_del_cids: if c_id in c_bitermsFreqs: # print('del c_id', c_id, len(c_bitermsFreqs[c_id])) del c_bitermsFreqs[c_id] if c_id in c_totalBiterms: del c_totalBiterms[c_id] if c_id in c_txtIds: del c_txtIds[c_id] if c_id in c_wordsFreqs: del c_wordsFreqs[c_id] if c_id in c_totalWords: del c_totalWords[c_id] if c_id in dic_clus__id: del dic_clus__id[c_id] if isSemantic == True: del c_clusterVecs[c_id] # c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq=removeHighEntropyFtrs(c_bitermsFreqs, c_totalBiterms, c_txtIds, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq) if line_count % 1000 == 0: print('#######-personal-eval_pred_true_txt', len(eval_pred_true_txt)) Evaluate(eval_pred_true_txt, ignoreMinusOne) t12 = datetime.now() t_diff = t12 - t11 print("total time diff secs=", t_diff.seconds) last_txtId = current_txt_id return [ c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id, dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq, dic_biterm__clusterIds, c_textItems, dic_ngram__textItems ]
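# --- Plausible sketches of the two text-feature helpers used by cluster_biterm
# above; the project's actual construct_biterms / generateGramsConsucetive may
# differ (these follow the common biterm / consecutive n-gram definitions).
from itertools import combinations

def construct_biterms(words):
    # unordered word pairs (biterms) over the distinct words of one text
    return [' '.join(pair) for pair in combinations(sorted(set(words)), 2)]

def generateGramsConsucetive(words, min_gram, max_gram):
    # consecutive n-grams of length min_gram..max_gram, joined into strings
    grams = []
    for n in range(min_gram, max_gram + 1):
        grams.extend(' '.join(words[i:i + n]) for i in range(len(words) - n + 1))
    return grams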
model = TransRec(sess, dataset) elif recommender.lower() == "cdae": model = CDAE(sess, dataset) elif recommender.lower() == "dae": model = DAE(sess, dataset) elif recommender.lower() == "npe": model = NPE(sess, dataset) elif recommender.lower() == "multidae": model = MultiDAE(sess, dataset) elif recommender.lower() == "multivae": model = MultiVAE(sess, dataset) elif recommender.lower() == "irgan": model = IRGAN(sess, dataset) elif recommender.lower() == "cfgan": model = CFGAN(sess, dataset) elif recommender.lower() == "jca": model = JCA(sess, dataset) model.build_graph() sess.run(tf.global_variables_initializer()) model.train_model() Evaluate.test_model(model, dataset, num_thread)
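# --- A table-driven alternative to the if/elif dispatch above (a sketch only,
# assuming the same model classes are already imported by the surrounding
# script); it keeps the name-to-class mapping in one place and fails loudly on
# unknown recommender names.
MODEL_CLASSES = {
    "transrec": TransRec, "cdae": CDAE, "dae": DAE, "npe": NPE,
    "multidae": MultiDAE, "multivae": MultiVAE,
    "irgan": IRGAN, "cfgan": CFGAN, "jca": JCA,
}

def build_model(recommender, sess, dataset):
    try:
        return MODEL_CLASSES[recommender.lower()](sess, dataset)
    except KeyError:
        raise ValueError("unknown recommender: %s" % recommender)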
fileDir = os.path.dirname(os.path.abspath(__file__)) #print(fileDir) parentDir = os.path.dirname(fileDir) #print(parentDir) parentDir = os.path.dirname(parentDir) #print(parentDir) outputPath = "result/" trainingFile = outputPath + 'train_biterm_r.txt' trainList_pred_true_text_postid = ReadPredTrueTextPostid( trainingFile, ignoreMinusOne) print('result for', trainingFile) Evaluate(trainList_pred_true_text_postid) all_words = [] for item in trainList_pred_true_text_postid: all_words.extend(item[2].split(' ')) all_words = list(set(all_words)) gloveFile = "glove.6B.50d.txt" embedDim = 50 wordVectorsDic = {} if isSemantic == True: wordVectorsDic = extractAllWordVecsPartialStemming(gloveFile, embedDim, all_words) c_bitermsFreqs = {} c_totalBiterms = {}
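# --- A minimal sketch of loading only the GloVe vectors for words that occur in
# the corpus, roughly what extractAllWordVecsPartialStemming is used for above
# but without the partial stemming; numpy and the helper name are assumptions.
import numpy as np

def load_glove_subset(glove_path, embed_dim, vocab):
    vocab = set(vocab)
    vectors = {}
    with open(glove_path, encoding='utf-8') as fh:
        for line in fh:
            parts = line.rstrip().split(' ')
            if parts[0] in vocab:
                vectors[parts[0]] = np.asarray(parts[1:embed_dim + 1], dtype=np.float32)
    return vectors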