def test_model(logdir, inputs, left_embedfile, right_embedfile, epochs=EPOCHS):
    """Evaluate a trained siamese tree classifier on pickled test pairs.

    Rebuilds the two-branch network, restores the most recent checkpoint
    found in ``logdir``, feeds every test batch through the network and
    prints the accuracy, the classification report and the confusion matrix.

    Args:
        logdir: Directory containing the checkpoint to restore.
        inputs: Path to a pickle file holding the testing pairs.
        left_embedfile: Path to a pickle holding an ``(embeddings, lookup)``
            pair for the left-hand trees.
        right_embedfile: Path to a pickle holding an ``(embeddings, lookup)``
            pair for the right-hand trees.
        epochs: Not used by this function; kept for signature compatibility.

    Raises:
        IOError: If ``logdir`` does not contain a restorable checkpoint.
    """
    n_classess = 2
    left_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]
    right_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]

    # NOTE: pickle.load executes arbitrary code from the file; only point
    # these paths at trusted data.
    with open(inputs, "rb") as fh:
        testing_pairs = pickle.load(fh)

    print("Loading embdding vectors....")
    with open(left_embedfile, 'rb') as fh:
        left_embeddings, left_embed_lookup = pickle.load(fh)
    with open(right_embedfile, 'rb') as fh:
        right_embeddings, right_embed_lookup = pickle.load(fh)

    num_feats = len(left_embeddings[0])

    # Build the inputs and outputs of the network.
    left_nodes_node, left_children_node, left_pooling_node = \
        network.init_net_for_siamese(num_feats)
    right_nodes_node, right_children_node, right_pooling_node = \
        network.init_net_for_siamese(num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    hidden_node = network.hidden_layer(merge_node, 200, 200)
    hidden_node = network.hidden_layer(hidden_node, 200, 200)
    hidden_node = network.hidden_layer(hidden_node, 200, n_classess)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    # The optimizer is never stepped here. It is still built because it adds
    # its own variables to the graph that the Saver below covers.
    # NOTE(review): presumably needed to mirror the training graph - confirm.
    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    optimizer.minimize(loss_node)

    # Init the graph.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # Raising a bare string is itself a TypeError; raise a real
            # exception so the message reaches the caller.
            raise IOError('Checkpoint not found.')

    left_trees, right_trees = get_trees_from_pairs(testing_pairs)

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        # NOTE(review): the existence check uses a literal path while the
        # open() uses `config_file`, which is not defined in this function -
        # confirm it is a module-level name pointing at the same file.
        with open(config_file, 'r') as file_handler:
            contents = json.load(file_handler)
        # NOTE(review): the flag becomes True when the config value is the
        # string "false" - confirm this inversion is intended.
        using_vector_lookup_left = contents[
            'using_vector_lookup_left'] == "false"

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')
    for left_gen_batch, right_gen_batch in sampling.batch_random_samples_2_sides(
            left_trees, left_algo_labels, right_trees, right_algo_labels,
            left_embeddings, left_embed_lookup, right_embeddings,
            right_embed_lookup, using_vector_lookup_left, False,
            TEST_BATCH_SIZE):
        left_nodes, left_children, left_labels_one_hot, left_labels = left_gen_batch
        right_nodes, right_children, right_labels_one_hot, right_labels = right_gen_batch

        sim_labels, _ = get_one_hot_similarity_label(left_labels, right_labels)
        print("sim labels : " + str(sim_labels))

        output = sess.run(
            [out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: sim_labels
            })

        # Only the first entry of the labels / the fetched output is scored.
        correct = np.argmax(sim_labels[0])
        predicted = np.argmax(output[0])
        check = bool(correct == predicted)
        print('Out:', output, "Status:", check)

        correct_labels.append(correct)
        predictions.append(predicted)

    target_names = ["0", "1"]
    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(
        classification_report(correct_labels,
                              predictions,
                              target_names=target_names))
    print(confusion_matrix(correct_labels, predictions))
def train_model(logdir,
                inputs,
                embeddings_list_url,
                node_map_url,
                epochs=EPOCHS,
                with_drop_out=1,
                device="0"):
    """Train the siamese tree classifier on pickled training pairs.

    Builds the two-branch network with three hidden layers (optionally
    followed by dropout), resumes from the latest checkpoint in ``logdir``
    when one exists, and runs ``epochs`` passes over the training pairs,
    saving a checkpoint every ``CHECKPOINT_EVERY`` steps.

    Args:
        logdir: Directory used for checkpoints and the summary writer.
        inputs: Path to a pickle file holding the training pairs.
        embeddings_list_url: Path to a pickle holding the embedding list.
        node_map_url: Path to a pickle holding the node lookup map.
        epochs: Number of passes over the training pairs.
        with_drop_out: Dropout is enabled when ``int(with_drop_out) == 1``.
        device: Value written to ``CUDA_VISIBLE_DEVICES``.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = device
    use_dropout = int(with_drop_out) == 1

    print("Using device : " + device)
    print("Batch size : " + str(BATCH_SIZE))
    if use_dropout:
        print("Training with drop out rate : " + str(DROP_OUT))
    n_classess = 2

    # NOTE: pickle.load executes arbitrary code from the file; only point
    # these paths at trusted data.
    print("Loading training data....")
    with open(inputs, "rb") as fh:
        all_training_pairs = pickle.load(fh)
    random.shuffle(all_training_pairs)

    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)

    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        node_map = pickle.load(node_map_fh)

    # Build the inputs and outputs of the network.
    left_nodes_node, left_children_node, left_pooling_node = \
        network.init_net_for_siamese(num_feats)
    right_nodes_node, right_children_node, right_pooling_node = \
        network.init_net_for_siamese(num_feats)

    print("Left pooling shape : " + str(tf.shape(left_pooling_node)))
    print("Right pooling shape : " + str(tf.shape(right_pooling_node)))
    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    print(tf.shape(merge_node))

    # Each hidden layer, including the final one, is followed by dropout
    # when dropout is enabled.
    hidden_node = merge_node
    for in_size, out_size in ((600, 300), (300, 100), (100, n_classess)):
        hidden_node = network.hidden_layer(hidden_node, in_size, out_size)
        if use_dropout:
            hidden_node = tf.layers.dropout(hidden_node,
                                            rate=DROP_OUT,
                                            training=True)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # Init the graph.
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        # The merged summary op is built but never run in this function.
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    # The flag read here is not passed to the sampler below; the block is
    # kept so a present-but-malformed config still fails loudly.
    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        # NOTE(review): the existence check uses a literal path while the
        # open() uses `config_file`, which is not defined in this function -
        # confirm it is a module-level name pointing at the same file.
        with open(config_file, 'r') as file_handler:
            contents = json.load(file_handler)
        using_vector_lookup_left = contents[
            'using_vector_lookup_left'] == "false"

    print("Begin training....")
    for epoch in range(1, epochs + 1):
        shuffle_left_trees, shuffle_right_trees, labels = get_trees_from_pairs(
            all_training_pairs)
        print("Len left:", len(shuffle_left_trees), "Len right:",
              len(shuffle_right_trees))

        for left_gen_batch, right_gen_batch, labels_batch in sampling.batch_random_samples_2_sides(
                shuffle_left_trees, shuffle_right_trees, embeddings_list,
                node_map, labels, BATCH_SIZE):
            print("----------------------------------------------------")
            print("Len of label batch : " + str(labels_batch))
            left_nodes, left_children = left_gen_batch
            right_nodes, right_children = right_gen_batch

            sim_labels, sim_labels_num = get_one_hot_similarity_label(
                labels_batch)

            _, err, out, _, labs = sess.run(
                [train_step, loss_node, out_node, merge_node, labels_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: sim_labels
                })

            # list(...) so the pairs are shown on Python 3 as well, where
            # zip() returns a lazy iterator.
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "True Label vs Predicted Label:", list(zip(labs, out)))

            if steps % CHECKPOINT_EVERY == 0:
                # Save state so we can resume later.
                saver.save(sess, os.path.join(checkfile), steps)
                print('Checkpoint saved.')

            steps += 1
        # NOTE(review): the step counter is reset after every epoch, so
        # checkpoint suffixes repeat across epochs. The original layout was
        # lost; this placement is the only one where the reset has an effect.
        steps = 0

    # Flush and release the event file opened by the summary writer.
    writer.close()
def train_model(train_dataloader, val_dataloader, embeddings, embedding_lookup, opt):
    """Train the attention-based siamese network and validate it.

    Creates the convolution / attention variables, builds the network via
    ``network.init_net_for_siamese``, resumes from the latest checkpoint in
    ``opt.model_path`` when one exists, then trains for ``opt.niter`` epochs.
    A checkpoint is written every ``CHECKPOINT_EVERY`` steps.

    NOTE(review): the original indentation was lost. Validation (and the
    metric printout) is assumed to run at the end of every epoch - confirm.

    Args:
        train_dataloader: Object exposing ``left_trees``, ``right_trees``
            and ``labels`` for training.
        val_dataloader: Same attributes, used for validation.
        embeddings: Sequence of node embedding vectors.
        embedding_lookup: Lookup passed through to the batch sampler.
        opt: Options object (``model_path``, ``niter``, ``feature_size``,
            ``aggregation``, ``distributed_function``, ``train_batch_size``,
            ``batch_type``).
    """
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    train_left_trees = train_dataloader.left_trees
    train_right_trees = train_dataloader.right_trees
    train_labels = train_dataloader.labels

    val_left_trees = val_dataloader.left_trees
    val_right_trees = val_dataloader.right_trees
    val_labels = val_dataloader.labels

    n_classess = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()

    # Variables are created in the same order as before:
    # w_t, w_l, w_r, w_attention, then b_conv.
    weights = {}
    for conv_name in ("w_t", "w_l", "w_r"):
        weights[conv_name] = tf.Variable(
            initializer([node_embedding_size, opt.feature_size]),
            name=conv_name)
    weights["w_attention"] = tf.Variable(
        initializer([opt.feature_size, 1]), name="w_attention")

    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node,
     right_children_node, hidden_node, left_score_node,
     right_score_node) = network.init_net_for_siamese(
         num_feats,
         opt.feature_size,
         weights,
         biases,
         opt.aggregation,
         opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    def _split_batch(left_batch, right_batch):
        # "original" batches carry (nodes, children); every other batch type
        # carries a third element that is ignored here.
        if opt.batch_type == "original":
            l_nodes, l_children = left_batch
            r_nodes, r_children = right_batch
        else:
            l_nodes, l_children, _ = left_batch
            r_nodes, r_children, _ = right_batch
        return l_nodes, l_children, r_nodes, r_children

    def _feed(l_nodes, l_children, r_nodes, r_children, one_hot):
        # Map the placeholders onto one batch worth of data.
        return {
            left_nodes_node: l_nodes,
            left_children_node: l_children,
            right_nodes_node: r_nodes,
            right_children_node: r_children,
            labels_node: one_hot,
        }

    print("Begin training....")
    for epoch in range(1, epochs + 1):
        print("----------------------------------------------------")
        train_batches = sampling.batch_random_samples_2_sides(
            train_left_trees, train_right_trees, train_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type)
        for batch_left_trees, batch_right_trees, batch_labels in train_batches:
            split = _split_batch(batch_left_trees, batch_right_trees)
            labels_one_hot = convert_labels_to_one_hot(batch_labels)

            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict=_feed(*split, one_hot=labels_one_hot))

            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err)

            if steps % CHECKPOINT_EVERY == 0:
                # Persist the session so training can be resumed later.
                saver.save(sess, os.path.join(checkfile), steps)
                print('Checkpoint saved.')

            steps += 1
        # The step counter restarts with every epoch.
        steps = 0

        correct_labels = []
        predictions = []
        val_batches = sampling.batch_random_samples_2_sides(
            val_left_trees, val_right_trees, val_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type)
        for batch_left_trees, batch_right_trees, batch_labels in val_batches:
            split = _split_batch(batch_left_trees, batch_right_trees)
            labels_one_hot = convert_labels_to_one_hot(batch_labels)

            output = sess.run(
                [out_node],
                feed_dict=_feed(*split, one_hot=labels_one_hot))

            correct_labels.extend(np.argmax(labels_one_hot, axis=1))
            predictions.extend(np.argmax(output[0], axis=1))

        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions))
        print(confusion_matrix(correct_labels, predictions))
def test_model(test_dataloader, embeddings, embedding_lookup, opt):
    """Evaluate the masked siamese network and dump its matching matrices.

    Builds the network via ``network.init_net_for_siamese_2``, restores the
    latest checkpoint from ``opt.model_path`` when one exists, runs every
    test batch, writes each fetched attention matrix of the batch to
    ``matching_matrix/<i>.csv`` and finally prints accuracy, classification
    report and confusion matrix.

    Args:
        test_dataloader: Object exposing ``left_trees``, ``right_trees``
            and ``labels``.
        embeddings: Sequence of node embedding vectors.
        embedding_lookup: Lookup passed through to the batch sampler.
        opt: Options object (``model_path``, ``niter``, ``feature_size``,
            ``train_batch_size``, ``batch_type``).
    """
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    test_left_trees = test_dataloader.left_trees
    test_right_trees = test_dataloader.right_trees
    test_labels = test_dataloader.labels

    n_classess = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()

    # Variables are created in the same order as before:
    # w_t, w_l, w_r, w_attention, then b_conv.
    weights = {}
    for conv_name in ("w_t", "w_l", "w_r"):
        weights[conv_name] = tf.Variable(
            initializer([node_embedding_size, opt.feature_size]),
            name=conv_name)
    weights["w_attention"] = tf.Variable(
        initializer([opt.feature_size, 1]), name="w_attention")

    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node,
     right_children_node, logits_node, left_mask_nodes, right_mask_nodes,
     attention_matrix_nodes) = network.init_net_for_siamese_2(
         num_feats,
         opt.feature_size,
         weights,
         biases)

    out_node = network.out_layer(logits_node)
    labels_node, loss_node = network.loss_layer(logits_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    print("Begin computing accuracy....")
    correct_labels = []
    predictions = []
    test_batches = sampling.batch_random_samples_2_sides(
        test_left_trees, test_right_trees, test_labels, embeddings,
        embedding_lookup, opt.train_batch_size, opt.batch_type)
    for batch_left_trees, batch_right_trees, batch_labels in test_batches:
        left_nodes, left_children, left_masks = batch_left_trees
        right_nodes, right_children, right_masks = batch_right_trees
        labels_one_hot = convert_labels_to_one_hot(batch_labels)

        feed = {
            left_nodes_node: left_nodes,
            left_children_node: left_children,
            right_nodes_node: right_nodes,
            right_children_node: right_children,
            labels_node: labels_one_hot,
            left_mask_nodes: left_masks,
            right_mask_nodes: right_masks,
        }
        matching_matrices, output = sess.run(
            [attention_matrix_nodes, out_node], feed_dict=feed)

        # The file index restarts with each batch, so a later batch
        # overwrites the files of an earlier one.
        for i, matrix in enumerate(matching_matrices):
            np.savetxt("matching_matrix/" + str(i) + ".csv",
                       matrix,
                       delimiter=",")

        print(output)
        print(output.shape)

        correct_labels.extend(np.argmax(labels_one_hot, axis=1))
        predictions.extend(np.argmax(output, axis=1))

    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions))
    print(confusion_matrix(correct_labels, predictions))
def train_model(train_dataloader, val_dataloader, embeddings, embedding_lookup, opt):
    """Train the masked siamese network, validating at every checkpoint step.

    Builds the network via ``network.init_net_for_siamese_3``, resumes from
    the latest checkpoint in ``opt.model_path`` when one exists and trains
    for ``opt.niter`` epochs. Every ``CHECKPOINT_EVERY`` steps the whole
    validation set is scored; when the accuracy beats the best value seen so
    far it is written to ``no_tbcnn_validation.txt``.

    NOTE(review): the original indentation was lost. Saving the checkpoint is
    assumed to happen only when validation accuracy improves - confirm.

    Args:
        train_dataloader: Object exposing ``left_trees``, ``right_trees``
            and ``labels`` for training.
        val_dataloader: Same attributes, used for validation.
        embeddings: Sequence of node embedding vectors.
        embedding_lookup: Lookup passed through to the batch sampler.
        opt: Options object (``model_path``, ``niter``, ``feature_size``,
            ``train_batch_size``, ``batch_type``).
    """
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    train_left_trees = train_dataloader.left_trees
    train_right_trees = train_dataloader.right_trees
    train_labels = train_dataloader.labels

    val_left_trees = val_dataloader.left_trees
    val_right_trees = val_dataloader.right_trees
    val_labels = val_dataloader.labels

    n_classess = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()

    (left_nodes_node, left_children_node, right_nodes_node,
     right_children_node, logits_node, left_mask_nodes, right_mask_nodes,
     attention_matrix_nodes) = network.init_net_for_siamese_3(
         num_feats, opt.feature_size)

    out_node = network.out_layer(logits_node)
    labels_node, loss_node = network.loss_layer(logits_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    def _feed(left_batch, right_batch, one_hot):
        # Every batch side is a (nodes, children, masks) triple.
        l_nodes, l_children, l_masks = left_batch
        r_nodes, r_children, r_masks = right_batch
        return {
            left_nodes_node: l_nodes,
            left_children_node: l_children,
            right_nodes_node: r_nodes,
            right_children_node: r_children,
            labels_node: one_hot,
            left_mask_nodes: l_masks,
            right_mask_nodes: r_masks,
        }

    def _score_validation_set():
        # Run the whole validation set; return (true labels, predictions).
        truth = []
        guessed = []
        val_batches = sampling.batch_random_samples_2_sides(
            val_left_trees, val_right_trees, val_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type)
        for val_left, val_right, val_batch_labels in val_batches:
            val_one_hot = convert_labels_to_one_hot(val_batch_labels)
            output = sess.run(
                [out_node], feed_dict=_feed(val_left, val_right, val_one_hot))
            truth.extend(np.argmax(val_one_hot, axis=1))
            guessed.extend(np.argmax(output[0], axis=1))
        return truth, guessed

    print("Begin training....")
    temp_accuracy = 0.0
    for epoch in range(1, epochs + 1):
        train_batches = sampling.batch_random_samples_2_sides(
            train_left_trees, train_right_trees, train_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type)
        for batch_left_trees, batch_right_trees, batch_labels in train_batches:
            labels_one_hot = convert_labels_to_one_hot(batch_labels)

            _, err, left_nodes_out, out = sess.run(
                [train_step, loss_node, left_nodes_node, out_node],
                feed_dict=_feed(batch_left_trees, batch_right_trees,
                                labels_one_hot))

            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "Val Accuracy:", temp_accuracy)
            print(left_nodes_out.shape)

            if steps % CHECKPOINT_EVERY == 0:
                print("Checkpoint, validating.....")
                correct_labels, predictions = _score_validation_set()

                accuracy = float(accuracy_score(correct_labels, predictions))
                # Precision and recall are computed but not reported.
                precision = float(precision_score(correct_labels, predictions))
                recall = float(recall_score(correct_labels, predictions))

                print('Accuracy:', accuracy)
                print(classification_report(correct_labels, predictions))
                print(confusion_matrix(correct_labels, predictions))

                if accuracy > temp_accuracy:
                    temp_accuracy = accuracy
                    with open("no_tbcnn_validation.txt", "w") as f:
                        f.write(str(temp_accuracy))
                    # Persist the session so training can be resumed later.
                    saver.save(sess, os.path.join(checkfile), steps)
                    print('Checkpoint saved.')

            steps += 1
        # The step counter restarts with every epoch.
        steps = 0
def predict(sess, index, left_path, right_path, pairs, left_node_ids_list,
            right_node_ids_list, out_node, left_score_node, right_score_node,
            left_nodes_node, left_children_node, right_nodes_node,
            right_children_node, labels_node, embeddings, embedding_lookup,
            opt):
    """Run the network on the first pair and render an attention diff page.

    Feeds ``pairs[0]`` through the network, scales the two attention score
    outputs with ``scale_attention``, copies the matching ``.java`` sources
    into the working directory, regenerates their protobuf files and runs
    the ``yijun/fast`` docker image, redirecting its output to
    ``github_java_pairwise_visualization/pair_<index>_normal.html``.

    Args:
        sess: TensorFlow session holding the restored model.
        index: Number used in the output HTML file name.
        left_path: Path of the left ``.pkl`` tree file.
        right_path: Path of the right ``.pkl`` tree file.
        pairs: Sequence whose first element is ``(left_tree, right_tree,
            label)``; only that element is used.
        left_node_ids_list: Passed through to ``scale_attention``.
        right_node_ids_list: Passed through to ``scale_attention``.
        out_node: Output tensor of the network.
        left_score_node: Left attention score tensor.
        right_score_node: Right attention score tensor.
        left_nodes_node: Placeholder for the left node features.
        left_children_node: Placeholder for the left children indices.
        right_nodes_node: Placeholder for the right node features.
        right_children_node: Placeholder for the right children indices.
        labels_node: Placeholder for the one-hot labels.
        embeddings: Sequence of node embedding vectors.
        embedding_lookup: Lookup passed through to the batch sampler.
        opt: Options object (``train_batch_size``, ``batch_type``).
    """
    from shutil import copyfile

    first_left, first_right, first_label = pairs[0][0], pairs[0][1], pairs[0][2]
    test_left_trees = [first_left]
    test_right_trees = [first_right]
    test_labels = [first_label]

    pkl_root = "github_java_sort_function_pkl_train_test_val"

    def _pb_name(pkl_path):
        # File name of the protobuf that belongs to a pickled tree.
        pb_path = pkl_path.replace(
            pkl_root, "github_java_sort_function_pb").replace(
                ".pkl", "").replace("/test", "")
        return pb_path.split("/")[-1]

    def _java_path(pkl_path):
        # Location of the Java source that belongs to a pickled tree.
        return pkl_path.replace(
            pkl_root, "github_java_sort_function").replace(
                ".pb.pkl", "").replace("/test", "")

    left_java_path = _java_path(left_path)
    right_java_path = _java_path(right_path)
    temp_left_java_path = left_java_path.split("/")[-1]
    temp_right_java_path = right_java_path.split("/")[-1]
    temp_left_pb_path = _pb_name(left_path)
    temp_right_pb_path = _pb_name(right_path)

    save_file_normal = "github_java_pairwise_visualization/" + "pair_" + str(
        index) + "_normal.html"

    # NOTE(review): the original indentation was lost. With a single pair
    # the sampler is expected to yield exactly one batch, so running the
    # visualisation inside the loop matches running it after - confirm.
    for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
            test_left_trees, test_right_trees, test_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type):
        if opt.batch_type == "original":
            left_nodes, left_children = batch_left_trees
            right_nodes, right_children = batch_right_trees
        else:
            left_nodes, left_children, _ = batch_left_trees
            right_nodes, right_children, _ = batch_right_trees

        labels_one_hot = convert_labels_to_one_hot(batch_labels)

        output, left_attention_score, right_attention_score = sess.run(
            [out_node, left_score_node, right_score_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: labels_one_hot
            })

        left_scaled_attention_score = scale_attention(
            left_attention_score, len(left_nodes[0]), left_node_ids_list)
        right_scaled_attention_score = scale_attention(
            right_attention_score, len(right_nodes[0]), right_node_ids_list)

        # Only files this call copied are removed afterwards, so a failed
        # copy can never delete a file that was already there.
        copied = []
        try:
            copyfile(left_java_path, temp_left_java_path)
            copied.append(temp_left_java_path)
            copyfile(right_java_path, temp_right_java_path)
            copied.append(temp_right_java_path)

            generate_pb(temp_left_java_path)
            generate_pb(temp_right_java_path)

            # The command relies on the shell for $(pwd) and the redirect.
            # NOTE(review): paths and scores are spliced in unquoted - only
            # use with trusted file names.
            cmd_normal = "docker run -v $(pwd):/e yijun/fast -H 0 -a 0 --diff-weight --linear " + " -x " + "'" + left_scaled_attention_score + "'" + " -y " + "'" + right_scaled_attention_score + "'" + " -D " + temp_left_pb_path + " " + temp_right_pb_path + " > " + save_file_normal
            print(cmd_normal)
            os.system(cmd_normal)
        finally:
            # Clean up the temporary copies even when a step above failed.
            for temp_path in copied:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
def _export_attention_side(scaled_attention_map, attention_path, pb_path,
                           html_path):
    """Write one side's scaled attention CSV and render it with docker."""
    with open(attention_path, "w") as f1:
        # NOTE(review): enumerate() yields (position, item). If
        # scale_attention returns a dict this writes "position,key" rather
        # than "key,score" - confirm its return type.
        for key, score in enumerate(scaled_attention_map):
            line = str(key) + "," + str(score)
            f1.write("%s\n" % line)

    normal_cmd = "docker run --rm -v $(pwd):/e -it yijun/fast -H 0 -t -x " + attention_path + " " + pb_path + " > " + html_path
    print(normal_cmd)
    os.system(normal_cmd)


def test_model(test_dataloader, embeddings, embedding_lookup, opt):
    """Evaluate the masked siamese network and export attention artefacts.

    Builds the network via ``network.init_net_for_siamese_2``, restores the
    latest checkpoint from ``opt.model_path`` when one exists and runs every
    test batch. For each batch the first attention matrix is labelled with
    the node ids of the first left / right tree and written, together with
    its row-wise and column-wise maxima, under
    ``live_test/github_pairwise_java/sort/``; both sides are then rendered
    to HTML with the ``yijun/fast`` docker image. Accuracy, classification
    report and confusion matrix are printed at the end.

    The output paths and the two ``.pb`` inputs are hard-coded, so every
    batch overwrites the files written for the previous one.

    Args:
        test_dataloader: Object exposing ``left_trees``, ``right_trees``,
            ``labels``, ``left_node_ids_list`` and ``right_node_ids_list``.
        embeddings: Sequence of node embedding vectors.
        embedding_lookup: Lookup passed through to the batch sampler.
        opt: Options object (``model_path``, ``feature_size``,
            ``train_batch_size``, ``batch_type``).
    """
    logdir = opt.model_path
    node_embedding_size = len(embeddings[0])

    test_left_trees = test_dataloader.left_trees
    test_right_trees = test_dataloader.right_trees
    test_labels = test_dataloader.labels
    test_left_node_ids_list = test_dataloader.left_node_ids_list
    test_right_node_ids_list = test_dataloader.right_node_ids_list

    print(test_left_node_ids_list)
    print("Num id left : " + str(len(test_left_node_ids_list[0])))
    print("Num id right : " + str(len(test_right_node_ids_list[0])))

    n_classess = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention"),
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node,
     right_children_node, logits_node, left_mask_nodes, right_mask_nodes,
     attention_matrix_nodes) = network.init_net_for_siamese_2(
         num_feats,
         opt.feature_size,
         weights,
         biases,
     )

    out_node = network.out_layer(logits_node)
    labels_node, loss_node = network.loss_layer(logits_node, n_classess)

    # The optimizer is never stepped here. It is still built because it adds
    # its own variables to the graph that the Saver below covers.
    # NOTE(review): presumably needed to mirror the training graph - confirm.
    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    out_dir = "live_test/github_pairwise_java/sort/"
    left_ids = test_left_node_ids_list[0]
    right_ids = test_right_node_ids_list[0]

    print("Begin computing accuracy....")
    correct_labels = []
    predictions = []
    for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
            test_left_trees, test_right_trees, test_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type):
        left_nodes, left_children, left_masks = batch_left_trees
        right_nodes, right_children, right_masks = batch_right_trees

        labels_one_hot = convert_labels_to_one_hot(batch_labels)

        matching_matrices, output = sess.run(
            [attention_matrix_nodes, out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: labels_one_hot,
                left_mask_nodes: left_masks,
                right_mask_nodes: right_masks,
            })

        # Trim both axes to the real node counts. Trimming only the shorter
        # side left the other axis padded whenever both were padded, and the
        # DataFrame below then rejected the shape.
        matrix = matching_matrices[0][:len(left_ids), :len(right_ids)]

        matrix_pd = pd.DataFrame(data=matrix,
                                 index=left_ids,
                                 columns=right_ids)
        matrix_pd.to_csv(out_dir + "1_matrix.csv", sep=",")

        # For every left node: the right node with the highest attention,
        # and vice versa.
        left_matrix_aggregate_idx = matrix_pd.idxmax(axis=1)
        left_matrix_aggregate_idx.to_csv(
            out_dir + "left_aggregate_attention_idx.csv", sep=",")

        right_matrix_aggregate_idx = matrix_pd.idxmax(axis=0)
        right_matrix_aggregate_idx.to_csv(
            out_dir + "right_aggregate_attention_idx.csv", sep=",")

        left_matrix_aggregate = matrix_pd.max(axis=1)
        left_matrix_aggregate.to_csv(
            out_dir + "left_aggregate_attention.csv", sep=",")
        left_matrix_max_dict = left_matrix_aggregate.to_dict()

        right_matrix_aggregate = matrix_pd.max(axis=0)
        right_matrix_aggregate.to_csv(
            out_dir + "right_aggregate_attention.csv", sep=",")
        right_matrix_max_dict = right_matrix_aggregate.to_dict()

        left_scaled_attention_map = scale_attention(left_matrix_max_dict)
        right_scaled_attention_map = scale_attention(right_matrix_max_dict)

        _export_attention_side(
            left_scaled_attention_map,
            out_dir + "left_attention_scaled.csv",
            "github_java_sort_function_pb/5/3.java.pb",
            out_dir + "left_normal.html")
        _export_attention_side(
            right_scaled_attention_map,
            out_dir + "right_attention_scaled.csv",
            "github_java_sort_function_pb/5/105.java.pb",
            out_dir + "right_normal.html")

        print(output)
        print(labels_one_hot)

        correct = np.argmax(labels_one_hot, axis=1)
        predicted = np.argmax(output, axis=1)
        correct_labels.extend(correct)
        predictions.extend(predicted)

    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions))
    print(confusion_matrix(correct_labels, predictions))