def making_prediction(test_dataset, ggnn, sess, opt, original=False):
    """Run the GGNN over every minibatch of ``test_dataset`` and return a
    dict of prediction artefacts for the LAST processed batch.

    Parameters:
        test_dataset: dataset exposing ``make_minibatch_iterator()``.
        ggnn: model object holding the TF ops and ``placeholders`` dict.
        sess: open TF session with the model variables already restored.
        opt: options namespace; only ``opt.test_file`` is read here.
        original: unused; kept for backward-compatible interface.

    Returns:
        dict with attention-score paths/dict, raw softmax values, the
        scalar predicted label and the correct label(s) of the last batch.

    Raises:
        ValueError: if the dataset yields no batches (the original code
        fell through to a NameError on ``batch_data`` in that case).
    """
    # Kept for interactive debugging (mirrors the original code).
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

    batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(),
                                      max_queue_size=5)
    correct_labels = []
    predictions = []
    softmax_values_data = None
    attention_scores_data = None
    batch_data = None
    print("--------------------------------------")
    print('Computing training accuracy...')
    for step, batch_data in enumerate(batch_iterator):
        print(batch_data['labels'])
        softmax_values_data, attention_scores_data = sess.run(
            [softmax_values, attention_scores],
            feed_dict={
                ggnn.placeholders["initial_node_representation"]:
                batch_data["initial_representations"],
                ggnn.placeholders["num_vertices"]: batch_data["num_vertices"],
                ggnn.placeholders["adjacency_matrix"]:
                batch_data['adjacency_matrix'],
                ggnn.placeholders["labels"]: batch_data['labels']
            })
        # Compute each argmax once and reuse it (the original recomputed
        # np.argmax over the same arrays three times per batch).
        label_argmax = np.argmax(batch_data['labels'], axis=1)
        argmax = np.argmax(softmax_values_data, axis=1)
        correct_labels.extend(label_argmax)
        predictions.extend(argmax)
        print("Probability : " + str(softmax_values_data))
        print("Probability max : " + str(argmax))
        print("Correct class " + str(correct_labels))
        print("Predicted class : " + str(predictions))

    if batch_data is None:
        raise ValueError("test_dataset produced no batches")

    scaled_attention_scores_path, raw_attention_scores_path, \
        raw_attention_scores_dict = generate_attention_scores(
            opt.test_file, attention_scores_data[0])

    prediction_results = {
        "scaled_attention_scores_path": scaled_attention_scores_path,
        "raw_attention_scores_path": raw_attention_scores_path,
        "raw_attention_scores_dict": raw_attention_scores_dict,
        "softmax_values_data": softmax_values_data,
        # Scalar predicted label vs. array-valued correct label mirrors the
        # original behaviour. NOTE(review): callers may expect
        # ``label_argmax[0]`` here for symmetry — confirm before changing.
        "predicted_label": argmax[0],
        "correct_label": label_argmax,
    }
    return prediction_results
def making_prediction(test_dataset, ggnn, sess, opt):
    """Predict labels for ``test_dataset``, emit attention artefacts
    (scores file, subtree dump, visualization) and return the softmax
    values, argmax array, and correct/predicted label strings of the
    final batch."""
    # Kept for interactive debugging.
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

    batch_iterator = ThreadedIterator(
        test_dataset.make_minibatch_iterator(), max_queue_size=5)
    correct_labels, predictions = [], []
    print("--------------------------------------")
    print('Computing training accuracy...')
    for step, batch in enumerate(batch_iterator):
        # print(batch["labels"])
        print(batch['labels'])
        feed = {
            ggnn.placeholders["initial_node_representation"]:
                batch["initial_representations"],
            ggnn.placeholders["num_vertices"]: batch["num_vertices"],
            ggnn.placeholders["adjacency_matrix"]: batch['adjacency_matrix'],
            ggnn.placeholders["labels"]: batch['labels'],
        }
        softmax_values_data, attention_scores_data = sess.run(
            [softmax_values, attention_scores], feed_dict=feed)
        correct_labels.extend(np.argmax(batch['labels'], axis=1))
        argmax = np.argmax(softmax_values_data, axis=1)
        predictions.extend(np.argmax(softmax_values_data, axis=1))
        print("Probability : " + str(softmax_values_data))
        print("Probability max : " + str(np.argmax(softmax_values_data, axis=1)))
        print("Correct class " + str(correct_labels[0]))
        print("Predicted class : " + str(predictions[0]))

    # Post-loop: uses data from the last batch only.
    attention_path, raw_attention_score_dict = generate_attention_scores(
        opt, attention_scores_data[0])
    generate_subtree(opt, opt.stmt_ids_path, raw_attention_score_dict)
    print(attention_path)
    print(opt.pb_path)
    generate_visualization(opt.pb_path, attention_path)
    return (softmax_values_data, argmax,
            str(correct_labels[0]), str(predictions[0]))
def making_prediction(graph_path, opt, ggnn, sess):
    """Point ``opt.test_graph_path`` at ``graph_path``, build a one-off
    test dataset, run the model over it and return the predicted class
    index from the last batch."""
    # Kept for interactive debugging.
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

    opt.test_graph_path = graph_path
    test_dataset = MonoLanguageProgramData(opt, False, False, True)
    batch_iterator = ThreadedIterator(
        test_dataset.make_minibatch_iterator(), max_queue_size=5)
    for step, batch in enumerate(batch_iterator):
        # print(batch["labels"])
        print(batch['labels'])
        feed = {
            ggnn.placeholders["initial_node_representation"]:
                batch["initial_representations"],
            ggnn.placeholders["num_vertices"]: batch["num_vertices"],
            ggnn.placeholders["adjacency_matrix"]: batch['adjacency_matrix'],
            ggnn.placeholders["labels"]: batch['labels'],
        }
        softmax_values_data, attention_scores_data = sess.run(
            [softmax_values, attention_scores], feed_dict=feed)
        predicted_label = np.argmax(softmax_values_data, axis=1)
        print("Probability : " + str(softmax_values_data))
        print("Probability max : " + str(np.argmax(softmax_values_data, axis=1)))
    return predicted_label[0]
def main(opt):
    """Evaluate a trained GGNN on the test split and print accuracy,
    a classification report and a confusion matrix.

    Reads pretrained embeddings from ``opt.pretrained_embeddings_url``
    (plain pickle, latin1-encoded), restores the latest checkpoint from
    ``opt.model_path`` if one exists, then runs a single pass over the
    test dataset collecting argmax predictions.
    """
    with open(opt.pretrained_embeddings_url, 'rb') as fh:
        # latin1 is needed to unpickle Python-2-era embedding files.
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')
        opt.pretrained_embeddings = embeddings
        opt.pretrained_embed_lookup = embed_lookup
        print("Finished loading pretrained embeddings......")
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    test_dataset = MonoLanguageProgramData(opt, False, True)
    # The model's edge-type count must match the dataset's.
    opt.n_edge_types = test_dataset.n_edge_types
    print("Num edge types : " + str(opt.n_edge_types))
    ggnn = DenseGGNNModel(opt)
    # For debugging purpose
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        correct_labels = []
        predictions = []
        print('Computing training accuracy...')
        batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(),
                                          max_queue_size=5)
        for step, batch_data in enumerate(batch_iterator):
            # print(batch_data["labels"])
            # sess.run with a single-element fetch list returns a list,
            # hence the [0] indexing below.
            softmax_values_data = sess.run(
                [softmax_values],
                feed_dict={
                    ggnn.placeholders["initial_node_representation"]:
                    batch_data["initial_representations"],
                    ggnn.placeholders["num_vertices"]: batch_data["num_vertices"],
                    ggnn.placeholders["adjacency_matrix"]:
                    batch_data['adjacency_matrix'],
                    ggnn.placeholders["labels"]: batch_data['labels']
                })
            correct_labels.extend(np.argmax(batch_data['labels'], axis=1))
            predictions.extend(np.argmax(softmax_values_data[0], axis=1))
        print("Num target : " + str(len(correct_labels)))
        # print(correct_labels)
        # print(predictions)
        # Class names "1".."10" — assumes exactly 10 classes; TODO confirm
        # this matches opt.n_classes for other configurations.
        target_names = [str(i) for i in range(1, 11)]
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions,
                                    target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))
def main():
    """CLI entry point: parse options, prepare a single test program's
    graph files, restore the trained GGNN and run inference over it,
    then emit attention scores and a visualization."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--workers', type=int,
                        help='number of data loading workers', default=2)
    parser.add_argument('--train_batch_size', type=int, default=10,
                        help='input batch size')
    parser.add_argument('--test_batch_size', type=int, default=5,
                        help='input batch size')
    parser.add_argument('--state_dim', type=int, default=30,
                        help='GGNN hidden state dimension size')
    parser.add_argument('--node_dim', type=int, default=100,
                        help='node dimension size')
    parser.add_argument('--hidden_layer_size', type=int, default=200,
                        help='size of hidden layer')
    parser.add_argument('--num_hidden_layer', type=int, default=1,
                        help='number of hidden layer')
    parser.add_argument('--n_steps', type=int, default=10,
                        help='propogation steps number of GGNN')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='learning rate')
    parser.add_argument('--cuda', action='store_true', help='enables cuda')
    parser.add_argument('--verbal', type=bool, default=True,
                        help='print training info or not')
    parser.add_argument('--manualSeed', type=int, help='manual seed')
    parser.add_argument(
        '--test_file',
        default="program_data/test_data/5/100_dead_code_1.java",
        help="test program")
    parser.add_argument('--n_classes', type=int, default=10,
                        help='manual seed')
    parser.add_argument('--path',
                        default="program_data/github_java_sort_function_babi",
                        help='program data')
    parser.add_argument('--model_path', default="model",
                        help='path to save the model')
    parser.add_argument('--n_hidden', type=int, default=50,
                        help='number of hidden layers')
    parser.add_argument('--size_vocabulary', type=int, default=59,
                        help='maximum number of node types')
    parser.add_argument('--log_path', default="logs/",
                        help='log path for tensorboard')
    parser.add_argument(
        '--aggregation', type=int, default=1, choices=range(0, 4),
        help=
        '0 for max pooling, 1 for attention with sum pooling, 2 for attention with max pooling, 3 for attention with average pooling'
    )
    parser.add_argument('--distributed_function', type=int, default=0,
                        choices=range(0, 2),
                        help='0 for softmax, 1 for sigmoid')
    parser.add_argument(
        '--pretrained_embeddings_url',
        default="embedding/fast_pretrained_vectors.pkl.gz",
        help=
        'pretrained embeddings url, there are 2 objects in this file, the first object is the embedding matrix, the other is the lookup dictionary'
    )
    parser.add_argument('argv', nargs="+", help='filenames')
    opt = parser.parse_args()
    print(opt)
    # Model directory name encodes the hyper-parameters.
    # NOTE(review): "_node_dim_..." is concatenated OUTSIDE os.path.join,
    # so it lands on the same path component — confirm this matches the
    # directory layout produced by the training script.
    opt.model_path = os.path.join(
        opt.model_path,
        "sum_softmax" + "_hidden_layer_size_" + str(opt.hidden_layer_size) +
        "_num_hidden_layer_" +
        str(opt.num_hidden_layer)) + "_node_dim_" + str(opt.node_dim)
    # A single positional argument overrides the default test file.
    if len(opt.argv) == 1:
        opt.test_file = opt.argv[0]
    # Create model path folder if not exists
    if not os.path.exists(opt.model_path):
        print("Cannot find path : " + opt.model_path)
    # Parse the test program into the graph files the dataset loader reads.
    generate_files(opt, opt.test_file)
    # if not os.path.exists(opt.pretrained_embeddings_url):
    #     fetch_data_from_github(opt.pretrained_embeddings_url)
    with gzip.open(opt.pretrained_embeddings_url, 'rb') as fh:
        # latin1 is needed to unpickle Python-2-era embedding files.
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')
        opt.pretrained_embeddings = embeddings
        opt.pretrained_embed_lookup = embed_lookup
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    # for f in ['checkpoint', 'cnn_tree.ckpt.index', 'cnn_tree.ckpt.meta',
    #           'cnn_tree.ckpt.data-00000-of-00001']:
    #     filename = os.path.join(opt.model_path, f)
    #     if not os.path.exists(filename):
    #         fetch_data_from_github(filename)
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    test_dataset = MonoLanguageProgramData(opt, False, False, True)
    # opt.n_edge_types = test_dataset.n_edge_types
    # Hard-coded to match the checkpointed model rather than the
    # single-file dataset (which may not exhibit all edge types).
    opt.n_edge_types = 7
    ggnn = DenseGGNNModel(opt)
    # For debugging purpose
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        correct_labels = []
        predictions = []
        print('Computing training accuracy...')
        batch_iterator = ThreadedIterator(
            test_dataset.make_minibatch_iterator(), max_queue_size=5)
        for step, batch_data in enumerate(batch_iterator):
            # print(batch_data["labels"])
            softmax_values_data, attention_scores_data = sess.run(
                [softmax_values, attention_scores],
                feed_dict={
                    ggnn.placeholders["initial_node_representation"]:
                    batch_data["initial_representations"],
                    ggnn.placeholders["num_vertices"]:
                    batch_data["num_vertices"],
                    ggnn.placeholders["adjacency_matrix"]:
                    batch_data['adjacency_matrix'],
                    ggnn.placeholders["labels"]: batch_data['labels']
                })
            print(softmax_values_data)
            # print(attention_scores_data)
            # print(len(attention_scores_data[0]))
            correct_labels.extend(np.argmax(batch_data['labels'], axis=1))
            predictions.extend(np.argmax(softmax_values_data, axis=1))
        print("Num target : " + str(len(correct_labels)))
        print("True label : " + str(correct_labels[0]))
        print("Predicted label : " + str(predictions[0]))
        # Post-loop: uses the attention scores of the last batch only.
        attention_path = generate_attention_scores(opt,
                                                   attention_scores_data[0])
        print(attention_path)
        print(opt.pb_path)
        generate_visualization(opt.pb_path, attention_path)
def main(opt):
    """Train (task 1) or validate (task 0) the Corder model.

    Task 1 runs the training loop with periodic checkpointing; task 0
    runs the validation set through the model and appends each sample's
    code vector to a CSV file for offline analysis.
    """
    opt.model_path = os.path.join(opt.model_path, form_model_path(opt))
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    print("The model path : " + str(checkfile))
    print("Loss : " + str(opt.loss))
    if ckpt and ckpt.model_checkpoint_path:
        print("Continue training with old model : " + str(checkfile))
    print("Loading vocabs.........")
    node_type_lookup, node_token_lookup, subtree_lookup = load_vocabs(opt)
    opt.node_type_lookup = node_type_lookup
    opt.node_token_lookup = node_token_lookup
    opt.subtree_lookup = subtree_lookup
    # Only build the dataset the selected task needs.
    if opt.task == 1:
        train_dataset = CodeClassificationData(opt, True, False, False)
    if opt.task == 0:
        val_opt = copy.deepcopy(opt)
        val_opt.node_token_lookup = node_token_lookup
        validation_dataset = CodeClassificationData(val_opt, False, False,
                                                    True)
    print("Initializing tree caps model...........")
    corder = CorderModel(opt)
    print("Finished initializing corder model...........")
    loss_node = corder.loss
    optimizer = RAdamOptimizer(opt.lr)
    # UPDATE_OPS must run before the train step (e.g. batch-norm stats).
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    # best_f1_score = get_best_f1_score(opt)
    # print("Best f1 score : " + str(best_f1_score))
    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        if opt.task == 1:
            # Training loop with periodic checkpointing.
            for epoch in range(1, opt.epochs + 1):
                train_batch_iterator = ThreadedIterator(
                    train_dataset.make_minibatch_iterator(),
                    max_queue_size=opt.worker)
                train_accs = []
                for train_step, train_batch_data in enumerate(
                        train_batch_iterator):
                    print("--------------------------")
                    # print(train_batch_data["batch_subtrees_ids"])
                    logging.info(str(train_batch_data["batch_subtree_id"]))
                    _, err = sess.run(
                        [training_point, corder.loss],
                        feed_dict={
                            corder.placeholders["node_types"]:
                            train_batch_data["batch_node_types"],
                            corder.placeholders["node_tokens"]:
                            train_batch_data["batch_node_tokens"],
                            corder.placeholders["children_indices"]:
                            train_batch_data["batch_children_indices"],
                            corder.placeholders["children_node_types"]:
                            train_batch_data["batch_children_node_types"],
                            corder.placeholders["children_node_tokens"]:
                            train_batch_data["batch_children_node_tokens"],
                            corder.placeholders["labels"]:
                            train_batch_data["batch_subtree_id"],
                            corder.placeholders["dropout_rate"]: 0.3
                        })
                    logging.info("Training at epoch " + str(epoch) +
                                 " and step " + str(train_step) +
                                 " with loss " + str(err))
                    print("Epoch:", epoch, "Step:", train_step,
                          "Training loss:", err)
                    # Skip step 0 so a fresh run doesn't immediately save.
                    if train_step % opt.checkpoint_every == 0 and train_step > 0:
                        saver.save(sess, checkfile)
                        print('Checkpoint saved, epoch:' + str(epoch) +
                              ', step: ' + str(train_step) + ', loss: ' +
                              str(err) + '.')
        if opt.task == 0:
            # Validation: dump one code vector per sample to a CSV.
            validation_batch_iterator = ThreadedIterator(
                validation_dataset.make_minibatch_iterator(),
                max_queue_size=opt.worker)
            for val_step, val_batch_data in enumerate(
                    validation_batch_iterator):
                scores = sess.run(
                    [corder.code_vector],
                    feed_dict={
                        corder.placeholders["node_types"]:
                        val_batch_data["batch_node_types"],
                        corder.placeholders["node_tokens"]:
                        val_batch_data["batch_node_tokens"],
                        corder.placeholders["children_indices"]:
                        val_batch_data["batch_children_indices"],
                        corder.placeholders["children_node_types"]:
                        val_batch_data["batch_children_node_types"],
                        corder.placeholders["children_node_tokens"]:
                        val_batch_data["batch_children_node_tokens"],
                        corder.placeholders["dropout_rate"]: 0.0
                    })
                # scores[0] is the fetched code_vector batch; append each
                # row as "<file_path>,<space-separated floats>".
                for i, vector in enumerate(scores[0]):
                    file_name = "analysis/rosetta_sampled_softmax_train.csv"
                    with open(file_name, "a") as f:
                        vector_score = []
                        for score in vector:
                            vector_score.append(str(score))
                        # print(val_batch_data["batch_file_path"])
                        line = str(val_batch_data["batch_file_path"]
                                   [i]) + "," + " ".join(vector_score)
                        f.write(line)
                        f.write("\n")
def main(opt):
    """Train the GGNN classifier, validating every ``opt.checkpoint_every``
    steps and checkpointing only when validation accuracy improves on
    ``opt.best_accuracy``."""
    with open(opt.pretrained_embeddings_url, 'rb') as fh:
        # latin1 is needed to unpickle Python-2-era embedding files.
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')
        opt.pretrained_embeddings = embeddings
        opt.pretrained_embed_lookup = embed_lookup
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    train_dataset = MonoLanguageProgramData(opt, True, False)
    test_dataset = MonoLanguageProgramData(opt, False, True)
    # Model edge-type count comes from the training split.
    opt.n_edge_types = train_dataset.n_edge_types
    ggnn = DenseGGNNModel(opt)
    # For debugging purpose
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores
    loss_node = ggnn.loss
    optimizer = tf.train.AdamOptimizer(opt.lr)
    training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    # with open("model_selection.txt","r") as f:
    with tf.Session() as sess:
        sess.run(init)
        print("List of available devices..........")
        print(tf.test.gpu_device_name())
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        # Seeded from opt so a resumed run doesn't overwrite a better
        # checkpoint with a worse one.
        best_accuracy = opt.best_accuracy
        for epoch in range(1, opt.epochs + 1):
            train_batch_iterator = ThreadedIterator(
                train_dataset.make_minibatch_iterator(), max_queue_size=5)
            for train_step, train_batch_data in enumerate(train_batch_iterator):
                # print(batch_data["labels"])
                _, err, softmax_values_data, attention_scores_data = sess.run(
                    [training_point, loss_node, softmax_values,
                     attention_scores],
                    feed_dict={
                        ggnn.placeholders["initial_node_representation"]:
                        train_batch_data["initial_representations"],
                        ggnn.placeholders["num_vertices"]:
                        train_batch_data["num_vertices"],
                        ggnn.placeholders["adjacency_matrix"]:
                        train_batch_data['adjacency_matrix'],
                        ggnn.placeholders["labels"]:
                        train_batch_data['labels']
                    })
                print("Epoch:", epoch, "Step:", train_step, "Loss:", err,
                      "Best Accuracy:", best_accuracy)
                if train_step % opt.checkpoint_every == 0:
                    # saver.save(sess, checkfile)
                    # print('Checkpoint saved, epoch:' + str(epoch) +
                    #       ', step: ' + str(step) + ', loss: ' + str(err) + '.')
                    # Validating
                    # --------------------------------------
                    print("Validating.......")
                    correct_labels = []
                    predictions = []
                    test_batch_iterator = ThreadedIterator(
                        test_dataset.make_minibatch_iterator(),
                        max_queue_size=5)
                    for test_step, test_batch_data in enumerate(
                            test_batch_iterator):
                        # Single-element fetch list: result is a list,
                        # hence softmax_values_data[0] below.
                        softmax_values_data = sess.run(
                            [softmax_values],
                            feed_dict={
                                ggnn.placeholders["initial_node_representation"]:
                                test_batch_data["initial_representations"],
                                ggnn.placeholders["num_vertices"]:
                                test_batch_data["num_vertices"],
                                ggnn.placeholders["adjacency_matrix"]:
                                test_batch_data['adjacency_matrix'],
                                ggnn.placeholders["labels"]:
                                test_batch_data['labels']
                            })
                        correct_labels.extend(
                            np.argmax(test_batch_data['labels'], axis=1))
                        predictions.extend(
                            np.argmax(softmax_values_data[0], axis=1))
                    print("Num target : " + str(len(correct_labels)))
                    print(correct_labels)
                    print(predictions)
                    # Class names "1".."10" — assumes 10 classes; TODO
                    # confirm this matches opt.n_classes.
                    target_names = [str(i) for i in range(1, 11)]
                    accuracy = float(accuracy_score(correct_labels,
                                                    predictions))
                    print('Accuracy:', accuracy)
                    print(classification_report(correct_labels, predictions,
                                                target_names=target_names))
                    print(confusion_matrix(correct_labels, predictions))
                    # Checkpoint only on improvement.
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        saver.save(sess, checkfile)
                        print('Checkpoint saved, epoch:' + str(epoch) +
                              ', step: ' + str(train_step) + ', loss: ' +
                              str(err) + '.')
def main(opt):
    """Validate a trained TreeCaps model on the method-name-prediction
    task and, per batch with a positive F1, dump per-capsule node
    connection strengths plus the prediction to an ``analysis/`` folder."""
    opt.model_path = os.path.join(opt.model_path, form_model_path(opt))
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    if ckpt and ckpt.model_checkpoint_path:
        print("Continue training with old model : " + str(checkfile))
    print("Loading vocabs.........")
    train_label_lookup, node_type_lookup, node_token_lookup, val_label_lookup = load_vocabs(opt)
    opt.label_lookup = train_label_lookup
    opt.label_size = len(train_label_lookup.keys())
    opt.node_type_lookup = node_type_lookup
    opt.node_token_lookup = node_token_lookup
    if opt.task == 1:
        train_dataset = MethodNamePredictionData(opt, opt.train_path, True,
                                                 False, False)
    # Validation uses its own label vocabulary.
    val_opt = copy.deepcopy(opt)
    val_opt.label_lookup = val_label_lookup
    val_opt.num_labels = len(val_label_lookup.keys())
    val_opt.node_token_lookup = node_token_lookup
    validation_dataset = MethodNamePredictionData(val_opt, opt.val_path,
                                                  False, False, True)
    print("Initializing tree caps model...........")
    treecaps = TreeCapsModel(opt)
    # network.init_net_treecaps(30,30)
    print("Finished initializing tree caps model...........")
    code_caps = treecaps.code_caps
    loss_node = treecaps.loss
    softmax_values = treecaps.softmax_values
    logits = treecaps.logits
    optimizer = RAdamOptimizer(opt.lr)
    # optimizer = tf.compat.v1.train.AdamOptimizer(opt.lr)
    # UPDATE_OPS must run before the train step (e.g. batch-norm stats).
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    best_f1_score = get_best_f1_score(opt)
    print("Best f1 score : " + str(best_f1_score))
    # Total number of top-level capsules fed by the routing step.
    num_caps_top_a = int(opt.num_conv * opt.output_size / opt.num_channel) * opt.top_a
    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        validation_batch_iterator = ThreadedIterator(
            validation_dataset.make_minibatch_iterator(), max_queue_size=5)
        # f1_scores_of_val_data = []
        all_predicted_labels = []
        all_ground_truth_labels = []
        for val_step, val_batch_data in enumerate(validation_batch_iterator):
            # Routing coefficients placeholder, zero-initialised per batch;
            # its middle dimension scales with the batch's node count.
            alpha_IJ_shape = (opt.batch_size,
                              int(num_caps_top_a / opt.top_a *
                                  val_batch_data["batch_node_types"].shape[1]),
                              num_caps_top_a)
            alpha_IJ = np.zeros(alpha_IJ_shape)
            scores, alpha_IJ_scores = sess.run(
                [logits, treecaps.alpha_IJ],
                feed_dict={
                    treecaps.placeholders["node_types"]:
                    val_batch_data["batch_node_types"],
                    treecaps.placeholders["node_tokens"]:
                    val_batch_data["batch_node_tokens"],
                    treecaps.placeholders["children_indices"]:
                    val_batch_data["batch_children_indices"],
                    treecaps.placeholders["children_node_types"]:
                    val_batch_data["batch_children_node_types"],
                    treecaps.placeholders["children_node_tokens"]:
                    val_batch_data["batch_children_node_tokens"],
                    treecaps.placeholders["labels"]:
                    val_batch_data["batch_labels"],
                    treecaps.placeholders["alpha_IJ"]: alpha_IJ,
                    treecaps.placeholders["is_training"]: False
                })
            # Collapse the 8x...x8 capsule dimensions, then drop the batch
            # axis (assumes batch_size == 1 here — squeeze(axis=0) fails
            # otherwise; TODO confirm).
            alpha_IJ_scores = np.reshape(
                alpha_IJ_scores,
                (opt.batch_size, val_batch_data["batch_node_types"].shape[1],
                 8, opt.top_a, 8))
            alpha_IJ_scores = np.sum(alpha_IJ_scores, axis=2)
            alpha_IJ_scores = np.sum(alpha_IJ_scores, axis=3)
            alpha_IJ_scores = np.squeeze(alpha_IJ_scores, axis=0)
            alpha_IJ_scores = np.transpose(alpha_IJ_scores)
            predictions = np.argmax(scores, axis=1)
            ground_truths = np.argmax(val_batch_data['batch_labels'], axis=1)
            predicted_labels = []
            for prediction in predictions:
                predicted_labels.append(train_label_lookup.inverse[prediction])
            ground_truth_labels = []
            for ground_truth in ground_truths:
                ground_truth_labels.append(
                    val_label_lookup.inverse[ground_truth])
            f1_score = evaluation.calculate_f1_scores(predicted_labels,
                                                      ground_truth_labels)
            print(ground_truth_labels)
            print(predicted_labels)
            print("F1:", f1_score, "Step:", val_step)
            if f1_score > 0:
                # First sample of the batch only.
                node_types = val_batch_data["batch_node_types"][0]
                node_tokens_text = val_batch_data["batch_node_tokens_text"][0]
                node_indexes = val_batch_data["batch_node_indexes"][0]
                file_path = val_batch_data["batch_file_path"][0]
                # Map the pickled graph path back to the .java source path.
                file_path_splits = file_path.split("/")
                file_path_splits[1] = "java-small"
                file_path_splits[len(file_path_splits) - 1] = file_path_splits[
                    len(file_path_splits) - 1].replace(".pkl", ".java")
                file_path = "/".join(file_path_splits)
                analysis_folder = os.path.join(
                    "analysis",
                    "_".join(file_path_splits[-2:]).replace(".java", ""))
                try:
                    from pathlib import Path
                    Path(analysis_folder).mkdir(parents=True, exist_ok=True)
                except Exception as e:
                    print(e)
                count = 0
                # One output file per top-level capsule.
                for capsule in alpha_IJ_scores:
                    # print(val_batch_data["batch_node_indexes"])
                    connection_strength = capsule
                    all_tuples = []
                    for i, node_index in enumerate(node_indexes):
                        tuple_of_info = []
                        tuple_of_info.append(str(node_indexes[i]))
                        tuple_of_info.append(str(node_types[i]))
                        tuple_of_info.append(str(connection_strength[i]))
                        tuple_of_info.append(node_tokens_text[i])
                        tuple_of_info = tuple(tuple_of_info)
                        # print(tuple_of_info)
                        all_tuples.append(tuple_of_info)
                    # NOTE(review): x[2] is str(float), so this sorts
                    # lexicographically, not numerically — confirm whether
                    # key=lambda x: float(x[2]) was intended.
                    all_tuples = sorted(all_tuples, key=lambda x: x[2])
                    all_tuples.reverse()
                    with open(
                            os.path.join(analysis_folder,
                                         "Group_" + str(count) + ".txt"),
                            "w") as f:
                        for t in all_tuples:
                            line = ";".join(list(t))
                            f.write(line)
                            f.write("\n")
                    with open(os.path.join(analysis_folder, "result.txt"),
                              "w") as f1:
                        f1.write("Predicted : " + str(predicted_labels[0]))
                        f1.write("\n")
                        f1.write("Ground truth : " +
                                 str(ground_truth_labels[0]))
                        f1.write("\n")
                    import shutil
                    try:
                        print("Trying to copy original source file....")
                        shutil.copy(file_path, analysis_folder)
                    except Exception as e:
                        print(e)
                    count += 1
def run_epoch(self, epoch_name: str, epoch_num, data, is_training: bool):
    """Run one pass over ``data`` and return per-graph averaged losses.

    Returns (loss, mean_edge_loss, mean_node_loss, mean_kl_loss,
    mean_qed_loss, instance_per_sec), each averaged over all graphs
    processed in the epoch.

    The training and evaluation fetch lists differ, so the same metric
    sits at DIFFERENT result indices in the two branches below — keep the
    fetch lists and the index arithmetic in sync when editing.
    """
    loss = 0
    mean_edge_loss = 0
    mean_node_loss = 0
    mean_kl_loss = 0
    mean_qed_loss = 0
    # Running max over batches; seeded with a large negative sentinel.
    node_loss_error = -10000000
    node_pred_error = 0
    start_time = time.time()
    processed_graphs = 0
    # Teacher forcing is only enabled for the first
    # params['num_teacher_forcing'] training epochs.
    if is_training and self.params['num_teacher_forcing'] >= epoch_num:
        teacher_forcing = True
    else:
        teacher_forcing = False
    batch_iterator = ThreadedIterator(
        self.make_minibatch_iterator(data, is_training),
        max_queue_size=self.params['batch_size']
    )  # self.params['batch_size'])
    for step, batch_data in enumerate(batch_iterator):
        num_graphs = batch_data[self.placeholders['num_graphs']]
        processed_graphs += num_graphs
        batch_data[self.placeholders['is_generative']] = False
        batch_data[self.placeholders[
            'use_teacher_forcing_nodes']] = teacher_forcing
        # Fresh z ~ N(0, I) prior sample for every batch.
        batch_data[
            self.placeholders['z_prior']] = utils.generate_std_normal(
                self.params['batch_size'],
                batch_data[self.placeholders['num_vertices']],
                self.params['hidden_size_encoder'])
    if is_training:
            batch_data[self.placeholders[
                'out_layer_dropout_keep_prob']] = self.params[
                    'out_layer_dropout_keep_prob']
            # Indices referenced below: 12=mean_edge_loss,
            # 13=mean_node_symbol_loss, 14=mean_kl_loss,
            # 15=mean_total_qed_loss, 17=node_loss_error,
            # 18=node_pred_error.
            fetch_list = [
                self.ops['loss'], self.ops['train_step'],
                self.ops["edge_loss"], self.ops['kl_loss'],
                self.ops['node_symbol_prob'],
                self.placeholders['node_symbols'],
                self.ops['qed_computed_values'],
                self.placeholders['target_values'],
                self.ops['total_qed_loss'], self.ops['mean'],
                self.ops['logvariance'], self.ops['grads'],
                self.ops['mean_edge_loss'],
                self.ops['mean_node_symbol_loss'],
                self.ops['mean_kl_loss'],
                self.ops['mean_total_qed_loss'], self.ops['grads2'],
                self.ops['node_loss_error'], self.ops['node_pred_error']
            ]
    else:
            # No dropout at evaluation time.
            batch_data[
                self.placeholders['out_layer_dropout_keep_prob']] = 1.0
            # Indices referenced below: 1=mean_edge_loss,
            # 2=mean_node_symbol_loss, 3=mean_kl_loss,
            # 4=mean_total_qed_loss, 6=node_loss_error, 7=node_pred_error.
            fetch_list = [
                self.ops['loss'], self.ops['mean_edge_loss'],
                self.ops['mean_node_symbol_loss'],
                self.ops['mean_kl_loss'],
                self.ops['mean_total_qed_loss'],
                self.ops['sampled_atoms'], self.ops['node_loss_error'],
                self.ops['node_pred_error']
            ]
    result = self.sess.run(fetch_list, feed_dict=batch_data)
    batch_loss = result[0]
    # Weight per-batch means by graph count so the epoch average is
    # correct for ragged final batches.
    loss += batch_loss * num_graphs
    if is_training:
            mean_edge_loss += result[12] * num_graphs
            mean_node_loss += result[13] * num_graphs
            mean_kl_loss += result[14] * num_graphs
            mean_qed_loss += result[15] * num_graphs
            node_loss_error = max(node_loss_error, np.max(result[17]))
            node_pred_error += result[18]
    else:
            mean_edge_loss += result[1] * num_graphs
            mean_node_loss += result[2] * num_graphs
            mean_kl_loss += result[3] * num_graphs
            mean_qed_loss += result[4] * num_graphs
            node_loss_error = max(node_loss_error, np.max(result[6]))
            node_pred_error += result[7]
    print(
            "Running %s, batch %i (has %i graphs). Total loss: %.4f. Edge loss: %.4f. Node loss: %.4f. KL loss: %.4f. Property loss: %.4f. Node error: %.4f. Node pred: %.4f."
            % (epoch_name, step, num_graphs, loss / processed_graphs,
               mean_edge_loss / processed_graphs,
               mean_node_loss / processed_graphs,
               mean_kl_loss / processed_graphs,
               mean_qed_loss / processed_graphs, node_loss_error,
               node_pred_error / processed_graphs),
            end='\r')
    mean_edge_loss /= processed_graphs
    mean_node_loss /= processed_graphs
    mean_kl_loss /= processed_graphs
    mean_qed_loss /= processed_graphs
    loss = loss / processed_graphs
    instance_per_sec = processed_graphs / (time.time() - start_time)
    return loss, mean_edge_loss, mean_node_loss, mean_kl_loss, mean_qed_loss, instance_per_sec
def main(opt):
    """Run the validation split through the trained GGNN and append each
    sample's (file_path, ground_truth, predicted) triple to a per-
    transformation log under ``mis_prediction/``.

    Bug fixed: the original created ``mis_prediction_path`` as a
    DIRECTORY via ``Path.mkdir`` and then tried ``open(mis_prediction_path,
    "a")`` on that same path, which raises IsADirectoryError on the first
    write. Records now go to a file inside that directory.

    NOTE(review): every sample is logged, not only mismatches, despite the
    "mis_prediction" name — confirm whether a ground_truth != predicted
    filter was intended.
    """
    from pathlib import Path
    mis_prediction_path = os.path.join("mis_prediction", opt.transformation)
    Path(mis_prediction_path).mkdir(parents=True, exist_ok=True)
    # Write records into a file INSIDE the directory (the original opened
    # the directory itself for appending, which cannot work).
    mis_prediction_file = os.path.join(mis_prediction_path,
                                       "mis_predictions.csv")
    opt.model_path = os.path.join(opt.model_path, form_model_path(opt))
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    if ckpt and ckpt.model_checkpoint_path:
        print("Continue training with old model : " + str(checkfile))
    train_label_lookup, node_type_lookup, node_token_lookup, val_label_lookup = load_vocabs(
        opt)
    opt.label_lookup = train_label_lookup
    opt.num_labels = len(train_label_lookup.keys())
    opt.node_type_lookup = node_type_lookup
    opt.node_token_lookup = node_token_lookup
    if opt.task == 1:
        train_dataset = MethodNamePredictionData(opt, opt.train_path, True,
                                                 False, False)
    # Validation uses its own label vocabulary.
    val_opt = copy.deepcopy(opt)
    val_opt.label_lookup = val_label_lookup
    val_opt.num_labels = len(val_label_lookup.keys())
    val_opt.node_token_lookup = node_token_lookup
    validation_dataset = MethodNamePredictionData(val_opt, opt.val_path,
                                                  False, False, True)
    ggnn = DenseGGNNModel(opt)
    # Kept for interactive debugging.
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    label_embeddings = ggnn.label_embeddings
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores
    loss_node = ggnn.loss
    optimizer = tf.compat.v1.train.AdamOptimizer(opt.lr)
    # UPDATE_OPS must run before the train step (e.g. batch-norm stats).
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    best_f1_score = get_best_f1_score(opt)
    print("Best f1 score : " + str(best_f1_score))
    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        print("Testing model.............")
        average_f1 = 0.0
        validation_batch_iterator = ThreadedIterator(
            validation_dataset.make_minibatch_iterator(), max_queue_size=5)
        all_predicted_labels = []
        all_ground_truth_labels = []
        all_paths = []
        for val_step, val_batch_data in enumerate(validation_batch_iterator):
            print("----------------------------------------")
            label_embeddings_matrix, scores = sess.run(
                [label_embeddings, logits],
                feed_dict={
                    ggnn.placeholders["num_vertices"]:
                    val_batch_data["num_vertices"],
                    ggnn.placeholders["adjacency_matrix"]:
                    val_batch_data['adjacency_matrix'],
                    ggnn.placeholders["node_type_indices"]:
                    val_batch_data["node_type_indices"],
                    ggnn.placeholders["node_token_indices"]:
                    val_batch_data["node_token_indices"],
                    ggnn.placeholders["is_training"]: False
                })
            predictions = np.argmax(scores, axis=1)
            ground_truths = np.argmax(val_batch_data['labels'], axis=1)
            # Map class indices back to label strings through the
            # respective (bidirectional) lookup tables.
            predicted_labels = []
            for prediction in predictions:
                predicted_labels.append(
                    train_label_lookup.inverse[prediction])
            ground_truth_labels = []
            for ground_truth in ground_truths:
                ground_truth_labels.append(
                    val_label_lookup.inverse[ground_truth])
            # all_predicted_labels.extend(predicted_labels)
            # all_ground_truth_labels.extend(ground_truth_labels)
            for i, file_path in enumerate(val_batch_data["paths"]):
                ground_truth = ground_truth_labels[i]
                predicted = predicted_labels[i]
                with open(mis_prediction_file, "a") as f10:
                    line = file_path + "," + ground_truth + "," + predicted
                    f10.write(line)
                    f10.write("\n")
def main(opt):
    """Train the DenseGGNN method-name-prediction model, validating periodically.

    Loads label/type/token vocabularies from ``preprocessed_data/``, builds
    train/validation datasets from ``sample_data/java-small-graph``, optionally
    restores an existing checkpoint, then runs the training loop, printing
    predicted vs. ground-truth labels on the validation set every
    ``opt.checkpoint_every`` steps.

    NOTE(review): this file defines ``main(opt)`` more than once; this later
    definition shadows the earlier one at import time — presumably only one
    variant is meant to survive.
    """
    # Forward lookups map label/type/token string -> index; the *_by_index
    # dicts are the reverse mapping used to decode predictions back to strings.
    train_label_lookup = {}
    train_label_lookup_by_index = {}
    train_node_type_lookup = {}
    train_node_token_lookup = {}
    val_label_lookup = {}
    val_label_lookup_by_index = {}
    val_node_type_lookup = {}
    val_node_token_lookup = {}

    # Vocabulary files: one "<index>,<symbol>" entry per line.
    node_type_vocabulary_path = "preprocessed_data/node_type_vocab.txt"
    train_label_vocabulary_path = "preprocessed_data/train_label_vocab.txt"
    train_token_vocabulary_path = "preprocessed_data/train_token_vocab.txt"
    val_label_vocabulary_path = "preprocessed_data/val_label_vocab.txt"
    val_token_vocabulary_path = "preprocessed_data/val_token_vocab.txt"

    with open(train_label_vocabulary_path, "r") as f1:
        data = f1.readlines()
        for line in data:
            splits = line.replace("\n", "").split(",")
            train_label_lookup[splits[1]] = int(splits[0])
            train_label_lookup_by_index[int(splits[0])] = splits[1]

    with open(node_type_vocabulary_path, "r") as f2:
        data = f2.readlines()
        for line in data:
            splits = line.replace("\n", "").split(",")
            train_node_type_lookup[splits[1]] = int(splits[0])

    with open(train_token_vocabulary_path, "r") as f3:
        data = f3.readlines()
        for line in data:
            splits = line.replace("\n", "").split(",")
            train_node_token_lookup[splits[1]] = int(splits[0])

    with open(val_label_vocabulary_path, "r") as f4:
        data = f4.readlines()
        for line in data:
            splits = line.replace("\n", "").split(",")
            val_label_lookup[splits[1]] = int(splits[0])
            val_label_lookup_by_index[int(splits[0])] = splits[1]

    with open(val_token_vocabulary_path, "r") as f5:
        data = f5.readlines()
        for line in data:
            splits = line.replace("\n", "").split(",")
            val_node_token_lookup[splits[1]] = int(splits[0])

    # Sentinel token appended at the end of each token vocabulary
    # (presumably an out-of-vocabulary / padding marker — confirm with the
    # dataset code).
    train_node_token_lookup["captain_america"] = len(
        train_node_token_lookup.keys())
    val_node_token_lookup["captain_america"] = len(
        val_node_token_lookup.keys())

    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    # print(train_label_lookup)

    # Training vocabularies drive graph construction; the validation opt is a
    # deep copy that swaps in the validation label/token lookups.
    opt.label_lookup = train_label_lookup
    opt.num_labels = len(train_label_lookup.keys())
    opt.node_type_lookup = train_node_type_lookup
    opt.node_token_lookup = train_node_token_lookup
    opt.path = "sample_data/java-small-graph/training"
    train_dataset = MethodNamePredictionData(opt, True, False, False)
    opt.n_edge_types = train_dataset.n_edge_types

    val_opt = copy.deepcopy(opt)
    val_opt.path = "sample_data/java-small-graph/validation"
    val_opt.label_lookup = val_label_lookup
    val_opt.num_labels = len(val_label_lookup.keys())
    val_opt.node_token_lookup = val_node_token_lookup
    validation_dataset = MethodNamePredictionData(val_opt, False, False, True)

    ggnn = DenseGGNNModel(opt)

    # For debugging purpose — handles on interesting graph tensors.
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores
    loss_node = ggnn.loss
    optimizer = tf.train.AdamOptimizer(opt.lr)
    training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        # Resume from the latest checkpoint when one exists.
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        for epoch in range(1, opt.epochs + 1):
            train_batch_iterator = ThreadedIterator(
                train_dataset.make_minibatch_iterator(), max_queue_size=1)
            for train_step, train_batch_data in enumerate(
                    train_batch_iterator):
                # One optimization step; dropout keep-probs of 0.5 are the
                # training-time settings.
                _, err, softmax_values_data, attention_scores_data = sess.run(
                    [training_point, loss_node, softmax_values,
                     attention_scores],
                    feed_dict={
                        ggnn.placeholders["num_vertices"]:
                        train_batch_data["num_vertices"],
                        ggnn.placeholders["adjacency_matrix"]:
                        train_batch_data['adjacency_matrix'],
                        ggnn.placeholders["labels"]:
                        train_batch_data['labels'],
                        ggnn.placeholders["node_type_indices"]:
                        train_batch_data["node_type_indices"],
                        ggnn.placeholders["node_token_indices"]:
                        train_batch_data["node_token_indices"],
                        ggnn.placeholders["graph_state_keep_prob"]: 0.5,
                        ggnn.placeholders["edge_weight_dropout_keep_prob"]: 0.5
                    }
                )
                print("Epoch:", epoch, "Step:", train_step, "Loss:", err)

                if train_step % opt.checkpoint_every == 0:
                    # --------------------------------------
                    print("Validating.......")
                    # predictions = []
                    validation_batch_iterator = ThreadedIterator(
                        validation_dataset.make_minibatch_iterator(),
                        max_queue_size=5)
                    for _, val_batch_data in enumerate(
                            validation_batch_iterator):
                        # Note: putting ggnn.placeholders["labels"]:
                        # train_batch_data['labels'] seems stupid but it is a
                        # work-around, num labels in train data vs validation
                        # data is different.  Dropout keep-probs are 1.0
                        # (disabled) at validation time.
                        softmax_values_data = sess.run(
                            [softmax_values],
                            feed_dict={
                                ggnn.placeholders["num_vertices"]:
                                val_batch_data["num_vertices"],
                                ggnn.placeholders["adjacency_matrix"]:
                                val_batch_data['adjacency_matrix'],
                                ggnn.placeholders["labels"]:
                                train_batch_data['labels'],
                                ggnn.placeholders["node_type_indices"]:
                                val_batch_data["node_type_indices"],
                                ggnn.placeholders["node_token_indices"]:
                                val_batch_data["node_token_indices"],
                                ggnn.placeholders["graph_state_keep_prob"]:
                                1.0,
                                ggnn.placeholders[
                                    "edge_weight_dropout_keep_prob"]: 1.0
                            }
                        )
                        # sess.run returned a one-element list, hence [0].
                        predictions = np.argmax(softmax_values_data[0], axis=1)
                        ground_truths = np.argmax(val_batch_data['labels'],
                                                  axis=1)
                        # print(ground_truths)

                        # Decode indices back to label strings; predictions
                        # are in the training label space, ground truths in
                        # the validation label space.
                        predicted_labels = []
                        for prediction in predictions:
                            predicted_labels.append(
                                train_label_lookup_by_index[prediction])
                        ground_truth_labels = []
                        for ground_truth in ground_truths:
                            ground_truth_labels.append(
                                val_label_lookup_by_index[ground_truth])

                        predicted_labels = transform_data(predicted_labels)
                        ground_truth_labels = transform_data(
                            ground_truth_labels)
                        print("----------")
                        print("Predicted: " + str(predicted_labels))
                        print("Ground truth: " + str(ground_truth_labels))