def test_model(test_trees, labels, embeddings, embedding_lookup, opt):
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    num_feats = len(embeddings[0])

    random.shuffle(test_trees)

    # build the inputs and outputs of the network
    nodes_node, children_node, codecaps_node = network.init_net_treecaps(
        num_feats, len(labels))
    out_node = network.out_layer(codecaps_node)
    labels_node, loss_node = network.loss_layer(codecaps_node, len(labels))

    optimizer = RAdamOptimizer(opt.lr)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'tree_network.ckpt')

    correct_labels = []
    predictions = []
    print('Computing test accuracy...')
    for batch in sampling.batch_samples(
            sampling.gen_samples(test_trees, labels, embeddings,
                                 embedding_lookup), 1):
        nodes, children, batch_labels = batch
        output = sess.run([out_node],
                          feed_dict={
                              nodes_node: nodes,
                              children_node: children,
                          })
        correct_labels.append(np.argmax(batch_labels))
        predictions.append(np.argmax(output))

    target_names = list(labels)
    print(classification_report(correct_labels, predictions,
                                target_names=target_names))
    print(confusion_matrix(correct_labels, predictions))
    print('*' * 50)
    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print('*' * 50)
def train_model(train_dataloader, val_dataloader, embeddings, embedding_lookup, opt):
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    train_left_trees = train_dataloader.left_trees
    train_right_trees = train_dataloader.right_trees
    train_labels = train_dataloader.labels

    val_left_trees = val_dataloader.left_trees
    val_right_trees = val_dataloader.right_trees
    val_labels = val_dataloader.labels

    n_classes = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node, right_children_node,
     hidden_node, left_score_node, right_score_node) = network.init_net_for_siamese(
        num_feats, opt.feature_size, weights, biases, opt.aggregation,
        opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0
    print("Begin training....")
    for epoch in range(1, epochs + 1):
        print("----------------------------------------------------")
        for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
                train_left_trees, train_right_trees, train_labels, embeddings,
                embedding_lookup, opt.train_batch_size, opt.batch_type):
            # "original" batches carry (nodes, children); other batch types also carry masks
            if opt.batch_type == "original":
                left_nodes, left_children = batch_left_trees
                right_nodes, right_children = batch_right_trees
            else:
                left_nodes, left_children, _ = batch_left_trees
                right_nodes, right_children, _ = batch_right_trees

            labels_one_hot = convert_labels_to_one_hot(batch_labels)
            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: labels_one_hot
                })
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err)

            if steps % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, checkfile, steps)
                print('Checkpoint saved.')
            steps += 1
        steps = 0

        # validate once per epoch
        correct_labels = []
        predictions = []
        for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
                val_left_trees, val_right_trees, val_labels, embeddings,
                embedding_lookup, opt.train_batch_size, opt.batch_type):
            if opt.batch_type == "original":
                left_nodes, left_children = batch_left_trees
                right_nodes, right_children = batch_right_trees
            else:
                left_nodes, left_children, _ = batch_left_trees
                right_nodes, right_children, _ = batch_right_trees

            labels_one_hot = convert_labels_to_one_hot(batch_labels)
            output = sess.run(
                [out_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: labels_one_hot
                })
            correct = np.argmax(labels_one_hot, axis=1)
            predicted = np.argmax(output[0], axis=1)
            correct_labels.extend(correct)
            predictions.extend(predicted)

        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions))
        print(confusion_matrix(correct_labels, predictions))
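# A minimal sketch of the `convert_labels_to_one_hot` helper the loops above rely
# on, assuming `batch_labels` holds integer class ids (0 or 1 for this binary
# siamese setup) and reusing the module-level numpy import (`np`) already used
# throughout this file. The repository's own version may differ.
def convert_labels_to_one_hot(batch_labels, n_classes=2):
    one_hot = np.zeros((len(batch_labels), n_classes), dtype=np.float32)
    for row, label in enumerate(batch_labels):
        one_hot[row, int(label)] = 1.0  # flip on the column for this sample's class
    return one_hot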
def main(opt):
    # aggregation_name and distributed_function_name are assumed to be
    # module-level settings, as in the original code
    target_directory = "live_test/github_java/sort_function/"
    file_name = aggregation_name + "_" + distributed_function_name + "_function.csv"

    print("Loading embeddings....")
    with open(opt.embeddings_directory, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')

    labels = [str(i) for i in range(1, opt.n_classes + 1)]

    logdir = opt.model_path
    batch_size = opt.test_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    # Loading program file
    # test_trees, node_ids, node_types, subtree_ids, pkl_path = load_program(opt)

    # Init model
    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(logdir)

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        node_embedding_size, len(labels), opt.feature_size, weights, biases,
        opt.aggregation, opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        test_trees, node_ids, node_types, pkl_path, pb_path = load_program(opt.test_file)
        attention_score_scaled_map = predict(sess, out_node, attention_score_node,
                                             nodes_node, children_node, pkl_path,
                                             pb_path, test_trees, labels, node_ids,
                                             node_types, embeddings, embed_lookup)

        attention_path = opt.test_file.split(".")[0] + ".csv"
        if os.path.exists(attention_path):
            os.remove(attention_path)
        with open(attention_path, "a") as f:
            for k, v in attention_score_scaled_map.items():
                f.write(str(k) + "," + str(v))
                f.write("\n")

        generate_visualization(pb_path, attention_path)
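# A hedged sketch of the `generate_visualization` step invoked above. It is
# assumed to shell out to the same "yijun/fast" docker image used elsewhere in
# this file to render an attention-annotated HTML view of the .pb AST; the flags
# mirror the command built in the pairwise test code below, and the derived
# output path is hypothetical.
def generate_visualization(pb_path, attention_path):
    html_path = attention_path.replace(".csv", ".html")  # hypothetical output location
    cmd = ("docker run --rm -v $(pwd):/e -it yijun/fast -H 0 -t -x "
           + attention_path + " " + pb_path + " > " + html_path)
    print(cmd)
    os.system(cmd)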
def train_model(embeddings):
    # data_type, feature_size, embeddingg and KERNEL are assumed to be
    # module-level settings, as in the original code
    dictt = {}
    listrec = []
    file_list = os.listdir('dataset/features/features2')
    z = 0
    for file in file_list:
        file_path = 'dataset/features/features2' + '/' + file
        if not os.path.exists(file_path):
            listrec.append(file)
            continue
        with open(file_path, 'r', encoding="utf-8") as faa:
            sample = json.loads(faa.read())
        if sample == "" or sample == " " or sample == "\n":
            z += 1
            listrec.append(file)
            continue
        dictt[file] = sample
    print("length of feature: " + str(len(dictt)))
    # print("invalid file number:", z)

    TrainDatalist = []
    file = open("dataset/dataset/train/" + data_type + ".txt", 'r')
    for line in file:
        if line == "" or line == " ":
            continue
        TrainDatalist.append(line)
    print('len ( TrainData ) = ', len(TrainDatalist))
    file.close()

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_finetune(
        feature_size, embeddingg, KERNEL)
    labels_node, loss_node = network.loss_layer(res)

    aaa = 0
    global_step = tf.Variable(0, trainable=False)
    learn_rate = tf.train.exponential_decay(LEARN_RATE, global_step,
                                            len(TrainDatalist), 0.9, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learn_rate)
    train_step = optimizer.minimize(loss_node, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)  # config=tf.ConfigProto(device_count={'GPU':0})
    sess.run(tf.global_variables_initializer())

    # the loop variable is named `epoch` here; the original reused the name
    # `global_step`, which shadowed the tf.Variable above
    for epoch in range(1, EPOCHS + 1):
        k = 0
        random.shuffle(TrainDatalist)
        for line in TrainDatalist:
            line = line.rstrip('\n')
            train_data = line.split('\t')
            if len(train_data) != 3:
                continue
            k += 1
            if (train_data[0] in listrec) or (train_data[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, batch_labels = getData_id_type(
                train_data, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                    labels_node: batch_labels,
                })
            learn_rate_var = sess.run(learn_rate)
            aaa += 1

        test_list = ['argouml', 'gwt', 'jruby', 'xstream', 'all']
        print('------------Model 4' + data_type + '------------')
        for name in test_list:
            print('start test: ' + name)
            correct_labels_test = []
            predictions_test = []
            for _ in range(0, 20):
                predictions_test.append([])
            ff = open("dataset/dataset/test/" + name + "/" + data_type + ".txt", 'r')
            line = "123"
            k = 0
            while line:
                line = ff.readline().rstrip('\n')
                test_data = line.split('\t')
                if len(test_data) != 3:
                    continue
                if (test_data[0] in listrec) or (test_data[1] in listrec):
                    continue
                nodes11, children1, nodes22, children2, _ = getData_id_type(
                    test_data, dictt, embeddings)
                label = test_data[2]
                k += 1
                output = sess.run(
                    [res],
                    feed_dict={
                        nodes_node1: nodes11,
                        children_node1: children1,
                        nodes_node2: nodes22,
                        children_node2: children2,
                    })
                correct_labels_test.append(int(label))
                # sweep 20 candidate thresholds from -1.0 upwards in steps of 0.1
                threshold = -1.0
                for i in range(0, 20):
                    if output[0] >= threshold:
                        predictions_test[i].append(1)
                    else:
                        predictions_test[i].append(-1)
                    threshold += 0.1
                with open("dataset/cluster/4/" + name + "/" +
                          test_data[0].split('_')[0] + '_' +
                          test_data[1].split('_')[0] + '.txt', 'a') as fout:
                    fout.writelines(test_data[2] + '\t' + test_data[0] + '\t' +
                                    test_data[1] + '\t' + str(output[0]) + '\n')

            # The choice of the threshold will not affect the clustering results.
            # We just investigate the max accuracy of our model's prediction of
            # the relationship between chunks.
            maxstep = 0
            maxaccuracy = 0
            for i in range(0, 20):
                accuracy = accuracy_score(correct_labels_test, predictions_test[i])
                if accuracy > maxaccuracy:
                    maxaccuracy = accuracy
                    maxstep = i
            threshold = -1.0 + maxstep * 0.1
            cm = confusion_matrix(correct_labels_test, predictions_test[maxstep],
                                  labels=[-1, 1])
            tn, fp, fn, tp = cm.ravel()
            accuracy = accuracy_score(correct_labels_test, predictions_test[maxstep])
            print("threshold: " + str(threshold))
            print("combine precision_test:", tp / (tp + fp))
            print("combine recall_test:", tp / (tp + fn))
            print("separate precision_test:", tn / (tn + fn))
            print("separate recall_test:", tn / (tn + fp))
            print("accuracy_test:" + str(accuracy))
            print("tp:" + str(tp) + " tn:" + str(tn) + " fp:" + str(fp) + " fn:" + str(fn))
            # print(learn_rate_var)
            if name == "all":
                with open("dataset/cluster/4/total_result.txt", 'a') as total_result:
                    total_result.writelines(str(tp) + '\t' + str(tn) + '\t' +
                                            str(fp) + '\t' + str(fn) + '\n')
            ff.close()
        print('---------------------------------------------')
def test_model(test_dataloader, embeddings, embedding_lookup, opt):
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    test_left_trees = test_dataloader.left_trees
    test_right_trees = test_dataloader.right_trees
    test_labels = test_dataloader.labels

    n_classes = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node, right_children_node,
     logits_node, left_mask_nodes, right_mask_nodes,
     attention_matrix_nodes) = network.init_net_for_siamese_2(
        num_feats, opt.feature_size, weights, biases)

    out_node = network.out_layer(logits_node)
    labels_node, loss_node = network.loss_layer(logits_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0
    print("Begin computing accuracy....")

    correct_labels = []
    predictions = []
    for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
            test_left_trees, test_right_trees, test_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type):
        left_nodes, left_children, left_masks = batch_left_trees
        right_nodes, right_children, right_masks = batch_right_trees

        labels_one_hot = convert_labels_to_one_hot(batch_labels)
        matching_matrices, output = sess.run(
            [attention_matrix_nodes, out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: labels_one_hot,
                left_mask_nodes: left_masks,
                right_mask_nodes: right_masks,
            })

        for i, matrix in enumerate(matching_matrices):
            np.savetxt("matching_matrix/" + str(i) + ".csv", matrix, delimiter=",")

        print(output)
        print(output.shape)
        correct = np.argmax(labels_one_hot, axis=1)
        predicted = np.argmax(output, axis=1)
        correct_labels.extend(correct)
        predictions.extend(predicted)

    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions))
    print(confusion_matrix(correct_labels, predictions))
def train_model(logdir, infile, embedfile, epochs=EPOCHS, training="True", testing="True"):
    """Train a classifier to label ASTs"""
    print("Loading trees...")
    with open(infile, 'rb') as fh:
        trees, test_trees, labels = pickle.load(fh)
    random.shuffle(trees)
    print(labels)

    print("Loading embeddings....")
    with open(embedfile, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh)

    num_feats = len(embeddings[0])

    # build the inputs and outputs of the network
    nodes_node, children_node, hidden_node = network.init_net(num_feats, len(labels))
    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    tf.summary.scalar('loss', loss_node)

    ### init the graph
    sess = tf.Session()  # config=tf.ConfigProto(device_count={'GPU':0})
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')

    if training == "True":
        print("Begin training..........")
        num_batches = len(trees) // BATCH_SIZE + (
            1 if len(trees) % BATCH_SIZE != 0 else 0)
        for epoch in range(1, epochs + 1):
            for i, batch in enumerate(
                    sampling.batch_samples(
                        sampling.gen_samples(trees, labels, embeddings, embed_lookup),
                        BATCH_SIZE)):
                nodes, children, batch_labels = batch
                step = (epoch - 1) * num_batches + i * BATCH_SIZE

                if not nodes:
                    continue  # don't try to train on an empty batch

                _, summary, err, out = sess.run(
                    [train_step, summaries, loss_node, out_node],
                    feed_dict={
                        nodes_node: nodes,
                        children_node: children,
                        labels_node: batch_labels
                    })
                print('Epoch:', epoch, 'Step:', step, 'Loss:', err,
                      'Max nodes:', len(nodes[0]))
                writer.add_summary(summary, step)

                if step % CHECKPOINT_EVERY == 0:
                    # save state so we can resume later
                    saver.save(sess, checkfile, step)
                    print('Checkpoint saved, epoch:' + str(epoch) + ', step: ' +
                          str(step) + ', loss: ' + str(err) + '.')

        saver.save(sess, checkfile, step)

    # compute the accuracy on the held-out trees
    if testing == "True":
        correct_labels = []
        predictions = []
        print('Computing test accuracy...')
        for batch in sampling.batch_samples(
                sampling.gen_samples(test_trees, labels, embeddings, embed_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children,
                              })
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))

        target_names = list(labels)
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions,
                                    target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))
def main():
    print("Loading embeddings....")
    with open(opt.embeddings_directory, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')

    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])
    n_classes = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node, right_children_node,
     hidden_node, left_score_node, right_score_node) = network.init_net_for_siamese(
        num_feats, opt.feature_size, weights, biases, opt.aggregation,
        opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0
    print("Begin computing accuracy....")

    test_file = opt.test_data
    # test_dataloader = CrossLanguageProgramDataForLiveTest(test_file, 1, opt.n_classes)
    # test_left_trees = test_dataloader.left_trees
    # test_right_trees = test_dataloader.right_trees
    # test_labels = test_dataloader.labels

    all_pairs_index = []
    with open(opt.test_data, "r") as f:
        data = f.readlines()
        for line in data:
            print(line)
            all_pairs_index.append(line.replace("\n", ""))

    for i, pair in tqdm(enumerate(all_pairs_index)):
        splits = pair.split(",")
        left_path = splits[0]
        right_path = splits[1]
        label = splits[2]

        pairs, left_node_ids_list, right_node_ids_list = load_single_pair_for_live_test(
            left_path, right_path, label)
        predict(sess, i, left_path, right_path, pairs, left_node_ids_list,
                right_node_ids_list, out_node, left_score_node, right_score_node,
                left_nodes_node, left_children_node, right_nodes_node,
                right_children_node, labels_node, embeddings, embed_lookup, opt)
def train_model(logdir, inputs, embeddings_list_url, node_map_url, epochs=EPOCHS,
                with_drop_out=1, device="0"):
    os.environ['CUDA_VISIBLE_DEVICES'] = device
    print("Using device : " + device)
    print("Batch size : " + str(BATCH_SIZE))
    if int(with_drop_out) == 1:
        print("Training with drop out rate : " + str(DROP_OUT))
    n_classes = 2

    print("Loading training data....")
    with open(inputs, "rb") as fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        all_training_pairs = pickle.load(fh)
    random.shuffle(all_training_pairs)

    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)
    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        node_map = pickle.load(node_map_fh)

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)
    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    print("Left pooling shape : " + str(left_pooling_node.shape))
    print("Right pooling shape : " + str(right_pooling_node.shape))

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    print(merge_node.shape)

    hidden_node = network.hidden_layer(merge_node, 600, 300)
    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node, rate=DROP_OUT, training=True)
    hidden_node = network.hidden_layer(hidden_node, 300, 100)
    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node, rate=DROP_OUT, training=True)
    hidden_node = network.hidden_layer(hidden_node, 100, n_classes)
    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node, rate=DROP_OUT, training=True)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        # config_file is assumed to be a module-level path
        file_handler = open(config_file, 'r')
        contents = json.load(file_handler)
        using_vector_lookup_left = contents['using_vector_lookup_left'] == "false"

    print("Begin training....")
    for epoch in range(1, epochs + 1):
        shuffle_left_trees, shuffle_right_trees, labels = get_trees_from_pairs(
            all_training_pairs)
        print("Len left:", len(shuffle_left_trees),
              "Len right:", len(shuffle_right_trees))
        for left_gen_batch, right_gen_batch, labels_batch in sampling.batch_random_samples_2_sides(
                shuffle_left_trees, shuffle_right_trees, embeddings_list, node_map,
                labels, BATCH_SIZE):
            print("----------------------------------------------------")
            print("Len of label batch : " + str(len(labels_batch)))
            left_nodes, left_children = left_gen_batch
            right_nodes, right_children = right_gen_batch
            sim_labels, sim_labels_num = get_one_hot_similarity_label(labels_batch)
            _, err, out, merge, labs = sess.run(
                [train_step, loss_node, out_node, merge_node, labels_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: sim_labels
                })
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "True Label vs Predicted Label:", list(zip(labs, out)))

            if steps % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, checkfile, steps)
                print('Checkpoint saved.')
            steps += 1
        steps = 0
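# Minimal sketches of two helpers assumed by the training loop above. Both names
# come from the original code, but these bodies are reconstructions: a training
# pair is assumed to be a (left_tree, right_tree, label) tuple, and a pair is
# treated as "similar" when its label is 1. The repo's own versions may differ.
def get_trees_from_pairs(pairs):
    left_trees = [pair[0] for pair in pairs]
    right_trees = [pair[1] for pair in pairs]
    labels = [pair[2] for pair in pairs]
    return left_trees, right_trees, labels

def get_one_hot_similarity_label(labels_batch):
    # one-hot encode each binary similarity label: [1, 0] = dissimilar, [0, 1] = similar
    sim_labels = []
    for label in labels_batch:
        sim_labels.append([0.0, 1.0] if int(label) == 1 else [1.0, 0.0])
    return sim_labels, len(sim_labels)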
def test_model(test_trees, labels, embeddings, embedding_lookup, opt):
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    random.shuffle(test_trees)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(logdir)

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        node_embedding_size, len(labels), opt.feature_size, weights, biases,
        opt.aggregation, opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)

        correct_labels = []
        predictions = []
        print('Computing test accuracy...')
        for batch in sampling.batch_samples(
                sampling.gen_samples(test_trees, labels, embeddings,
                                     embedding_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children,
                              })
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))

        target_names = list(labels)
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions,
                                    target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))
def test_model(test_dataloader, embeddings, embedding_lookup, opt):
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    test_left_trees = test_dataloader.left_trees
    test_right_trees = test_dataloader.right_trees
    test_labels = test_dataloader.labels
    test_left_node_ids_list = test_dataloader.left_node_ids_list
    test_right_node_ids_list = test_dataloader.right_node_ids_list
    print(test_left_node_ids_list)
    print("Num id left : " + str(len(test_left_node_ids_list[0])))
    print("Num id right : " + str(len(test_right_node_ids_list[0])))

    n_classes = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    (left_nodes_node, left_children_node, right_nodes_node, right_children_node,
     logits_node, left_mask_nodes, right_mask_nodes,
     attention_matrix_nodes) = network.init_net_for_siamese_2(
        num_feats, opt.feature_size, weights, biases)
    # alternative graph:
    # ... = network.init_net_for_siamese_3(num_feats, opt.feature_size)

    out_node = network.out_layer(logits_node)
    labels_node, loss_node = network.loss_layer(logits_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0
    print("Begin computing accuracy....")

    correct_labels = []
    predictions = []
    for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
            test_left_trees, test_right_trees, test_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type):
        left_nodes, left_children, left_masks = batch_left_trees
        right_nodes, right_children, right_masks = batch_right_trees

        labels_one_hot = convert_labels_to_one_hot(batch_labels)
        matching_matrices, output = sess.run(
            [attention_matrix_nodes, out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: labels_one_hot,
                left_mask_nodes: left_masks,
                right_mask_nodes: right_masks,
            })

        # trim the matching matrix to the real (unpadded) node counts
        matrix = matching_matrices[0]
        if len(test_left_node_ids_list[0]) < len(test_right_node_ids_list[0]):
            matrix = matrix[:len(test_left_node_ids_list[0]), ...]
        else:
            matrix = matrix[..., :len(test_right_node_ids_list[0])]

        matrix_pd = pd.DataFrame(data=matrix,
                                 index=test_left_node_ids_list[0],
                                 columns=test_right_node_ids_list[0])
        matrix_pd.to_csv("live_test/github_pairwise_java/sort/1_matrix.csv", sep=",")

        left_matrix_aggregate_idx = matrix_pd.idxmax(axis=1)
        left_matrix_aggregate_idx.to_csv(
            "live_test/github_pairwise_java/sort/left_aggregate_attention_idx.csv",
            sep=",")
        right_matrix_aggregate_idx = matrix_pd.idxmax(axis=0)
        right_matrix_aggregate_idx.to_csv(
            "live_test/github_pairwise_java/sort/right_aggregate_attention_idx.csv",
            sep=",")

        left_matrix_aggregate = matrix_pd.max(axis=1)
        left_matrix_aggregate.to_csv(
            "live_test/github_pairwise_java/sort/left_aggregate_attention.csv",
            sep=",")
        left_matrix_max_dict = left_matrix_aggregate.to_dict()
        right_matrix_aggregate = matrix_pd.max(axis=0)
        right_matrix_aggregate.to_csv(
            "live_test/github_pairwise_java/sort/right_aggregate_attention.csv",
            sep=",")
        right_matrix_max_dict = right_matrix_aggregate.to_dict()

        left_scaled_attention_map = scale_attention(left_matrix_max_dict)
        right_scaled_attention_map = scale_attention(right_matrix_max_dict)

        left_attention_path = "live_test/github_pairwise_java/sort/left_attention_scaled.csv"
        left_pb_path = "github_java_sort_function_pb/5/3.java.pb"
        left_normal_html_path = "live_test/github_pairwise_java/sort/left_normal.html"
        with open(left_attention_path, "w") as f1:
            # scale_attention is assumed to return a dict of node id -> scaled score
            for key, score in left_scaled_attention_map.items():
                f1.write(str(key) + "," + str(score) + "\n")
        normal_cmd = ("docker run --rm -v $(pwd):/e -it yijun/fast -H 0 -t -x " +
                      left_attention_path + " " + left_pb_path + " > " +
                      left_normal_html_path)
        print(normal_cmd)
        os.system(normal_cmd)

        right_attention_path = "live_test/github_pairwise_java/sort/right_attention_scaled.csv"
        right_pb_path = "github_java_sort_function_pb/5/105.java.pb"
        right_normal_html_path = "live_test/github_pairwise_java/sort/right_normal.html"
        with open(right_attention_path, "w") as f1:
            for key, score in right_scaled_attention_map.items():
                f1.write(str(key) + "," + str(score) + "\n")
        normal_cmd = ("docker run --rm -v $(pwd):/e -it yijun/fast -H 0 -t -x " +
                      right_attention_path + " " + right_pb_path + " > " +
                      right_normal_html_path)
        print(normal_cmd)
        os.system(normal_cmd)

        print(output)
        print(labels_one_hot)
        correct = np.argmax(labels_one_hot, axis=1)
        predicted = np.argmax(output, axis=1)
        correct_labels.extend(correct)
        predictions.extend(predicted)

    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions))
    print(confusion_matrix(correct_labels, predictions))
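# A minimal sketch of the `scale_attention` helper assumed above: min-max scale
# the per-node attention scores into [0, 1] so the downstream visualizer can map
# them to a colour range. The exact normalization used by the original repo may
# differ.
def scale_attention(score_map):
    scores = list(score_map.values())
    lo, hi = min(scores), max(scores)
    span = (hi - lo) or 1.0  # avoid dividing by zero when all scores are equal
    return {node_id: (score - lo) / span for node_id, score in score_map.items()}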
def train_model(train_trees, val_trees, labels, embeddings, embedding_lookup, opt):
    """Train a classifier to label ASTs"""
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    random.shuffle(train_trees)
    random.shuffle(val_trees)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(logdir)

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r": tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention": tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }
    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size]), name="b_conv"),
    }

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        node_embedding_size, len(labels), opt.feature_size, weights, biases,
        opt.aggregation, opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    if opt.training:
        print("Begin training..........")
        with tf.Session() as sess:
            sess.run(init)
            if ckpt and ckpt.model_checkpoint_path:
                print("Continue training with old model")
                print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
                for i, var in enumerate(saver._var_list):
                    print('Var {}: {}'.format(i, var))

            num_batches = len(train_trees) // batch_size + (
                1 if len(train_trees) % batch_size != 0 else 0)
            for epoch in range(1, epochs + 1):
                for i, batch in enumerate(
                        sampling.batch_samples(
                            sampling.gen_samples(train_trees, labels, embeddings,
                                                 embedding_lookup), batch_size)):
                    nodes, children, batch_labels = batch
                    step = (epoch - 1) * num_batches + i * batch_size

                    if not nodes:
                        continue  # don't try to train on an empty batch

                    _, err, out = sess.run(
                        [train_step, loss_node, out_node],
                        feed_dict={
                            nodes_node: nodes,
                            children_node: children,
                            labels_node: batch_labels
                        })
                    print('Epoch:', epoch, 'Step:', step, 'Loss:', err,
                          'Max nodes:', len(nodes[0]))

                    if step % CHECKPOINT_EVERY == 0:
                        # save state so we can resume later
                        saver.save(sess, checkfile)
                        print('Checkpoint saved, epoch:' + str(epoch) + ', step: ' +
                              str(step) + ', loss: ' + str(err) + '.')

                # validate once per epoch
                correct_labels = []
                predictions = []
                for batch in sampling.batch_samples(
                        sampling.gen_samples(val_trees, labels, embeddings,
                                             embedding_lookup), 1):
                    nodes, children, batch_labels = batch
                    output = sess.run([out_node],
                                      feed_dict={
                                          nodes_node: nodes,
                                          children_node: children,
                                      })
                    correct_labels.append(np.argmax(batch_labels))
                    predictions.append(np.argmax(output))

                target_names = list(labels)
                print('Accuracy:', accuracy_score(correct_labels, predictions))
                print(classification_report(correct_labels, predictions,
                                            target_names=target_names))
                print(confusion_matrix(correct_labels, predictions))

            print("Finish all iters, storing the whole model..........")
            saver.save(sess, checkfile)
def test_model(logdir, inputs, embeddings_list_url, node_map_url, epochs=EPOCHS):
    """Test the siamese classifier on question pairs"""
    n_classes = 2

    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)
    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        node_map = pickle.load(node_map_fh)

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)
    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    hidden_node = network.hidden_layer(merge_node, 600, 300)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)
    hidden_node = network.hidden_layer(hidden_node, 300, 100)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)
    hidden_node = network.hidden_layer(hidden_node, 100, n_classes)
    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    sess = tf.Session()  # config=tf.ConfigProto(device_count={'GPU':0})
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('Checkpoint not found.')

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')
    with open(inputs, "r") as csvfile:  # text mode for csv.DictReader under Python 3
        test_data_reader = csv.DictReader(csvfile, delimiter=',')
        for row in test_data_reader:
            print("----------------------")
            print(smart_str(row['test_id']))
            print(smart_str(row['question1']))
            print(smart_str(row['question2']))
            try:
                left_tree, right_tree = get_trees(smart_str(row['question1']),
                                                  smart_str(row['question2']))
                left_nodes, left_children, right_nodes, right_children = sampling.patch_data(
                    left_tree, right_tree, embeddings_list, node_map)
                output = sess.run(
                    [out_node],
                    feed_dict={
                        left_nodes_node: left_nodes,
                        left_children_node: left_children,
                        right_nodes_node: right_nodes,
                        right_children_node: right_children
                    })
                print(output)
                predicted = np.argmax(output[0])
                print(predicted)
                with open("data/predict_proba2.csv", "a") as f2:
                    f2.write(row['test_id'] + "," +
                             str(format(output[0][0][1], "f")) + "\n")
            except Exception as e:
                print("Error : " + str(e))
                with open("data/predict_proba2.csv", "a") as f2:
                    f2.write(row['test_id'] + "," + "0" + "\n")
def train_model(train_trees, val_trees, labels, embeddings, embedding_lookup, opt):
    max_acc = 0.0
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    num_feats = len(embeddings[0])

    random.shuffle(train_trees)

    nodes_node, children_node, codecaps_node = network.init_net_treecaps(
        num_feats, len(labels))
    codecaps_node = tf.identity(codecaps_node, name="codecaps_node")

    out_node = network.out_layer(codecaps_node)
    labels_node, loss_node = network.loss_layer(codecaps_node, len(labels))

    optimizer = RAdamOptimizer(opt.lr)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'tree_network.ckpt')

    print("Begin training..........")
    num_batches = len(train_trees) // batch_size + (
        1 if len(train_trees) % batch_size != 0 else 0)
    for epoch in range(1, epochs + 1):
        bar = progressbar.ProgressBar(
            maxval=len(train_trees),
            widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()
        for i, batch in enumerate(
                sampling.batch_samples(
                    sampling.gen_samples(train_trees, labels, embeddings,
                                         embedding_lookup), batch_size)):
            nodes, children, batch_labels = batch
            step = (epoch - 1) * num_batches + i * batch_size

            if not nodes:
                continue

            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict={
                    nodes_node: nodes,
                    children_node: children,
                    labels_node: batch_labels
                })
            bar.update(i + 1)
        bar.finish()

        correct_labels = []
        predictions = []
        logits = []
        for batch in sampling.batch_samples(
                sampling.gen_samples(val_trees, labels, embeddings,
                                     embedding_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children
                              })
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))
            logits.append(output)

        target_names = list(labels)
        acc = accuracy_score(correct_labels, predictions)
        if acc > max_acc:
            max_acc = acc
            saver.save(sess, checkfile)
            np.save(opt.model_path + '/logits', np.array(logits))
            np.save(opt.model_path + '/correct', np.array(correct_labels))

        print('Epoch', str(epoch), 'Accuracy:', acc, 'Max Acc: ', max_acc)
        # csv_log is assumed to be a module-level log file handle
        csv_log.write(str(epoch) + ',' + str(acc) + ',' + str(max_acc) + '\n')

    print("Finish all iters, storing the whole model..........")
def train_model(train_trees, val_trees, labels, embeddings, embedding_lookup, opt):
    """Train a classifier to label ASTs"""
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    num_feats = len(embeddings[0])

    random.shuffle(train_trees)
    random.shuffle(val_trees)

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        num_feats, len(labels), opt.aggregation)
    hidden_node = tf.identity(hidden_node, name="hidden_node")

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    sess = tf.Session()  # config=tf.ConfigProto(device_count={'GPU':0})
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')

    print("Begin training..........")
    num_batches = len(train_trees) // batch_size + (
        1 if len(train_trees) % batch_size != 0 else 0)
    for epoch in range(1, epochs + 1):
        for i, batch in enumerate(
                sampling.batch_samples(
                    sampling.gen_samples(train_trees, labels, embeddings,
                                         embedding_lookup), batch_size)):
            nodes, children, batch_labels = batch
            step = (epoch - 1) * num_batches + i * batch_size

            if not nodes:
                continue  # don't try to train on an empty batch

            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict={
                    nodes_node: nodes,
                    children_node: children,
                    labels_node: batch_labels
                })
            print('Epoch:', epoch, 'Step:', step, 'Loss:', err,
                  'Max nodes:', len(nodes[0]))

            if step % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, checkfile)
                print('Checkpoint saved, epoch:' + str(epoch) + ', step: ' +
                      str(step) + ', loss: ' + str(err) + '.')

        correct_labels = []
        predictions = []
        for batch in sampling.batch_samples(
                sampling.gen_samples(val_trees, labels, embeddings,
                                     embedding_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children,
                              })
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))

        target_names = list(labels)
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions,
                                    target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))

    print("Finish all iters, storing the whole model..........")
    saver.save(sess, checkfile)
def train_model(infile, embeddings, epochs=EPOCHS):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    num_feats = 100

    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_nofinetune(
        num_feats)
    labels_node, loss_node = network.loss_layer(res)

    optimizer = tf.train.GradientDescentOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)  # config=tf.ConfigProto(device_count={'GPU':0})
    sess.run(tf.global_variables_initializer())

    # parse every file listed in flistPOJ.txt once and cache the traversed trees
    dictt = {}
    listrec = []
    f = open("flistPOJ.txt", 'r')
    line = f.readline().rstrip('\t')
    l = line.split('\t')
    for ll in l:
        if not os.path.exists(ll):
            listrec.append(ll)
            continue
        tree = pycparser.parse_file(ll)
        sample, size = _traverse_tree_noast(tree)
        dictt[ll] = sample
    f.close()

    for epoch in range(1, epochs + 1):
        f = open(infile, 'r')
        line = "123"
        aaa = 0
        while line:
            line = f.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            if l[0] in listrec:
                continue
            if l[1] in listrec:
                continue
            nodes1, children1, nodes2, children2, batch_labels = getData_nofinetune(
                l, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                    labels_node: [batch_labels]
                })
            maxnodes = max(len(nodes1[0]), len(nodes2[0]))
            if aaa % 1000 == 0:
                print('Epoch:', epoch, 'Step:', aaa, 'Loss:', err, 'R:', r,
                      'Max nodes:', maxnodes)
            aaa += 1
        f.close()

        # sweep 15 candidate thresholds on the dev set and keep the best by F1
        correct_labels_dev = []
        predictions_dev = []
        for reci in range(0, 15):
            predictions_dev.append([])
        ff = open("./datasetForVariantsTBCCD/POJ/devdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            nodes1, children1, nodes2, children2, batch_labels = getData_nofinetune(
                l, dictt, embeddings)
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_dev.append(int(l[2]))
            threshold = -0.7
            for i in range(0, 15):
                if output[0] >= threshold:
                    predictions_dev[i].append(1)
                else:
                    predictions_dev[i].append(-1)
                threshold += 0.1

        maxstep = 0
        maxf1value = -1.0
        for i in range(0, 15):
            f1score = f1_score(correct_labels_dev, predictions_dev[i],
                               average='binary')
            if f1score > maxf1value:
                maxf1value = f1score
                maxstep = i
        ff.close()

        # apply the best dev threshold to the test set
        correct_labels_test = []
        predictions_test = []
        ff = open("./datasetForVariantsTBCCD/POJ/testdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            nodes1, children1, nodes2, children2, batch_labels = getData_nofinetune(
                l, dictt, embeddings)
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_test.append(int(l[2]))
            threshold = -0.7 + maxstep * 0.1
            if output[0] >= threshold:
                predictions_test.append(1)
            else:
                predictions_test.append(-1)
        ff.close()

        print("starttest:\n")
        print("threshold:")
        print(threshold)
        p = precision_score(correct_labels_test, predictions_test, average='binary')
        r = recall_score(correct_labels_test, predictions_test, average='binary')
        f1score = f1_score(correct_labels_test, predictions_test, average='binary')
        print("recall_test:" + str(r))
        print("precision_test:" + str(p))
        print("f1score_test:" + str(f1score))
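# A minimal sketch of the `_traverse_tree_noast` helper assumed above: walk a
# pycparser AST breadth-first and build the nested {node, children} dicts that
# the batching code consumes, returning the tree plus its node count. The real
# implementation in the repo may record different per-node information.
def _traverse_tree_noast(tree):
    root = {"node": type(tree).__name__, "children": []}
    queue = [(tree, root)]
    size = 0
    while queue:
        ast_node, json_node = queue.pop(0)
        size += 1
        for _, child in ast_node.children():  # pycparser nodes expose (name, node) pairs
            child_json = {"node": type(child).__name__, "children": []}
            json_node["children"].append(child_json)
            queue.append((child, child_json))
    return root, size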
def test_model(logdir, inputs, left_embedfile, right_embedfile, epochs=EPOCHS):
    """Test the siamese classifier on pairs of ASTs"""
    n_classes = 2
    left_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]
    right_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]

    with open(inputs, "rb") as fh:
        testing_pairs = pickle.load(fh)

    print("Loading embedding vectors....")
    with open(left_embedfile, 'rb') as fh:
        left_embeddings, left_embed_lookup = pickle.load(fh)
    with open(right_embedfile, 'rb') as fh:
        right_embeddings, right_embed_lookup = pickle.load(fh)

    num_feats = len(left_embeddings[0])

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)
    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    hidden_node = network.hidden_layer(merge_node, 200, 200)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)
    hidden_node = network.hidden_layer(hidden_node, 200, 200)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)
    hidden_node = network.hidden_layer(hidden_node, 200, n_classes)
    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    sess = tf.Session()  # config=tf.ConfigProto(device_count={'GPU':0})
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('Checkpoint not found.')

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    left_trees, right_trees = get_trees_from_pairs(testing_pairs)

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        # config_file is assumed to be a module-level path
        file_handler = open(config_file, 'r')
        contents = json.load(file_handler)
        using_vector_lookup_left = contents['using_vector_lookup_left'] == "false"

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')
    for left_gen_batch, right_gen_batch in sampling.batch_random_samples_2_sides(
            left_trees, left_algo_labels, right_trees, right_algo_labels,
            left_embeddings, left_embed_lookup, right_embeddings,
            right_embed_lookup, using_vector_lookup_left, False, TEST_BATCH_SIZE):
        left_nodes, left_children, left_labels_one_hot, left_labels = left_gen_batch
        right_nodes, right_children, right_labels_one_hot, right_labels = right_gen_batch
        sim_labels, _ = get_one_hot_similarity_label(left_labels, right_labels)
        print("sim labels : " + str(sim_labels))
        output = sess.run(
            [out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: sim_labels
            })
        correct = np.argmax(sim_labels[0])
        predicted = np.argmax(output[0])
        check = correct == predicted
        print('Out:', output, "Status:", check)
        correct_labels.append(correct)
        predictions.append(predicted)

    target_names = ["0", "1"]
    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions,
                                target_names=target_names))
    print(confusion_matrix(correct_labels, predictions))
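# A minimal sketch of the two-argument `get_one_hot_similarity_label` used in the
# testing loop above, assuming a left/right pair counts as "similar" when both
# sides carry the same algorithm label; the second return value is assumed to be
# the number of similar pairs. The repo's own version may differ.
def get_one_hot_similarity_label(left_labels, right_labels):
    sim_labels = []
    num_similar = 0
    for left, right in zip(left_labels, right_labels):
        if left == right:
            sim_labels.append([0.0, 1.0])  # same algorithm class -> similar
            num_similar += 1
        else:
            sim_labels.append([1.0, 0.0])  # different classes -> dissimilar
    return sim_labels, num_similar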
def train_model(train_dataloader, val_dataloader, embeddings,
                embedding_lookup, opt):
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    train_left_trees = train_dataloader.left_trees
    train_right_trees = train_dataloader.right_trees
    train_labels = train_dataloader.labels

    val_left_trees = val_dataloader.left_trees
    val_right_trees = val_dataloader.right_trees
    val_labels = val_dataloader.labels

    n_classes = 2
    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()

    (left_nodes_node, left_children_node, right_nodes_node,
     right_children_node, logits_node, left_mask_nodes, right_mask_nodes,
     attention_matrix_nodes) = network.init_net_for_siamese_3(
         num_feats, opt.feature_size)

    out_node = network.out_layer(logits_node)
    labels_node, loss_node = network.loss_layer(logits_node, n_classes)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0
    print("Begin training....")
    temp_accuracy = 0.0
    for epoch in range(1, epochs + 1):
        for batch_left_trees, batch_right_trees, batch_labels in \
                sampling.batch_random_samples_2_sides(
                    train_left_trees, train_right_trees, train_labels,
                    embeddings, embedding_lookup, opt.train_batch_size,
                    opt.batch_type):
            left_nodes, left_children, left_masks = batch_left_trees
            right_nodes, right_children, right_masks = batch_right_trees
            labels_one_hot = convert_labels_to_one_hot(batch_labels)

            _, err, left_nodes_out, out = sess.run(
                [train_step, loss_node, left_nodes_node, out_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: labels_one_hot,
                    left_mask_nodes: left_masks,
                    right_mask_nodes: right_masks,
                })
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "Val Accuracy:", temp_accuracy)
            print(left_nodes_out.shape)

            if steps % CHECKPOINT_EVERY == 0:
                print("Checkpoint, validating.....")
                correct_labels = []
                predictions = []
                for batch_left_trees, batch_right_trees, batch_labels in \
                        sampling.batch_random_samples_2_sides(
                            val_left_trees, val_right_trees, val_labels,
                            embeddings, embedding_lookup,
                            opt.train_batch_size, opt.batch_type):
                    left_nodes, left_children, left_masks = batch_left_trees
                    right_nodes, right_children, right_masks = batch_right_trees
                    labels_one_hot = convert_labels_to_one_hot(batch_labels)
                    output = sess.run(
                        [out_node],
                        feed_dict={
                            left_nodes_node: left_nodes,
                            left_children_node: left_children,
                            right_nodes_node: right_nodes,
                            right_children_node: right_children,
                            labels_node: labels_one_hot,
                            left_mask_nodes: left_masks,
                            right_mask_nodes: right_masks,
                        })
                    correct = np.argmax(labels_one_hot, axis=1)
                    predicted = np.argmax(output[0], axis=1)
                    correct_labels.extend(correct)
                    predictions.extend(predicted)

                accuracy = float(accuracy_score(correct_labels, predictions))
                precision = float(precision_score(correct_labels, predictions))
                recall = float(recall_score(correct_labels, predictions))
                print('Accuracy:', accuracy)
                print(classification_report(correct_labels, predictions))
                print(confusion_matrix(correct_labels, predictions))
                if accuracy > temp_accuracy:
                    temp_accuracy = accuracy
                    with open("no_tbcnn_validation.txt", "w") as f:
                        f.write(str(temp_accuracy))
                    # Save state so we can resume later.
                    saver.save(sess, checkfile, steps)
                    print('Checkpoint saved.')
            steps += 1
        steps = 0
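# `convert_labels_to_one_hot` is defined elsewhere in the project; this
# minimal sketch (an assumption about its behaviour, not the original code)
# shows the shape the training and validation loops expect: a list of 0/1
# similarity labels mapped to rows of a (batch_size, 2) one-hot matrix for
# the 2-class loss.
import numpy as np

def convert_labels_to_one_hot_sketch(batch_labels, n_classes=2):
    """Hypothetical stand-in returning a float32 one-hot array."""
    one_hot = np.zeros((len(batch_labels), n_classes), dtype=np.float32)
    for row, label in enumerate(batch_labels):
        one_hot[row, int(label)] = 1.0
    return one_hot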
def train_model(infile, embeddings):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    num_feats = len(getWordEmd('ForStatement'))

    nodes_node1, children_node1, nodes_node2, children_node2, res = \
        network.init_net_nofinetune(num_feats)
    labels_node, loss_node = network.loss_layer(res)

    optimizer = tf.train.GradientDescentOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # Parse every file listed in flistBCB.txt once and cache the trees;
    # missing files and trees that are too large or too small are skipped.
    dictt = {}
    listrec = []
    f = open("flistBCB.txt", 'r')
    line = f.readline().rstrip('\t')
    l = line.split('\t')
    z = 0
    for ll in l:
        if not os.path.exists(ll):
            listrec.append(ll)
            continue
        faa = open(ll, 'r', encoding="utf-8")
        fff = faa.read()
        tree = javalang.parse.parse_member_signature(fff)
        sample, size = _traverse_treewithid(tree)
        if size > 3000 or size < 10:
            z += 1
            listrec.append(ll)
            continue
        dictt[ll] = sample
    f.close()

    for epoch in range(1, EPOCHS + 1):
        f = open(infile, 'r')
        line = "123"
        k = 0
        while line:
            line = f.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            batch_labels = []
            nodes1, children1, nodes2, children2, la = getData_nofinetune(
                l, dictt, embeddings)
            batch_labels.append(la)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                    labels_node: batch_labels
                })
            maxnodes = max(len(nodes1[0]), len(nodes2[0]))
            if k % 1000 == 0:
                print('Epoch:', epoch, 'Step:', k, 'Loss:', err, 'R:', r,
                      'Max nodes:', maxnodes)
        f.close()

        # Sweep 15 candidate thresholds (-0.7 to 0.7 in 0.1 steps) over the
        # dev set and keep the one with the best F1 for scoring the test set.
        correct_labels_dev = []
        predictions_dev = []
        for reci in range(0, 15):
            predictions_dev.append([])
        ff = open("./datasetForVariantsTBCCD/BCB/devdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            batch_labels = []
            nodes1, children1, nodes2, children2, la = getData_nofinetune(
                l, dictt, embeddings)
            batch_labels.append(la)
            k += 1
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_dev.append(int(batch_labels[0]))
            threshold = -0.7
            for i in range(0, 15):
                if output[0] >= threshold:
                    predictions_dev[i].append(1)
                else:
                    predictions_dev[i].append(-1)
                threshold += 0.1
        ff.close()

        maxstep = 0
        maxf1value = 0
        for i in range(0, 15):
            f1score = f1_score(correct_labels_dev, predictions_dev[i],
                               average='binary')
            if f1score > maxf1value:
                maxf1value = f1score
                maxstep = i

        correct_labels_test = []
        predictions_test = []
        ff = open("./datasetForVariantsTBCCD/BCB/testdata.txt", 'r')
        line = "123"
        k = 0
        print("starttest:")
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            batch_labels = []
            nodes1, children1, nodes2, children2, la = getData_nofinetune(
                l, dictt, embeddings)
            batch_labels.append(la)
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_test.append(int(batch_labels[0]))
            best_threshold = -0.7 + maxstep * 0.1
            if output[0] >= best_threshold:
                predictions_test.append(1)
            else:
                predictions_test.append(-1)
        ff.close()

        print("testdata\n")
        print("threshold:")
        print(best_threshold)
        p = precision_score(correct_labels_test, predictions_test,
                            average='binary')
        r = recall_score(correct_labels_test, predictions_test,
                         average='binary')
        f1score = f1_score(correct_labels_test, predictions_test,
                           average='binary')
        print("recall_test:" + str(r))
        print("precision_test:" + str(p))
        print("f1score_test:" + str(f1score))
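# The dev-set sweep above hard-codes 15 thresholds from -0.7 upward in 0.1
# steps. A hypothetical standalone helper (an illustration, not part of the
# original code) makes the same selection logic easier to test in isolation:
from sklearn.metrics import f1_score

def select_best_threshold_sketch(scores, labels, start=-0.7, step=0.1, n=15):
    """Return the threshold (and its F1) that maximises binary F1 when
    scores >= threshold are predicted 1 and the rest -1."""
    best_threshold, best_f1 = start, 0.0
    for i in range(n):
        threshold = start + i * step
        preds = [1 if s >= threshold else -1 for s in scores]
        f1 = f1_score(labels, preds, average='binary')
        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold
    return best_threshold, best_f1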
def train_model(train_trees, val_trees, labels, embedding_lookup, opt):
    max_acc = 0.0
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter

    random.shuffle(train_trees)

    nodes_node, children_node, codecaps_node = network.init_net_treecaps(
        50, embedding_lookup, len(labels))
    codecaps_node = tf.identity(codecaps_node, name="codecaps_node")

    out_node = network.out_layer(codecaps_node)
    labels_node, loss_node = network.loss_layer(codecaps_node, len(labels))

    optimizer = RAdamOptimizer(opt.lr)
    train_point = optimizer.minimize(loss_node)

    # Init the graph.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'tree_network.ckpt')
    print("Begin training..........")
    num_batches = len(train_trees) // batch_size + (
        1 if len(train_trees) % batch_size != 0 else 0)
    for epoch in range(1, epochs + 1):
        for train_step, train_batch in enumerate(sampling.batch_samples(
                sampling.gen_samples(train_trees, labels), batch_size)):
            nodes, children, batch_labels = train_batch
            if not nodes:
                continue
            _, err, out = sess.run(
                [train_point, loss_node, out_node],
                feed_dict={
                    nodes_node: nodes,
                    children_node: children,
                    labels_node: batch_labels
                })
            print("Epoch : ", str(epoch), "Step : ", train_step,
                  "Loss : ", err, "Max Acc: ", max_acc)

            if train_step % 1000 == 0 and train_step > 0:
                correct_labels = []
                predictions = []
                for test_batch in sampling.batch_samples(
                        sampling.gen_samples(val_trees, labels), batch_size):
                    print("---------------")
                    nodes, children, batch_labels = test_batch
                    print(batch_labels)
                    output = sess.run(
                        [out_node],
                        feed_dict={
                            nodes_node: nodes,
                            children_node: children
                        })
                    batch_correct_labels = np.argmax(batch_labels, axis=1)
                    batch_predictions = np.argmax(output[0], axis=1)
                    correct_labels.extend(batch_correct_labels)
                    predictions.extend(batch_predictions)
                    print(batch_correct_labels)
                    print(batch_predictions)

                acc = accuracy_score(correct_labels, predictions)
                if acc > max_acc:
                    max_acc = acc
                    saver.save(sess, checkfile)
                    print("Saved checkpoint....")
                print('Epoch', str(epoch), 'Accuracy:', acc,
                      'Max Acc: ', max_acc)
                # csv_log is assumed to be a log file handle opened elsewhere.
                csv_log.write(str(epoch) + ',' + str(acc) + ',' +
                              str(max_acc) + '\n')

    print("Finish all iters, storing the whole model..........")
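# A minimal sketch of how this TreeCaps trainer might be driven from the
# command line. The option names mirror the attributes read above
# (opt.model_path, opt.train_batch_size, opt.niter, opt.lr); the defaults and
# the entry-point wiring are assumptions for illustration, not the project's
# actual CLI.
import argparse

def parse_opt_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='model/',
                        help='directory for checkpoints')
    parser.add_argument('--train_batch_size', type=int, default=10)
    parser.add_argument('--niter', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='learning rate for RAdam')
    return parser.parse_args()

# Example invocation (trees, labels, and the embedding lookup are assumed to
# be produced by the project's own data-loading pipeline):
# train_model(train_trees, val_trees, labels, embedding_lookup,
#             parse_opt_sketch())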