Example #1
def making_prediction(test_dataset, ggnn, sess, opt, original=False):

    # For debugging purposes
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

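    # ThreadedIterator prefetches minibatches on a background thread (queue of up to 5 batches)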
    batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(),
                                      max_queue_size=5)

    correct_labels = []
    predictions = []

    attention_scores_data = []
    softmax_values_data = []
    print("--------------------------------------")
    print('Computing predictions on the test data...')

    for step, batch_data in enumerate(batch_iterator):
        # print(batch_data["labels"])

        print(batch_data['labels'])
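        # Forward pass only: fetch the softmax outputs and attention scores for this batch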
        softmax_values_data, attention_scores_data = sess.run(
            [softmax_values, attention_scores],
            feed_dict={
                ggnn.placeholders["initial_node_representation"]:
                batch_data["initial_representations"],
                ggnn.placeholders["num_vertices"]:
                batch_data["num_vertices"],
                ggnn.placeholders["adjacency_matrix"]:
                batch_data['adjacency_matrix'],
                ggnn.placeholders["labels"]:
                batch_data['labels']
            })

        correct_labels.extend(np.argmax(batch_data['labels'], axis=1))
        argmax = np.argmax(softmax_values_data, axis=1)
        predictions.extend(argmax)

        print("Probability : " + str(softmax_values_data))
        print("Probability max : " +
              str(np.argmax(softmax_values_data, axis=1)))
        print("Correct class " + str(correct_labels))
        print("Predicted class : " + str(predictions))

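    # The test set here corresponds to a single program (opt.test_file), so only the last batch's attention scores are used below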
    scaled_attention_scores_path, raw_attention_scores_path, raw_attention_scores_dict = generate_attention_scores(
        opt.test_file, attention_scores_data[0])
    prediction_results = {}
    prediction_results[
        "scaled_attention_scores_path"] = scaled_attention_scores_path
    prediction_results["raw_attention_scores_path"] = raw_attention_scores_path
    prediction_results["raw_attention_scores_dict"] = raw_attention_scores_dict
    prediction_results["softmax_values_data"] = softmax_values_data
    prediction_results["predicted_label"] = argmax[0]
    prediction_results["correct_label"] = np.argmax(batch_data['labels'],
                                                    axis=1)

    return prediction_results
Example #2
def making_prediction(test_dataset, ggnn, sess, opt):

    # For debugging purposes
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

    batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(), max_queue_size=5)

    correct_labels = []
    predictions = []
    print("--------------------------------------")
    print('Computing predictions on the test data...')

    for step, batch_data in enumerate(batch_iterator):
        # print(batch_data["labels"])

        print(batch_data['labels'])
        softmax_values_data, attention_scores_data = sess.run(
            [softmax_values, attention_scores],
            feed_dict={
                ggnn.placeholders["initial_node_representation"]: batch_data["initial_representations"],
                ggnn.placeholders["num_vertices"]: batch_data["num_vertices"],
                ggnn.placeholders["adjacency_matrix"]:  batch_data['adjacency_matrix'],
                ggnn.placeholders["labels"]:  batch_data['labels']
            }
        )

        
        # print(attention_scores_data)
        # print(len(attention_scores_data[0]))
        
        correct_labels.extend(np.argmax(batch_data['labels'], axis=1))
        argmax = np.argmax(softmax_values_data, axis=1)
        predictions.extend(argmax)

    print("Probability : " + str(softmax_values_data))
    print("Probability max : " + str(np.argmax(softmax_values_data,axis=1)))
    print("Correct class " + str(correct_labels[0]))
    print("Predicted class : " + str(predictions[0]))

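    # Turn the raw attention scores into files, extract subtrees from them, and render a visualization based on opt.pb_path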
    attention_path, raw_attention_score_dict = generate_attention_scores(opt, attention_scores_data[0])

    generate_subtree(opt, opt.stmt_ids_path, raw_attention_score_dict)

    print(attention_path)
    print(opt.pb_path)
    generate_visualization(opt.pb_path,attention_path)

    return softmax_values_data, argmax, str(correct_labels[0]), str(predictions[0])
Example #3
def making_prediction(graph_path, opt, ggnn, sess):

    # For debugging purposes
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores
    opt.test_graph_path = graph_path
   
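    # Build a dataset containing just the graph at graph_path (the boolean flags presumably select test mode)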
    test_dataset = MonoLanguageProgramData(opt, False, False, True)

    batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(), max_queue_size=5)

  
    for step, batch_data in enumerate(batch_iterator):
        # print(batch_data["labels"])

        print(batch_data['labels'])
        softmax_values_data, attention_scores_data = sess.run(
            [softmax_values, attention_scores],
            feed_dict={
                ggnn.placeholders["initial_node_representation"]: batch_data["initial_representations"],
                ggnn.placeholders["num_vertices"]: batch_data["num_vertices"],
                ggnn.placeholders["adjacency_matrix"]:  batch_data['adjacency_matrix'],
                ggnn.placeholders["labels"]:  batch_data['labels']
            }
        )


        predicted_label = np.argmax(softmax_values_data, axis=1)

        print("Probability : " + str(softmax_values_data))
        print("Probability max : " + str(np.argmax(softmax_values_data, axis=1)))

    return predicted_label[0]
Example #4
def main(opt):
    with open(opt.pretrained_embeddings_url, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh,encoding='latin1')

        opt.pretrained_embeddings = embeddings
        opt.pretrained_embed_lookup = embed_lookup
    print("Finished loading pretrained embeddings......")
    
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')   
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    
    test_dataset = MonoLanguageProgramData(opt, False, True)
    opt.n_edge_types = test_dataset.n_edge_types

    print("Num edge types : " + str(opt.n_edge_types))
    ggnn = DenseGGNNModel(opt)

    # For debugging purposes
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init)

        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        correct_labels = []
        predictions = []
        print('Computing test accuracy...')
      
        batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(), max_queue_size=5)
        for step, batch_data in enumerate(batch_iterator):
            # print(batch_data["labels"])

            softmax_values_data = sess.run(
                [softmax_values],
                feed_dict={
                    ggnn.placeholders["initial_node_representation"]: batch_data["initial_representations"],
                    ggnn.placeholders["num_vertices"]: batch_data["num_vertices"],
                    ggnn.placeholders["adjacency_matrix"]:  batch_data['adjacency_matrix'],
                    ggnn.placeholders["labels"]:  batch_data['labels']
                }
            )

            correct_labels.extend(np.argmax(batch_data['labels'],axis=1))
            predictions.extend(np.argmax(softmax_values_data[0],axis=1))

        print("Num target : " + str(len(correct_labels)))
        # print(correct_labels)
        # print(predictions)
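        # Summarize performance on the test set: overall accuracy, per-class report, and confusion matrix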
        target_names = [str(i) for i in range(1,11)]
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions, target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--workers',
                        type=int,
                        help='number of data loading workers',
                        default=2)
    parser.add_argument('--train_batch_size',
                        type=int,
                        default=10,
                        help='input batch size')
    parser.add_argument('--test_batch_size',
                        type=int,
                        default=5,
                        help='input batch size')
    parser.add_argument('--state_dim',
                        type=int,
                        default=30,
                        help='GGNN hidden state dimension size')
    parser.add_argument('--node_dim',
                        type=int,
                        default=100,
                        help='node dimension size')
    parser.add_argument('--hidden_layer_size',
                        type=int,
                        default=200,
                        help='size of hidden layer')
    parser.add_argument('--num_hidden_layer',
                        type=int,
                        default=1,
                        help='number of hidden layers')
    parser.add_argument('--n_steps',
                        type=int,
                        default=10,
                        help='number of propagation steps of the GGNN')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate')
    parser.add_argument('--cuda', action='store_true', help='enables cuda')
    parser.add_argument('--verbal',
                        type=bool,
                        default=True,
                        help='print training info or not')
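    # Note: argparse's type=bool converts any non-empty string to True, so '--verbal False' still yields True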
    parser.add_argument('--manualSeed', type=int, help='manual seed')
    parser.add_argument(
        '--test_file',
        default="program_data/test_data/5/100_dead_code_1.java",
        help="test program")
    parser.add_argument('--n_classes',
                        type=int,
                        default=10,
                        help='number of classes')
    parser.add_argument('--path',
                        default="program_data/github_java_sort_function_babi",
                        help='program data')
    parser.add_argument('--model_path',
                        default="model",
                        help='path to save the model')
    parser.add_argument('--n_hidden',
                        type=int,
                        default=50,
                        help='number of hidden layers')
    parser.add_argument('--size_vocabulary',
                        type=int,
                        default=59,
                        help='maximum number of node types')
    parser.add_argument('--log_path',
                        default="logs/",
                        help='log path for tensorboard')
    parser.add_argument(
        '--aggregation',
        type=int,
        default=1,
        choices=range(0, 4),
        help=
        '0 for max pooling, 1 for attention with sum pooling, 2 for attention with max pooling, 3 for attention with average pooling'
    )
    parser.add_argument('--distributed_function',
                        type=int,
                        default=0,
                        choices=range(0, 2),
                        help='0 for softmax, 1 for sigmoid')
    parser.add_argument(
        '--pretrained_embeddings_url',
        default="embedding/fast_pretrained_vectors.pkl.gz",
        help=
        'pretrained embeddings url, there are 2 objects in this file, the first object is the embedding matrix, the other is the lookup dictionary'
    )
    parser.add_argument('argv', nargs="+", help='filenames')
    opt = parser.parse_args()
    print(opt)

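    # The checkpoint directory name encodes the pooling/activation choice and the main hyperparameters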
    opt.model_path = os.path.join(
        opt.model_path, "sum_softmax" + "_hidden_layer_size_" +
        str(opt.hidden_layer_size) + "_num_hidden_layer_" +
        str(opt.num_hidden_layer)) + "_node_dim_" + str(opt.node_dim)

    if len(opt.argv) == 1:
        opt.test_file = opt.argv[0]
    # Warn if the model path folder does not exist
    if not os.path.exists(opt.model_path):
        print("Cannot find path : " + opt.model_path)

    generate_files(opt, opt.test_file)

    # if not os.path.exists(opt.pretrained_embeddings_url):
    #     fetch_data_from_github(opt.pretrained_embeddings_url)
    with gzip.open(opt.pretrained_embeddings_url, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')
        opt.pretrained_embeddings = embeddings
        opt.pretrained_embed_lookup = embed_lookup

    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    # for f in ['checkpoint', 'cnn_tree.ckpt.index', 'cnn_tree.ckpt.meta', 'cnn_tree.ckpt.data-00000-of-00001']:
    #     filename = os.path.join(opt.model_path, f)
    #     if not os.path.exists(filename):
    #         fetch_data_from_github(filename)

    ckpt = tf.train.get_checkpoint_state(opt.model_path)

    test_dataset = MonoLanguageProgramData(opt, False, False, True)
    # opt.n_edge_types = test_dataset.n_edge_types
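    # The number of edge types is hard-coded here, presumably to match the pretrained checkpoint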
    opt.n_edge_types = 7

    ggnn = DenseGGNNModel(opt)

    # For debugging purposes
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        correct_labels = []
        predictions = []
        print('Computing prediction for the test program...')

        batch_iterator = ThreadedIterator(
            test_dataset.make_minibatch_iterator(), max_queue_size=5)
        for step, batch_data in enumerate(batch_iterator):
            # print(batch_data["labels"])

            softmax_values_data, attention_scores_data = sess.run(
                [softmax_values, attention_scores],
                feed_dict={
                    ggnn.placeholders["initial_node_representation"]:
                    batch_data["initial_representations"],
                    ggnn.placeholders["num_vertices"]:
                    batch_data["num_vertices"],
                    ggnn.placeholders["adjacency_matrix"]:
                    batch_data['adjacency_matrix'],
                    ggnn.placeholders["labels"]:
                    batch_data['labels']
                })

            print(softmax_values_data)
            # print(attention_scores_data)
            # print(len(attention_scores_data[0]))

            correct_labels.extend(np.argmax(batch_data['labels'], axis=1))
            predictions.extend(np.argmax(softmax_values_data, axis=1))

        print("Num target : " + str(len(correct_labels)))
        print("True label : " + str(correct_labels[0]))
        print("Predicted label : " + str(predictions[0]))

        attention_path = generate_attention_scores(opt,
                                                   attention_scores_data[0])
        print(attention_path)
        print(opt.pb_path)
        generate_visualization(opt.pb_path, attention_path)
Example #6
def main(opt):

    opt.model_path = os.path.join(opt.model_path, form_model_path(opt))
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    print("The model path : " + str(checkfile))
    print("Loss : " + str(opt.loss))
    if ckpt and ckpt.model_checkpoint_path:
        print("Continue training with old model : " + str(checkfile))

    print("Loading vocabs.........")
    node_type_lookup, node_token_lookup, subtree_lookup = load_vocabs(opt)

    opt.node_type_lookup = node_type_lookup
    opt.node_token_lookup = node_token_lookup
    opt.subtree_lookup = subtree_lookup

    if opt.task == 1:
        train_dataset = CodeClassificationData(opt, True, False, False)

    if opt.task == 0:
        val_opt = copy.deepcopy(opt)
        val_opt.node_token_lookup = node_token_lookup
        validation_dataset = CodeClassificationData(val_opt, False, False,
                                                    True)

    print("Initializing tree caps model...........")
    corder = CorderModel(opt)
    print("Finished initializing corder model...........")

    loss_node = corder.loss
    optimizer = RAdamOptimizer(opt.lr)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)

    init = tf.global_variables_initializer()

    # best_f1_score = get_best_f1_score(opt)
    # print("Best f1 score : " + str(best_f1_score))

    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        if opt.task == 1:
            for epoch in range(1, opt.epochs + 1):
                train_batch_iterator = ThreadedIterator(
                    train_dataset.make_minibatch_iterator(),
                    max_queue_size=opt.worker)
                train_accs = []
                for train_step, train_batch_data in enumerate(
                        train_batch_iterator):
                    print("--------------------------")

                    # print(train_batch_data["batch_subtrees_ids"])
                    logging.info(str(train_batch_data["batch_subtree_id"]))
                    _, err = sess.run(
                        [training_point, corder.loss],
                        feed_dict={
                            corder.placeholders["node_types"]:
                            train_batch_data["batch_node_types"],
                            corder.placeholders["node_tokens"]:
                            train_batch_data["batch_node_tokens"],
                            corder.placeholders["children_indices"]:
                            train_batch_data["batch_children_indices"],
                            corder.placeholders["children_node_types"]:
                            train_batch_data["batch_children_node_types"],
                            corder.placeholders["children_node_tokens"]:
                            train_batch_data["batch_children_node_tokens"],
                            corder.placeholders["labels"]:
                            train_batch_data["batch_subtree_id"],
                            corder.placeholders["dropout_rate"]:
                            0.3
                        })

                    logging.info("Training at epoch " + str(epoch) +
                                 " and step " + str(train_step) +
                                 " with loss " + str(err))
                    print("Epoch:", epoch, "Step:", train_step,
                          "Training loss:", err)
                    if train_step % opt.checkpoint_every == 0 and train_step > 0:
                        saver.save(sess, checkfile)
                        print('Checkpoint saved, epoch:' + str(epoch) +
                              ', step: ' + str(train_step) + ', loss: ' +
                              str(err) + '.')

        if opt.task == 0:
            validation_batch_iterator = ThreadedIterator(
                validation_dataset.make_minibatch_iterator(),
                max_queue_size=opt.worker)

            for val_step, val_batch_data in enumerate(
                    validation_batch_iterator):
                scores = sess.run(
                    [corder.code_vector],
                    feed_dict={
                        corder.placeholders["node_types"]:
                        val_batch_data["batch_node_types"],
                        corder.placeholders["node_tokens"]:
                        val_batch_data["batch_node_tokens"],
                        corder.placeholders["children_indices"]:
                        val_batch_data["batch_children_indices"],
                        corder.placeholders["children_node_types"]:
                        val_batch_data["batch_children_node_types"],
                        corder.placeholders["children_node_tokens"]:
                        val_batch_data["batch_children_node_tokens"],
                        corder.placeholders["dropout_rate"]:
                        0.0
                    })

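                # Append one CSV line per sample: the file path followed by its space-separated code vector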
                for i, vector in enumerate(scores[0]):
                    file_name = "analysis/rosetta_sampled_softmax_train.csv"
                    with open(file_name, "a") as f:
                        vector_score = []
                        for score in vector:
                            vector_score.append(str(score))
                        # print(val_batch_data["batch_file_path"])
                        line = str(val_batch_data["batch_file_path"]
                                   [i]) + "," + " ".join(vector_score)
                        f.write(line)
                        f.write("\n")
Example #7
def main(opt):
    with open(opt.pretrained_embeddings_url, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh,encoding='latin1')

        opt.pretrained_embeddings = embeddings
        opt.pretrained_embed_lookup = embed_lookup

    
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')   
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    
    train_dataset = MonoLanguageProgramData(opt, True, False)
    test_dataset = MonoLanguageProgramData(opt, False, True)

    opt.n_edge_types = train_dataset.n_edge_types

    ggnn = DenseGGNNModel(opt)

    # For debugging purposes
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores

    loss_node = ggnn.loss

    optimizer = tf.train.AdamOptimizer(opt.lr)
    training_point = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()
    # with open("model_selection.txt","r") as f:

    with tf.Session() as sess:
        sess.run(init)

        print("List of available devices..........")
        print(tf.test.gpu_device_name())

        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        best_accuracy = opt.best_accuracy
        for epoch in range(1,  opt.epochs + 1):
            train_batch_iterator = ThreadedIterator(train_dataset.make_minibatch_iterator(), max_queue_size=5)


            for train_step, train_batch_data in enumerate(train_batch_iterator):
                # print(batch_data["labels"])

                _ , err, softmax_values_data, attention_scores_data = sess.run(
                    [training_point, loss_node, softmax_values, attention_scores],
                    feed_dict={
                        ggnn.placeholders["initial_node_representation"]: train_batch_data["initial_representations"],
                        ggnn.placeholders["num_vertices"]: train_batch_data["num_vertices"],
                        ggnn.placeholders["adjacency_matrix"]:  train_batch_data['adjacency_matrix'],
                        ggnn.placeholders["labels"]:  train_batch_data['labels']
                    }
                )

                print("Epoch:", epoch, "Step:",train_step,"Loss:", err, "Best Accuracy:", best_accuracy)

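                # Periodically evaluate on the test set; the checkpoint is saved only when accuracy improves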
                if train_step % opt.checkpoint_every == 0:
                    
                    # saver.save(sess, checkfile)                  
                    # print('Checkpoint saved, epoch:' + str(epoch) + ', step: ' + str(step) + ', loss: ' + str(err) + '.')



                    # Validating
                    #--------------------------------------
                    print("Validating.......")
                    correct_labels = []
                    predictions = []
                    test_batch_iterator = ThreadedIterator(test_dataset.make_minibatch_iterator(), max_queue_size=5)
                   
                    for test_step, test_batch_data in enumerate(test_batch_iterator):
                        

                        softmax_values_data = sess.run(
                            [softmax_values],
                            feed_dict={
                                ggnn.placeholders["initial_node_representation"]: test_batch_data["initial_representations"],
                                ggnn.placeholders["num_vertices"]: test_batch_data["num_vertices"],
                                ggnn.placeholders["adjacency_matrix"]:  test_batch_data['adjacency_matrix'],
                                ggnn.placeholders["labels"]:  test_batch_data['labels']
                            }
                        )
                        correct_labels.extend(np.argmax(test_batch_data['labels'],axis=1))
                        predictions.extend(np.argmax(softmax_values_data[0],axis=1))

                    print("Num target : " + str(len(correct_labels)))
                    print(correct_labels)
                    print(predictions)
                    target_names = [str(i) for i in range(1,11)]
                    accuracy = float(accuracy_score(correct_labels, predictions))
                    print('Accuracy:', accuracy)
                    print(classification_report(correct_labels, predictions, target_names=target_names))
                    print(confusion_matrix(correct_labels, predictions))

                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        saver.save(sess, checkfile)                  
                        print('Checkpoint saved, epoch:' + str(epoch) + ', step: ' + str(train_step) + ', loss: ' + str(err) + '.')
Example #8
def main(opt):
    
    opt.model_path = os.path.join(opt.model_path, form_model_path(opt))
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    if ckpt and ckpt.model_checkpoint_path:
        print("Continue training with old model : " + str(checkfile))

    print("Loading vocabs.........")
    train_label_lookup, node_type_lookup, node_token_lookup, val_label_lookup = load_vocabs(opt)

    opt.label_lookup = train_label_lookup
    opt.label_size = len(train_label_lookup.keys())
    opt.node_type_lookup = node_type_lookup
    opt.node_token_lookup = node_token_lookup

    if opt.task == 1:
        train_dataset = MethodNamePredictionData(opt, opt.train_path, True, False, False)
    
    val_opt = copy.deepcopy(opt)
    val_opt.label_lookup = val_label_lookup
    val_opt.num_labels = len(val_label_lookup.keys())
    val_opt.node_token_lookup = node_token_lookup
    validation_dataset = MethodNamePredictionData(val_opt, opt.val_path, False, False, True)

    print("Initializing tree caps model...........")
    treecaps = TreeCapsModel(opt)
    # network.init_net_treecaps(30,30)
    print("Finished initializing tree caps model...........")

    code_caps = treecaps.code_caps
    loss_node = treecaps.loss
    softmax_values = treecaps.softmax_values
    logits = treecaps.logits
    optimizer = RAdamOptimizer(opt.lr)
    # optimizer = tf.compat.v1.train.AdamOptimizer(opt.lr)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_point = optimizer.minimize(loss_node)
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
  
    init = tf.global_variables_initializer()

    best_f1_score = get_best_f1_score(opt)
    print("Best f1 score : " + str(best_f1_score))


    num_caps_top_a = int(opt.num_conv*opt.output_size/opt.num_channel)*opt.top_a
    with tf.Session() as sess:
        sess.run(init)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

        validation_batch_iterator = ThreadedIterator(validation_dataset.make_minibatch_iterator(), max_queue_size=5)         
        # f1_scores_of_val_data = []
        all_predicted_labels = []
        all_ground_truth_labels = []

        for val_step, val_batch_data in enumerate(validation_batch_iterator):
            alpha_IJ_shape = (opt.batch_size, int(num_caps_top_a/opt.top_a*val_batch_data["batch_node_types"].shape[1]), num_caps_top_a)
            alpha_IJ = np.zeros(alpha_IJ_shape)
                        
            scores, alpha_IJ_scores= sess.run(
                [logits, treecaps.alpha_IJ],
                feed_dict={
                    treecaps.placeholders["node_types"]: val_batch_data["batch_node_types"],
                    treecaps.placeholders["node_tokens"]:  val_batch_data["batch_node_tokens"],
                    treecaps.placeholders["children_indices"]:  val_batch_data["batch_children_indices"],
                    treecaps.placeholders["children_node_types"]: val_batch_data["batch_children_node_types"],
                    treecaps.placeholders["children_node_tokens"]: val_batch_data["batch_children_node_tokens"],
                    treecaps.placeholders["labels"]: val_batch_data["batch_labels"],
                    treecaps.placeholders["alpha_IJ"]: alpha_IJ,
                    treecaps.placeholders["is_training"]: False
                }
            )
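            # Sum out the 8-dimensional capsule axes so each of the top_a capsules keeps one routing strength per node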
            alpha_IJ_scores = np.reshape(alpha_IJ_scores, (opt.batch_size, val_batch_data["batch_node_types"].shape[1], 8,  opt.top_a, 8))
            alpha_IJ_scores = np.sum(alpha_IJ_scores, axis=2)
            alpha_IJ_scores = np.sum(alpha_IJ_scores, axis=3)
            
            
            alpha_IJ_scores = np.squeeze(alpha_IJ_scores, axis=0)
            alpha_IJ_scores = np.transpose(alpha_IJ_scores)
            
            
            predictions = np.argmax(scores, axis=1)
        
            ground_truths = np.argmax(val_batch_data['batch_labels'], axis=1)
        
            predicted_labels = []
            for prediction in predictions:
                predicted_labels.append(train_label_lookup.inverse[prediction])

            ground_truth_labels = []
            for ground_truth in ground_truths:
                ground_truth_labels.append(
                    val_label_lookup.inverse[ground_truth])
          
            f1_score = evaluation.calculate_f1_scores(predicted_labels, ground_truth_labels)
            print(ground_truth_labels)
            print(predicted_labels)
            print("F1:", f1_score, "Step:", val_step)

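            # For batches with a non-zero F1 score, dump per-capsule node rankings and the prediction for later analysis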
            if f1_score > 0:
        
                node_types = val_batch_data["batch_node_types"][0]
                node_tokens_text = val_batch_data["batch_node_tokens_text"][0]
                node_indexes = val_batch_data["batch_node_indexes"][0]

                file_path = val_batch_data["batch_file_path"][0]
                file_path_splits = file_path.split("/")
                file_path_splits[1] = "java-small"
                file_path_splits[len(file_path_splits) - 1] = file_path_splits[len(file_path_splits) - 1].replace(".pkl",".java")
                file_path = "/".join(file_path_splits)
            
                analysis_folder = os.path.join("analysis", "_".join(file_path_splits[-2:]).replace(".java",""))
                try:
                    from pathlib import Path
                    Path(analysis_folder).mkdir(parents=True, exist_ok=True)
                except Exception as e:
                    print(e)

                count = 0
                for capsule in alpha_IJ_scores:
                    # print(val_batch_data["batch_node_indexes"])
                    
                    connection_strength = capsule                
                    all_tuples = []
                    for i, node_index in enumerate(node_indexes):
                        tuple_of_info = []
                        tuple_of_info.append(str(node_indexes[i]))
                        tuple_of_info.append(str(node_types[i]))              
                        tuple_of_info.append(str(connection_strength[i]))
                        tuple_of_info.append(node_tokens_text[i])
                    
                        tuple_of_info = tuple(tuple_of_info)
                        # print(tuple_of_info)

                        all_tuples.append(tuple_of_info)
                    
                    # Sort descending by connection strength; the strength was stored as a string, so convert it back to float for a numeric sort
                    all_tuples.sort(key=lambda x: float(x[2]), reverse=True)

                    with open(os.path.join(analysis_folder, "Group_" + str(count) + ".txt"), "w") as f:
                        for t in all_tuples:
                            line = ";".join(list(t))
                            f.write(line)
                            f.write("\n")

                    with open(os.path.join(analysis_folder, "result.txt"), "w") as f1:
                        f1.write("Predicted : " + str(predicted_labels[0]))
                        f1.write("\n")
                        f1.write("Ground truth : " + str(ground_truth_labels[0]))
                        f1.write("\n")
                    
                    import shutil
                    try:
                        print("Trying to copy original source file....")
                        shutil.copy(file_path, analysis_folder)
                    except Exception as e:
                        print(e)
                        
                    count += 1
Example #9
    def run_epoch(self, epoch_name: str, epoch_num, data, is_training: bool):
        loss = 0
        mean_edge_loss = 0
        mean_node_loss = 0
        mean_kl_loss = 0
        mean_qed_loss = 0
        node_loss_error = -10000000
        node_pred_error = 0
        start_time = time.time()
        processed_graphs = 0
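        # Teacher forcing is enabled for the first `num_teacher_forcing` training epochs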
        if is_training and self.params['num_teacher_forcing'] >= epoch_num:
            teacher_forcing = True
        else:
            teacher_forcing = False
        batch_iterator = ThreadedIterator(
            self.make_minibatch_iterator(data, is_training),
            max_queue_size=self.params['batch_size']
        )  # self.params['batch_size'])

        for step, batch_data in enumerate(batch_iterator):
            num_graphs = batch_data[self.placeholders['num_graphs']]
            processed_graphs += num_graphs
            batch_data[self.placeholders['is_generative']] = False
            batch_data[self.placeholders[
                'use_teacher_forcing_nodes']] = teacher_forcing
            batch_data[
                self.placeholders['z_prior']] = utils.generate_std_normal(
                    self.params['batch_size'],
                    batch_data[self.placeholders['num_vertices']],
                    self.params['hidden_size_encoder'])

            if is_training:
                batch_data[self.placeholders[
                    'out_layer_dropout_keep_prob']] = self.params[
                        'out_layer_dropout_keep_prob']
                fetch_list = [
                    self.ops['loss'], self.ops['train_step'],
                    self.ops["edge_loss"], self.ops['kl_loss'],
                    self.ops['node_symbol_prob'],
                    self.placeholders['node_symbols'],
                    self.ops['qed_computed_values'],
                    self.placeholders['target_values'],
                    self.ops['total_qed_loss'], self.ops['mean'],
                    self.ops['logvariance'], self.ops['grads'],
                    self.ops['mean_edge_loss'],
                    self.ops['mean_node_symbol_loss'],
                    self.ops['mean_kl_loss'], self.ops['mean_total_qed_loss'],
                    self.ops['grads2'], self.ops['node_loss_error'],
                    self.ops['node_pred_error']
                ]
            else:
                batch_data[
                    self.placeholders['out_layer_dropout_keep_prob']] = 1.0
                fetch_list = [
                    self.ops['loss'], self.ops['mean_edge_loss'],
                    self.ops['mean_node_symbol_loss'],
                    self.ops['mean_kl_loss'], self.ops['mean_total_qed_loss'],
                    self.ops['sampled_atoms'], self.ops['node_loss_error'],
                    self.ops['node_pred_error']
                ]
            result = self.sess.run(fetch_list, feed_dict=batch_data)
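            # The fetch list differs between training and evaluation, so the result indices below differ as well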
            batch_loss = result[0]
            loss += batch_loss * num_graphs
            if is_training:
                mean_edge_loss += result[12] * num_graphs
                mean_node_loss += result[13] * num_graphs
                mean_kl_loss += result[14] * num_graphs
                mean_qed_loss += result[15] * num_graphs
                node_loss_error = max(node_loss_error, np.max(result[17]))
                node_pred_error += result[18]
            else:
                mean_edge_loss += result[1] * num_graphs
                mean_node_loss += result[2] * num_graphs
                mean_kl_loss += result[3] * num_graphs
                mean_qed_loss += result[4] * num_graphs
                node_loss_error = max(node_loss_error, np.max(result[6]))
                node_pred_error += result[7]

            print(
                "Running %s, batch %i (has %i graphs). Total loss: %.4f. Edge loss: %.4f. Node loss: %.4f. KL loss: %.4f. Property loss: %.4f. Node error: %.4f. Node pred: %.4f."
                % (epoch_name, step, num_graphs, loss / processed_graphs,
                   mean_edge_loss / processed_graphs, mean_node_loss /
                   processed_graphs, mean_kl_loss / processed_graphs,
                   mean_qed_loss / processed_graphs, node_loss_error,
                   node_pred_error / processed_graphs),
                end='\r')

        mean_edge_loss /= processed_graphs
        mean_node_loss /= processed_graphs
        mean_kl_loss /= processed_graphs
        mean_qed_loss /= processed_graphs
        loss = loss / processed_graphs
        instance_per_sec = processed_graphs / (time.time() - start_time)
        return loss, mean_edge_loss, mean_node_loss, mean_kl_loss, mean_qed_loss, instance_per_sec
Example #10
def main(opt):
    from pathlib import Path
    mis_prediction_path = os.path.join("mis_prediction", opt.transformation)
    Path(mis_prediction_path).mkdir(parents=True, exist_ok=True)

    opt.model_path = os.path.join(opt.model_path, form_model_path(opt))
    checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(opt.model_path)
    if ckpt and ckpt.model_checkpoint_path:
        print("Continue training with old model : " + str(checkfile))

    train_label_lookup, node_type_lookup, node_token_lookup, val_label_lookup = load_vocabs(
        opt)

    opt.label_lookup = train_label_lookup
    opt.num_labels = len(train_label_lookup.keys())
    opt.node_type_lookup = node_type_lookup
    opt.node_token_lookup = node_token_lookup

    if opt.task == 1:
        train_dataset = MethodNamePredictionData(opt, opt.train_path, True,
                                                 False, False)

    val_opt = copy.deepcopy(opt)
    val_opt.label_lookup = val_label_lookup
    val_opt.num_labels = len(val_label_lookup.keys())
    val_opt.node_token_lookup = node_token_lookup
    validation_dataset = MethodNamePredictionData(val_opt, opt.val_path, False,
                                                  False, True)

    ggnn = DenseGGNNModel(opt)

    # For debugging purposes
    nodes_representation = ggnn.nodes_representation
    graph_representation = ggnn.graph_representation
    logits = ggnn.logits
    label_embeddings = ggnn.label_embeddings
    softmax_values = ggnn.softmax_values
    attention_scores = ggnn.attention_scores
    loss_node = ggnn.loss

    optimizer = tf.compat.v1.train.AdamOptimizer(opt.lr)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_point = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    init = tf.global_variables_initializer()

    best_f1_score = get_best_f1_score(opt)
    print("Best f1 score : " + str(best_f1_score))
    with tf.Session() as sess:
        sess.run(init)

        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

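            # Evaluation below only runs when a checkpoint was successfully restored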
            print("Testing model.............")
            average_f1 = 0.0
            validation_batch_iterator = ThreadedIterator(
                validation_dataset.make_minibatch_iterator(), max_queue_size=5)

            all_predicted_labels = []
            all_ground_truth_labels = []
            all_paths = []
            for val_step, val_batch_data in enumerate(
                    validation_batch_iterator):
                print("----------------------------------------")

                label_embeddings_matrix, scores = sess.run(
                    [label_embeddings, logits],
                    feed_dict={
                        ggnn.placeholders["num_vertices"]:
                        val_batch_data["num_vertices"],
                        ggnn.placeholders["adjacency_matrix"]:
                        val_batch_data['adjacency_matrix'],
                        ggnn.placeholders["node_type_indices"]:
                        val_batch_data["node_type_indices"],
                        ggnn.placeholders["node_token_indices"]:
                        val_batch_data["node_token_indices"],
                        ggnn.placeholders["is_training"]:
                        False
                    })

                predictions = np.argmax(scores, axis=1)
                ground_truths = np.argmax(val_batch_data['labels'], axis=1)

                predicted_labels = []
                for prediction in predictions:
                    predicted_labels.append(
                        train_label_lookup.inverse[prediction])

                ground_truth_labels = []
                for ground_truth in ground_truths:
                    ground_truth_labels.append(
                        val_label_lookup.inverse[ground_truth])

                # all_predicted_labels.extend(predicted_labels)
                # all_ground_truth_labels.extend(ground_truth_labels)

                for i, file_path in enumerate(val_batch_data["paths"]):
                    ground_truth = ground_truth_labels[i]
                    predicted = predicted_labels[i]

                    with open(mis_prediction_path, "a") as f10:
                        line = file_path + "," + ground_truth + "," + predicted
                        f10.write(line)
                        f10.write("\n")
Example #11
def main(opt):
	
	train_label_lookup = {}
	train_label_lookup_by_index = {}
	train_node_type_lookup = {}
	train_node_token_lookup = {}

	val_label_lookup = {}
	val_label_lookup_by_index = {}
	val_node_type_lookup = {}
	val_node_token_lookup = {}

	node_type_vocabulary_path = "preprocessed_data/node_type_vocab.txt"
	
	train_label_vocabulary_path = "preprocessed_data/train_label_vocab.txt"
	train_token_vocabulary_path = "preprocessed_data/train_token_vocab.txt"

	val_label_vocabulary_path = "preprocessed_data/val_label_vocab.txt"
	val_token_vocabulary_path = "preprocessed_data/val_token_vocab.txt"

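	# Each vocabulary file stores one "index,token" pair per line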
	with open(train_label_vocabulary_path, "r") as f1:
		data = f1.readlines()
		for line in data:
			splits = line.replace("\n","").split(",")
			train_label_lookup[splits[1]] = int(splits[0])
			train_label_lookup_by_index[int(splits[0])] = splits[1]

	with open(node_type_vocabulary_path, "r") as f2:
		data = f2.readlines()
		for line in data:
			splits = line.replace("\n","").split(",")
			train_node_type_lookup[splits[1]] = int(splits[0])

	with open(train_token_vocabulary_path, "r") as f3:
		data = f3.readlines()
		for line in data:
			splits = line.replace("\n","").split(",")
			train_node_token_lookup[splits[1]] = int(splits[0])

	with open(val_label_vocabulary_path, "r") as f4:
		data = f4.readlines()
		for line in data:
			splits = line.replace("\n","").split(",")
			val_label_lookup[splits[1]] = int(splits[0])
			val_label_lookup_by_index[int(splits[0])] = splits[1]

	with open(val_token_vocabulary_path, "r") as f5:
		data = f5.readlines()
		for line in data:
			splits = line.replace("\n","").split(",")
			val_node_token_lookup[splits[1]] = int(splits[0])

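	# Add a placeholder token to both token vocabularies (presumably used for out-of-vocabulary tokens)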
	train_node_token_lookup["captain_america"] = len(train_node_token_lookup.keys())
	val_node_token_lookup["captain_america"] = len(val_node_token_lookup.keys())

	checkfile = os.path.join(opt.model_path, 'cnn_tree.ckpt')
	ckpt = tf.train.get_checkpoint_state(opt.model_path)

	# print(train_label_lookup)
	opt.label_lookup = train_label_lookup
	opt.num_labels = len(train_label_lookup.keys())
	opt.node_type_lookup = train_node_type_lookup
	opt.node_token_lookup = train_node_token_lookup
	opt.path = "sample_data/java-small-graph/training"

	train_dataset = MethodNamePredictionData(opt, True, False, False)
	opt.n_edge_types = train_dataset.n_edge_types


	val_opt = copy.deepcopy(opt)
	val_opt.path = "sample_data/java-small-graph/validation"
	val_opt.label_lookup = val_label_lookup
	val_opt.num_labels = len(val_label_lookup.keys())
	val_opt.node_token_lookup = val_node_token_lookup
	validation_dataset = MethodNamePredictionData(val_opt, False, False, True)

	ggnn = DenseGGNNModel(opt)

	# For debugging purposes
	nodes_representation = ggnn.nodes_representation
	graph_representation = ggnn.graph_representation
	logits = ggnn.logits
	softmax_values = ggnn.softmax_values
	attention_scores = ggnn.attention_scores
	loss_node = ggnn.loss

	optimizer = tf.train.AdamOptimizer(opt.lr)
	training_point = optimizer.minimize(loss_node)

	saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
	init = tf.global_variables_initializer()

	with tf.Session() as sess:
		sess.run(init)

		if ckpt and ckpt.model_checkpoint_path:
			print("Continue training with old model")
			print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
			saver.restore(sess, ckpt.model_checkpoint_path)
			for i, var in enumerate(saver._var_list):
				print('Var {}: {}'.format(i, var))

		for epoch in range(1,  opt.epochs + 1):
			train_batch_iterator = ThreadedIterator(train_dataset.make_minibatch_iterator(), max_queue_size=1)
			for train_step, train_batch_data in enumerate(train_batch_iterator):	
				_ , err, softmax_values_data, attention_scores_data = sess.run(
					[training_point, loss_node, softmax_values, attention_scores],
					feed_dict={
						ggnn.placeholders["num_vertices"]: train_batch_data["num_vertices"],
						ggnn.placeholders["adjacency_matrix"]:  train_batch_data['adjacency_matrix'],
						ggnn.placeholders["labels"]:  train_batch_data['labels'],
						ggnn.placeholders["node_type_indices"]: train_batch_data["node_type_indices"],
						ggnn.placeholders["node_token_indices"]: train_batch_data["node_token_indices"],
						ggnn.placeholders["graph_state_keep_prob"]: 0.5,
						ggnn.placeholders["edge_weight_dropout_keep_prob"]: 0.5
					}
				)

				print("Epoch:", epoch, "Step:",train_step,"Loss:", err)

				if train_step % opt.checkpoint_every == 0:
					#--------------------------------------
					print("Validating.......")
					# predictions = []
					validation_batch_iterator = ThreadedIterator(validation_dataset.make_minibatch_iterator(), max_queue_size=5)

					for _, val_batch_data in enumerate(validation_batch_iterator):

						# Note: feeding ggnn.placeholders["labels"] with train_batch_data['labels'] looks odd, but it is a work-around: the number of labels in the train data differs from the validation data
						softmax_values_data = sess.run(
							[softmax_values],
							feed_dict={
								ggnn.placeholders["num_vertices"]: val_batch_data["num_vertices"],
								ggnn.placeholders["adjacency_matrix"]: val_batch_data['adjacency_matrix'],
								ggnn.placeholders["labels"]: train_batch_data['labels'],
								ggnn.placeholders["node_type_indices"]: val_batch_data["node_type_indices"],
								ggnn.placeholders["node_token_indices"]: val_batch_data["node_token_indices"],
								ggnn.placeholders["graph_state_keep_prob"]: 1.0,
								ggnn.placeholders["edge_weight_dropout_keep_prob"]: 1.0
							}
						)
						predictions = np.argmax(softmax_values_data[0], axis=1)
						ground_truths = np.argmax(val_batch_data['labels'], axis=1)
						# print(ground_truths)

						predicted_labels = []
						for prediction in predictions:
							predicted_labels.append(train_label_lookup_by_index[prediction])

						ground_truth_labels = []
						for ground_truth in ground_truths:
							ground_truth_labels.append(val_label_lookup_by_index[ground_truth])

						predicted_labels = transform_data(predicted_labels)
						ground_truth_labels = transform_data(ground_truth_labels)
						print("----------")
						print("Predicted: " + str(predicted_labels))
						print("Ground truth: " + str(ground_truth_labels))