Example #1
def test_model(logdir, inputs, left_embedfile, right_embedfile, epochs=EPOCHS):
    """Train a classifier to label ASTs"""

    n_classess = 2
    # left_algo_labels = ['mergesort', 'linkedlist', 'quicksort', 'bfs', 'bubblesort', 'knapsack']
    # right_algo_labels = ['mergesort', 'linkedlist', 'quicksort', 'bfs', 'bubblesort', 'knapsack']
    left_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]
    right_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]
    # with open(left_inputs, 'rb') as fh:
    #     _, left_trees, left_algo_labels = pickle.load(fh)

    # with open(right_inputs, 'rb') as fh:
    #     _, right_trees, right_algo_labels = pickle.load(fh)
    with open(inputs, "rb") as fh:
        testing_pairs = pickle.load(fh)
    print "Loading embdding vectors...."
    with open(left_embedfile, 'rb') as fh:
        left_embeddings, left_embed_lookup = pickle.load(fh)

    with open(right_embedfile, 'rb') as fh:
        right_embeddings, right_embed_lookup = pickle.load(fh)

    num_feats = len(left_embeddings[0])

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)

    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)

    hidden_node = network.hidden_layer(merge_node, 200, 200)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 200, 200)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 200, n_classess)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # tf.summary.scalar('loss', loss_node)

    ### init the graph
    sess = tf.Session()  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('Checkpoint not found.')

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    left_trees, right_trees = get_trees_from_pairs(testing_pairs)

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        with open("/input/config.json", 'r') as file_handler:
            contents = json.load(file_handler)
        using_vector_lookup_left = contents[
            'using_vector_lookup_left'] == "false"

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')
    for left_gen_batch, right_gen_batch in sampling.batch_random_samples_2_sides(
            left_trees, left_algo_labels, right_trees, right_algo_labels,
            left_embeddings, left_embed_lookup, right_embeddings,
            right_embed_lookup, using_vector_lookup_left, False,
            TEST_BATCH_SIZE):
        left_nodes, left_children, left_labels_one_hot, left_labels = left_gen_batch

        right_nodes, right_children, right_labels_one_hot, right_labels = right_gen_batch
        sim_labels, _ = get_one_hot_similarity_label(left_labels, right_labels)
        print("sim labels : " + str(sim_labels))
        output = sess.run(
            [out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: sim_labels
            })
        correct = np.argmax(sim_labels[0])
        predicted = np.argmax(output[0])
        check = correct == predicted
        print('Out:', output, "Status:", check)
        correct_labels.append(np.argmax(sim_labels[0]))
        predictions.append(np.argmax(output[0]))

    target_names = ["0", "1"]
    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(
        classification_report(correct_labels,
                              predictions,
                              target_names=target_names))
    print(confusion_matrix(correct_labels, predictions))
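

# get_trees_from_pairs and get_one_hot_similarity_label are helpers defined
# elsewhere in the project and are not shown here. As a rough, illustrative
# sketch (an assumption, not the original code), the two-argument variant used
# in test_model above could mark a pair as "similar" (class 1) when the left
# and right labels match; note that train_model below uses a one-argument
# variant that takes precomputed pair labels instead.
def get_one_hot_similarity_label_sketch(left_labels, right_labels):
    sim_labels, sim_labels_num = [], []
    for left, right in zip(left_labels, right_labels):
        if left == right:
            sim_labels.append([0.0, 1.0])  # one-hot for "similar"
            sim_labels_num.append(1)
        else:
            sim_labels.append([1.0, 0.0])  # one-hot for "dissimilar"
            sim_labels_num.append(0)
    return sim_labels, sim_labels_num
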
def train_model(logdir,
                inputs,
                embeddings_list_url,
                node_map_url,
                epochs=EPOCHS,
                with_drop_out=1,
                device="0"):
    os.environ['CUDA_VISIBLE_DEVICES'] = device

    print("Using device : " + device)
    print("Batch size : " + str(BATCH_SIZE))
    if int(with_drop_out) == 1:
        print("Training with drop out rate : " + str(DROP_OUT))
    n_classess = 2

    print("Loading training data....")
    # print "Using device : " + device
    with open(inputs, "rb") as fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        all_training_pairs = pickle.load(fh)

    random.shuffle(all_training_pairs)
    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)

    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        node_map = pickle.load(node_map_fh)

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)

    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)
    # with tf.device(device):
    print("Left pooling shape : " + str(tf.shape(left_pooling_node)))
    print("Right pooling shape : " + str(tf.shape(right_pooling_node)))
    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    print(tf.shape(merge_node))
    hidden_node = network.hidden_layer(merge_node, 600, 300)
    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node,
                                        rate=DROP_OUT,
                                        training=True)

    hidden_node = network.hidden_layer(hidden_node, 300, 100)

    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node,
                                        rate=DROP_OUT,
                                        training=True)

    hidden_node = network.hidden_layer(hidden_node, 100, n_classess)

    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node,
                                        rate=DROP_OUT,
                                        training=True)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # tf.summary.scalar('loss', loss_node)

    # correct_prediction = tf.equal(tf.argmax(out_node,1), tf.argmax(labels_node,1))
    # accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    ### init the graph
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allocator_type = 'BFC'
    # config.gpu_options.per_process_gpu_memory_fraction = 0.9
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    config = tf.ConfigProto()
    # config.gpu_options.allocator_type ='BFC'
    # config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.98

    sess = tf.Session(config=config)

    # sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        with open("/input/config.json", 'r') as file_handler:
            contents = json.load(file_handler)
        using_vector_lookup_left = contents[
            'using_vector_lookup_left'] == "false"

    print("Begin training....")

    # with tf.device(device):
    for epoch in range(1, epochs + 1):
        # sample_1_pairs = random.sample(all_1_pairs,1000)
        # sample_0_pairs = random.sample(all_0_pairs,1000)

        # sample_training_pairs = random.sample(all_training_pairs,6400)

        shuffle_left_trees, shuffle_right_trees, labels = get_trees_from_pairs(
            all_training_pairs)
        print("Len left:", len(shuffle_left_trees), "Len right:",
              len(shuffle_right_trees))
        for left_gen_batch, right_gen_batch, labels_batch in sampling.batch_random_samples_2_sides(
                shuffle_left_trees, shuffle_right_trees, embeddings_list,
                node_map, labels, BATCH_SIZE):
            print("----------------------------------------------------")
            print("Len of label batch : " + str(labels_batch))
            left_nodes, left_children = left_gen_batch

            right_nodes, right_children = right_gen_batch

            sim_labels, sim_labels_num = get_one_hot_similarity_label(
                labels_batch)
            # print("sim labels : " + str(sim_labels))

            _, err, out, merge, labs = sess.run(
                [train_step, loss_node, out_node, merge_node, labels_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: sim_labels
                })

            # print "hidden : " + str(loss)
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "True Label vs Predicted Label:", list(zip(labs, out)))

            # print('Epoch:', epoch,'Steps:', steps,'Loss:', err)
            if steps % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, checkfile, steps)
                print('Checkpoint saved.')

            steps += 1
        steps = 0
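
# The network module (init_net_for_siamese, hidden_layer, out_layer, loss_layer)
# ships with the project and is not shown here. A minimal sketch of how the last
# three could behave, inferred only from the way they are called above; these are
# assumptions for illustration, not the project's actual network.py:

def hidden_layer_sketch(inputs, in_size, out_size):
    """Fully connected layer with tanh activation (illustrative only)."""
    weights = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.1))
    biases = tf.Variable(tf.zeros([out_size]))
    return tf.tanh(tf.matmul(inputs, weights) + biases)


def out_layer_sketch(logits):
    """Softmax over the final layer's logits (illustrative only)."""
    return tf.nn.softmax(logits)


def loss_layer_sketch(logits, num_classes):
    """Labels placeholder plus mean softmax cross-entropy (illustrative only)."""
    labels = tf.placeholder(tf.float32, shape=(None, num_classes), name='labels')
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
    return labels, loss
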
Example #3
def test_model(logdir,
               inputs,
               embeddings_list_url,
               node_map_url,
               epochs=EPOCHS):
    """Train a classifier to label ASTs"""

    n_classess = 2

    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)

    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        node_map = pickle.load(node_map_fh)

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)

    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)

    hidden_node = network.hidden_layer(merge_node, 600, 300)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 300, 100)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 100, n_classess)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # tf.summary.scalar('loss', loss_node)

    ### init the graph
    sess = tf.Session()  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('Checkpoint not found.')

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')

    with open(inputs, "rb") as csvfile:
        # with codecs.open("data/test.csv", "r", encoding = "utf-8", errors = 'replace') as csvfile:
        test_data_reader = csv.DictReader(csvfile, delimiter=',')
        for row in test_data_reader:

            print("----------------------")
            print(smart_str(row['test_id']))
            print(smart_str(row['question1']))
            print(smart_str(row['question2']))
            try:
                left_tree, right_tree = get_trees(smart_str(row['question1']),
                                                  smart_str(row['question2']))
                left_nodes, left_children, right_nodes, right_children = sampling.patch_data(
                    left_tree, right_tree, embeddings_list, node_map)
                # for left_nodes, left_children, right_nodes, right_children in sampling.patch_data(left_tree, right_tree, embeddings_list, node_map):

                # left_nodes, left_children = left_gen_batch

                # right_nodes, right_children = right_gen_batch

                output = sess.run(
                    [out_node],
                    feed_dict={
                        left_nodes_node: left_nodes,
                        left_children_node: left_children,
                        right_nodes_node: right_nodes,
                        right_children_node: right_children
                    })
                print(output)
                predicted = np.argmax(output[0])
                print(predicted)
                with open("data/predict_proba2.csv", "a") as f2:
                    f2.write(row['test_id'] + "," +
                             str(format(output[0][0][1], "f")) + "\n")
            except Exception as e:
                print "Error : " + str(e)
                with open("data/predict_proba2.csv", "a") as f2:
                    f2.write(row['test_id'] + "," + "0" + "\n")
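

# A hypothetical driver showing how these functions might be wired together.
# The flag names and default paths below are illustrative assumptions; constants
# such as EPOCHS, BATCH_SIZE and LEARN_RATE, as well as the pickle/csv/numpy/
# tensorflow imports, are expected to come from the surrounding project modules.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Siamese TBCNN train/test driver (illustrative only)')
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--logdir', default='logs/siamese')
    parser.add_argument('--inputs', required=True,
                        help='pickled training pairs, or the test CSV for test mode')
    parser.add_argument('--embeddings', required=True, help='pickled embeddings list')
    parser.add_argument('--node_map', required=True, help='pickled node map')
    parser.add_argument('--device', default='0')
    args = parser.parse_args()

    if args.mode == 'train':
        train_model(args.logdir, args.inputs, args.embeddings, args.node_map,
                    device=args.device)
    else:
        test_model(args.logdir, args.inputs, args.embeddings, args.node_map)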