Code Example #1
def test_model(test_trees, labels, embeddings, embedding_lookup, opt):
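    """Evaluate a trained TreeCaps classifier on test trees and print a classification report"""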

    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    num_feats = len(embeddings[0])

    random.shuffle(test_trees)

    # build the inputs and outputs of the network
    nodes_node, children_node, codecaps_node = network.init_net_treecaps(
        num_feats, len(labels))

    out_node = network.out_layer(codecaps_node)
    labels_node, loss_node = network.loss_layer(codecaps_node, len(labels))

    optimizer = RAdamOptimizer(opt.lr)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'tree_network.ckpt')

    correct_labels = []
    predictions = []
    print('Computing test accuracy...')
    for batch in sampling.batch_samples(
            sampling.gen_samples(test_trees, labels, embeddings,
                                 embedding_lookup), 1):
        nodes, children, batch_labels = batch
        output = sess.run([out_node],
                          feed_dict={
                              nodes_node: nodes,
                              children_node: children,
                          })
        correct_labels.append(np.argmax(batch_labels))
        predictions.append(np.argmax(output))

    target_names = list(labels)
    print(
        classification_report(correct_labels,
                              predictions,
                              target_names=target_names))
    print(confusion_matrix(correct_labels, predictions))
    print('*' * 50)
    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print('*' * 50)
Code Example #2
def train_model(train_dataloader, val_dataloader, embeddings, embedding_lookup, opt):
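    """Train the siamese tree network on left/right tree pairs, reporting validation metrics each epoch"""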
    
    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    train_left_trees = train_dataloader.left_trees
    train_right_trees = train_dataloader.right_trees
    train_labels = train_dataloader.labels

    val_left_trees = val_dataloader.left_trees
    val_right_trees = val_dataloader.right_trees
    val_labels = val_dataloader.labels

    n_classess = 2

    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t" : tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l" : tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r" : tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention" : tf.Variable(initializer([opt.feature_size,1]), name="w_attention")
    }


    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size,]), name="b_conv"),
    }

    left_nodes_node, left_children_node, right_nodes_node, right_children_node, hidden_node, left_score_node, right_score_node = network.init_net_for_siamese(
        num_feats,
        opt.feature_size,
        weights, 
        biases,
        opt.aggregation,
        opt.distributed_function
    )

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()

    # sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0   

    print("Begin training....")

    # with tf.device(device):
    for epoch in range(1, epochs+1):
        print("----------------------------------------------------")
        for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
                train_left_trees, train_right_trees, train_labels, embeddings,
                embedding_lookup, opt.train_batch_size, opt.batch_type):
           
            if opt.batch_type == "original":
                left_nodes, left_children = batch_left_trees
                right_nodes, right_children = batch_right_trees
            else:
                left_nodes, left_children, _ = batch_left_trees
                right_nodes, right_children, _ = batch_right_trees
            
            labels_one_hot = convert_labels_to_one_hot(batch_labels)
                
            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: labels_one_hot
                }
            )

            # print "hidden : " + str(loss)
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err)
         

            if steps % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, os.path.join(checkfile), steps)
                print('Checkpoint saved.')

    
            steps+=1
        steps = 0


        correct_labels = []
        predictions = []

        for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
                val_left_trees, val_right_trees, val_labels, embeddings,
                embedding_lookup, opt.train_batch_size, opt.batch_type):

            if opt.batch_type == "original":
                left_nodes, left_children = batch_left_trees
                right_nodes, right_children = batch_right_trees
            else:
                left_nodes, left_children, _ = batch_left_trees
                right_nodes, right_children, _ = batch_right_trees

            
            labels_one_hot = convert_labels_to_one_hot(batch_labels)
                
            output = sess.run(
                [out_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: labels_one_hot
                }
            )
        
            correct = np.argmax(labels_one_hot, axis=1)
            predicted = np.argmax(output[0], axis=1)

            correct_labels.extend(correct)
            predictions.extend(predicted)

        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(classification_report(correct_labels, predictions))
        print(confusion_matrix(correct_labels, predictions))
Code Example #3
def main(opt):
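    """Restore a trained model, compute attention scores for a test program, and generate a visualization"""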

    target_directory = "live_test/github_java/sort_function/"
    file_name = aggregation_name + "_" + distributed_function_name + "_function.csv"

    print("Loading embeddings....")
    with open(opt.embeddings_directory, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh,encoding='latin1')
    labels = [str(i) for i in range(1, opt.n_classes+1)]
    logdir = opt.model_path
    batch_size = opt.test_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    # Loading program file
    # test_trees, node_ids, node_types, subtree_ids, pkl_path = load_program(opt)

   
    # Init model
    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')   
    ckpt = tf.train.get_checkpoint_state(logdir)
    
    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t" : tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_t"),
        "w_l" : tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_l"),
        "w_r" : tf.Variable(initializer([node_embedding_size, opt.feature_size]), name="w_r"),
        "w_attention" : tf.Variable(initializer([opt.feature_size,1]), name="w_attention")
    }

    biases = {
        "b_conv": tf.Variable(initializer([opt.feature_size,]), name="b_conv"),
    }

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        node_embedding_size,
        len(labels),
        opt.feature_size,
        weights,
        biases,
        opt.aggregation,
        opt.distributed_function
    )
   
    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)
    
    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    logdir = opt.model_path
    batch_size = opt.test_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    with tf.Session() as sess:
        sess.run(init)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

       

        test_trees, node_ids, node_types, pkl_path, pb_path = load_program(opt.test_file)
        

        attention_score_scaled_map = predict(
            sess, out_node, attention_score_node, nodes_node, children_node,
            pkl_path, pb_path, test_trees, labels, node_ids, node_types,
            embeddings, embed_lookup)
        
        attention_path = os.path.join(opt.test_file.split(".")[0] + ".csv")

        if os.path.exists(attention_path):
            os.remove(attention_path)
        with open(attention_path, "a") as f:
            for k, v in attention_score_scaled_map.items():
                f.write(str(k) + "," + str(v))
                f.write("\n")


        generate_visualization(pb_path, attention_path)
Code Example #4
def train_model(embeddings):
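    """Train a pairwise tree model on the feature files, then evaluate precision/recall on each test set"""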
    dictt = {}
    listrec = []
    file_list = os.listdir('dataset/features/features2')
    z = 0
    for file in file_list:
        file_path = 'dataset/features/features2' + '/' + file
        if not os.path.exists(file_path):
            listrec.append(file)
            continue
        with open(file_path, 'r', encoding="utf-8") as faa:
            sample = json.loads(faa.read())
        if sample == "" or sample == " " or sample == "\n":
            z += 1
            listrec.append(file)
            continue
        dictt[file] = sample
    print("length of feature: " + str(len(dictt)))
    # print("invalid file number:", z)

    TrainDatalist = []
    file = open("dataset/dataset/train/" + data_type + ".txt", 'r')
    for line in file:
        if line == "" or line == " ":
            continue
        TrainDatalist.append(line)
    print('len ( TrainData ) = ', len(TrainDatalist))
    file.close()

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_finetune(
        feature_size, embeddingg, KERNEL)
    labels_node, loss_node = network.loss_layer(res)

    aaa = 0
    global_step = tf.Variable(0, trainable=False)
    learn_rate = tf.train.exponential_decay(LEARN_RATE, global_step,
                                            len(TrainDatalist), 0.9, True)
    optimizer = tf.train.GradientDescentOptimizer(learn_rate)
    train_step = optimizer.minimize(loss_node, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(
        config=config)  # config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    for global_step in range(1, EPOCHS + 1):
        k = 0
        random.shuffle(TrainDatalist)
        for line in TrainDatalist:
            line = line.rstrip('\n')
            train_data = line.split('\t')
            if len(train_data) != 3:
                continue
            k += 1
            if (train_data[0] in listrec) or (train_data[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, batch_labels = getData_id_type(
                train_data, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                    labels_node: batch_labels,
                })
            learn_rate_var = sess.run(learn_rate)
            aaa += 1

    test_list = ['argouml', 'gwt', 'jruby', 'xstream', 'all']
    print('------------Model 4' + data_type + '------------')
    for name in test_list:
        print('start test: ' + name)
        correct_labels_test = []
        predictions_test = []
        for _ in range(0, 20):
            predictions_test.append([])

        ff = open("dataset/dataset/test/" + name + "/" + data_type + ".txt",
                  'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            test_data = line.split('\t')
            if len(test_data) != 3:
                continue
            if (test_data[0] in listrec) or (test_data[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, _ = getData_id_type(
                test_data, dictt, embeddings)
            label = test_data[2]
            k += 1
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                })
            correct_labels_test.append(int(label))
            threshold = -1.0
            for i in range(0, 20):
                if output[0] >= threshold:
                    predictions_test[i].append(1)
                else:
                    predictions_test[i].append(-1)
                threshold += 0.1

            with open(
                    "dataset/cluster/4/" + name + "/" +
                    test_data[0].split('_')[0] + '_' +
                    test_data[1].split('_')[0] + '.txt', 'a') as fout:
                fout.writelines(test_data[2] + '\t' + test_data[0] + '\t' +
                                test_data[1] + '\t' + str(output[0]) + '\n')

        # The choice of the threshold will not affect the clustering results
        # We just investigate the max accuracy of our model's prediction of the relationship between chunks
        maxstep = 0
        maxaccuracy = 0
        for i in range(0, 20):
            accuracy = accuracy_score(correct_labels_test, predictions_test[i])
            if accuracy > maxaccuracy:
                maxaccuracy = accuracy
                maxstep = i
        threshold = -1.0 + maxstep * 0.1
        cm = confusion_matrix(correct_labels_test,
                              predictions_test[maxstep],
                              labels=[-1, 1])
        tn, fp, fn, tp = cm.ravel()
        accuracy = accuracy_score(correct_labels_test,
                                  predictions_test[maxstep])
        print("threaholder: " + str(threaholder))
        print("combine precision_test:", tp / (tp + fp))
        print("combine recall_test:", tp / (tp + fn))
        print("seperate precision_test:", tn / (tn + fn))
        print("seperate recall_test:", tn / (tn + fp))
        print("accuracy_test:" + str(accuracy))
        print("tp:" + str(tp) + " tn:" + str(tn) + " fp:" + str(fp) + " fn:" +
              str(fn))
        # print(learn_rate_var)
        if name == "all":
            with open("dataset/cluster/4/total_result.txt",
                      'a') as total_result:
                total_result.writelines(
                    str(tp) + '\t' + str(tn) + '\t' + str(fp) + '\t' +
                    str(fn) + '\n')
        ff.close()
    print('---------------------------------------------')
Code Example #5
def test_model(test_dataloader, embeddings, embedding_lookup, opt):
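    """Evaluate the siamese tree network on test pairs and dump the attention matching matrices to CSV"""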

    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    test_left_trees = test_dataloader.left_trees
    test_right_trees = test_dataloader.right_trees
    test_labels = test_dataloader.labels

    n_classess = 2

    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_t"),
        "w_l":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_l"),
        "w_r":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_r"),
        "w_attention":
        tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }

    biases = {
        "b_conv": tf.Variable(initializer([
            opt.feature_size,
        ]), name="b_conv"),
    }

    left_nodes_node, left_children_node, right_nodes_node, right_children_node, logits_node, left_mask_nodes, right_mask_nodes, attention_matrix_nodes = network.init_net_for_siamese_2(
        num_feats,
        opt.feature_size,
        weights,
        biases,
    )

    out_node = network.out_layer(logits_node)

    labels_node, loss_node = network.loss_layer(logits_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()

    # sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    print("Begin computing accuracy....")

    correct_labels = []
    predictions = []

    for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
            test_left_trees, test_right_trees, test_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type):

        left_nodes, left_children, left_masks = batch_left_trees
        right_nodes, right_children, right_masks = batch_right_trees

        labels_one_hot = convert_labels_to_one_hot(batch_labels)

        matching_matrices, output = sess.run(
            [attention_matrix_nodes, out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: labels_one_hot,
                left_mask_nodes: left_masks,
                right_mask_nodes: right_masks,
            })

        for i, matrix in enumerate(matching_matrices):
            np.savetxt("matching_matrix/" + str(i) + ".csv",
                       matrix,
                       delimiter=",")
        print(output)
        print(output.shape)
        correct = np.argmax(labels_one_hot, axis=1)
        predicted = np.argmax(output, axis=1)

        correct_labels.extend(correct)
        predictions.extend(predicted)

    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions))
    print(confusion_matrix(correct_labels, predictions))
Code Example #6
def train_model(logdir,
                infile,
                embedfile,
                epochs=EPOCHS,
                training="True",
                testing="True"):
    """Train a classifier to label ASTs"""

    print("Loading trees...")
    with open(infile, 'rb') as fh:
        trees, test_trees, labels = pickle.load(fh)

        random.shuffle(trees)

    print(labels)
    print("Loading embeddings....")
    with open(embedfile, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh)
        num_feats = len(embeddings[0])

    # build the inputs and outputs of the network
    nodes_node, children_node, hidden_node = network.init_net(
        num_feats, len(labels))

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    tf.summary.scalar('loss', loss_node)

    ### init the graph
    sess = tf.Session()  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')

    if training == "True":
        print("Begin training..........")
        num_batches = len(trees) // BATCH_SIZE + (
            1 if len(trees) % BATCH_SIZE != 0 else 0)
        for epoch in range(1, epochs + 1):
            for i, batch in enumerate(
                    sampling.batch_samples(
                        sampling.gen_samples(trees, labels, embeddings,
                                             embed_lookup), BATCH_SIZE)):
                nodes, children, batch_labels = batch
                step = (epoch - 1) * num_batches + i * BATCH_SIZE

                if not nodes:
                    continue  # don't try to train on an empty batch
                # print(batch_labels)
                _, summary, err, out = sess.run(
                    [train_step, summaries, loss_node, out_node],
                    feed_dict={
                        nodes_node: nodes,
                        children_node: children,
                        labels_node: batch_labels
                    })

                print('Epoch:', epoch, 'Step:', step, 'Loss:', err,
                      'Max nodes:', len(nodes[0]))

                writer.add_summary(summary, step)
                if step % CHECKPOINT_EVERY == 0:
                    # save state so we can resume later
                    saver.save(sess, os.path.join(checkfile), step)
                    print('Checkpoint saved, epoch:' + str(epoch) +
                          ', step: ' + str(step) + ', loss: ' + str(err) + '.')

        saver.save(sess, os.path.join(checkfile), step)

    # compute the test accuracy
    if testing == "True":
        correct_labels = []
        predictions = []
        print('Computing test accuracy...')
        for batch in sampling.batch_samples(
                sampling.gen_samples(test_trees, labels, embeddings,
                                     embed_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children,
                              })
            #print(output)
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))

        target_names = list(labels)
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(
            classification_report(correct_labels,
                                  predictions,
                                  target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))
Code Example #7
def main():
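    """Restore a trained siamese model and run live predictions on the pairs listed in opt.test_data"""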

    print("Loading embeddings....")
    with open(opt.embeddings_directory, 'rb') as fh:
        embeddings, embed_lookup = pickle.load(fh, encoding='latin1')

    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    n_classess = 2

    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_t"),
        "w_l":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_l"),
        "w_r":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_r"),
        "w_attention":
        tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }

    biases = {
        "b_conv": tf.Variable(initializer([
            opt.feature_size,
        ]), name="b_conv"),
    }

    left_nodes_node, left_children_node, right_nodes_node, right_children_node, hidden_node, left_score_node, right_score_node = network.init_net_for_siamese(
        num_feats, opt.feature_size, weights, biases, opt.aggregation,
        opt.distributed_function)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()

    # sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    print("Begin computing accuracy....")

    test_file = opt.test_data

    # with open(test_file, "r") as f:
    #     data = f.readlines()
    #     for line in data:
    #         line = line.replace("\n","")

    # test_dataloader = CrossLanguageProgramDataForLiveTest(test_file, 1,opt.n_classes)
    # test_left_trees = test_dataloader.left_trees
    # test_right_trees = test_dataloader.right_trees
    # test_labels = test_dataloader.labels
    all_pairs_index = []
    with open(opt.test_data, "r") as f:
        data = f.readlines()
        for line in data:
            print(line)
            all_pairs_index.append(line.replace("\n", ""))

    for i, pair in tqdm(enumerate(all_pairs_index)):
        splits = pair.split(",")
        left_path = splits[0]
        right_path = splits[1]
        label = splits[2]

        pairs, left_node_ids_list, right_node_ids_list = load_single_pair_for_live_test(
            left_path, right_path, label)
        predict(sess, i, left_path, right_path, pairs, left_node_ids_list,
                right_node_ids_list, out_node, left_score_node,
                right_score_node, left_nodes_node, left_children_node,
                right_nodes_node, right_children_node, labels_node, embeddings,
                embed_lookup, opt)
Code Example #8
def train_model(logdir,
                inputs,
                embeddings_list_url,
                node_map_url,
                epochs=EPOCHS,
                with_drop_out=1,
                device="0"):
    os.environ['CUDA_VISIBLE_DEVICES'] = device

    print("Using device : " + device)
    print("Batch size : " + str(BATCH_SIZE))
    if int(with_drop_out) == 1:
        print("Training with drop out rate : " + str(DROP_OUT))
    n_classess = 2

    print("Loading training data....")
    # print "Using device : " + device
    with open(inputs, "rb") as fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        all_training_pairs = pickle.load(fh)

    random.shuffle(all_training_pairs)
    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)

    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        node_map = pickle.load(node_map_fh)

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)

    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)
    # with tf.device(device):
    print("Left pooling shape : " + str(tf.shape(left_pooling_node)))
    print("Right pooling shape : " + str(tf.shape(right_pooling_node)))
    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)
    print(tf.shape(merge_node))
    hidden_node = network.hidden_layer(merge_node, 600, 300)
    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node,
                                        rate=DROP_OUT,
                                        training=True)

    hidden_node = network.hidden_layer(hidden_node, 300, 100)

    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node,
                                        rate=DROP_OUT,
                                        training=True)

    hidden_node = network.hidden_layer(hidden_node, 100, n_classess)

    if int(with_drop_out) == 1:
        hidden_node = tf.layers.dropout(hidden_node,
                                        rate=DROP_OUT,
                                        training=True)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # tf.summary.scalar('loss', loss_node)

    # correct_prediction = tf.equal(tf.argmax(out_node,1), tf.argmax(labels_node,1))
    # accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    ### init the graph
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allocator_type = 'BFC'
    # config.gpu_options.per_process_gpu_memory_fraction = 0.9
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    config = tf.ConfigProto()
    # config.gpu_options.allocator_type ='BFC'
    # config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.98

    sess = tf.Session(config=config)

    # sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        file_handler = open(config_file, 'r')
        contents = json.load(file_handler)
        using_vector_lookup_left = contents[
            'using_vector_lookup_left'] == "false"

    print("Begin training....")

    # with tf.device(device):
    for epoch in range(1, epochs + 1):
        # sample_1_pairs = random.sample(all_1_pairs,1000)
        # sample_0_pairs = random.sample(all_0_pairs,1000)

        # sample_training_pairs = random.sample(all_training_pairs,6400)

        shuffle_left_trees, shuffle_right_trees, labels = get_trees_from_pairs(
            all_training_pairs)
        print("Len left:", len(shuffle_left_trees), "Len right:",
              len(shuffle_right_trees))
        for left_gen_batch, right_gen_batch, labels_batch in sampling.batch_random_samples_2_sides(
                shuffle_left_trees, shuffle_right_trees, embeddings_list,
                node_map, labels, BATCH_SIZE):
            print("----------------------------------------------------")
            print("Len of label batch : " + str(labels_batch))
            left_nodes, left_children = left_gen_batch

            right_nodes, right_children = right_gen_batch

            sim_labels, sim_labels_num = get_one_hot_similarity_label(
                labels_batch)
            # print("sim labels : " + str(sim_labels))

            _, err, out, merge, labs = sess.run(
                [train_step, loss_node, out_node, merge_node, labels_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: sim_labels
                })

            # print "hidden : " + str(loss)
            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "True Label vs Predicted Label:", list(zip(labs, out)))

            # print('Epoch:', epoch,'Steps:', steps,'Loss:', err)
            if steps % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, os.path.join(checkfile), steps)
                print('Checkpoint saved.')

            steps += 1
        steps = 0
Code Example #9
def test_model(test_trees, labels, embeddings, embedding_lookup, opt):
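    """Evaluate a trained tree classifier on test trees and print accuracy metrics"""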
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    random.shuffle(test_trees)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(logdir)

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_t"),
        "w_l":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_l"),
        "w_r":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_r"),
        "w_attention":
        tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }

    biases = {
        "b_conv": tf.Variable(initializer([
            opt.feature_size,
        ]), name="b_conv"),
    }

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        node_embedding_size, len(labels), opt.feature_size, weights, biases,
        opt.aggregation, opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)

        correct_labels = []
        predictions = []
        print('Computing test accuracy...')
        for batch in sampling.batch_samples(
                sampling.gen_samples(test_trees, labels, embeddings,
                                     embedding_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children,
                              })

            # print(attention_score[0])
            # print(len(attention_score[0]))

            # print(output)
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))

        target_names = list(labels)
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(
            classification_report(correct_labels,
                                  predictions,
                                  target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))
Code Example #10
def test_model(test_dataloader, embeddings, embedding_lookup, opt):
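    """Evaluate the siamese tree network on a test pair, export attention matrices, and generate visualizations"""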

    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    test_left_trees = test_dataloader.left_trees
    test_right_trees = test_dataloader.right_trees
    test_labels = test_dataloader.labels
    test_left_node_ids_list = test_dataloader.left_node_ids_list
    test_right_node_ids_list = test_dataloader.right_node_ids_list

    print(test_left_node_ids_list)
    print("Num id left : " + str(len(test_left_node_ids_list[0])))
    print("Num id right : " + str(len(test_right_node_ids_list[0])))
    n_classess = 2

    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_t"),
        "w_l":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_l"),
        "w_r":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_r"),
        "w_attention":
        tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }

    biases = {
        "b_conv": tf.Variable(initializer([
            opt.feature_size,
        ]), name="b_conv"),
    }

    left_nodes_node, left_children_node, right_nodes_node, right_children_node, logits_node, left_mask_nodes, right_mask_nodes, attention_matrix_nodes = network.init_net_for_siamese_2(
        num_feats,
        opt.feature_size,
        weights,
        biases,
    )

    # left_nodes_node, left_children_node, right_nodes_node, right_children_node, logits_node, left_mask_nodes, right_mask_nodes, attention_matrix_nodes = network.init_net_for_siamese_3(
    #     num_feats,
    #     opt.feature_size
    # )

    out_node = network.out_layer(logits_node)

    labels_node, loss_node = network.loss_layer(logits_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()

    # sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    print("Begin computing accuracy....")

    correct_labels = []
    predictions = []

    for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
            test_left_trees, test_right_trees, test_labels, embeddings,
            embedding_lookup, opt.train_batch_size, opt.batch_type):

        left_nodes, left_children, left_masks = batch_left_trees
        right_nodes, right_children, right_masks = batch_right_trees

        labels_one_hot = convert_labels_to_one_hot(batch_labels)

        matching_matrices, output = sess.run(
            [attention_matrix_nodes, out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: labels_one_hot,
                left_mask_nodes: left_masks,
                right_mask_nodes: right_masks,
            })
        matrix = matching_matrices[0]

        if len(test_left_node_ids_list[0]) < len(test_right_node_ids_list[0]):
            matrix = matrix[:len(test_left_node_ids_list[0]), ...]
        else:
            matrix = matrix[..., :len(test_right_node_ids_list[0])]

        matrix_pd = pd.DataFrame(data=matrix,
                                 index=test_left_node_ids_list[0],
                                 columns=test_right_node_ids_list[0])
        matrix_pd.to_csv("live_test/github_pairwise_java/sort/1_matrix.csv",
                         sep=",")

        left_matrix_aggregate_idx = matrix_pd.idxmax(axis=1)
        left_matrix_aggregate_idx.to_csv(
            "live_test/github_pairwise_java/sort/left_aggregate_attention_idx.csv",
            sep=",")

        right_matrix_aggregate_idx = matrix_pd.idxmax(axis=0)
        right_matrix_aggregate_idx.to_csv(
            "live_test/github_pairwise_java/sort/right_aggregate_attention_idx.csv",
            sep=",")

        left_matrix_aggregate = matrix_pd.max(axis=1)
        left_matrix_aggregate.to_csv(
            "live_test/github_pairwise_java/sort/left_aggregate_attention.csv",
            sep=",")
        left_matrix_max_dict = left_matrix_aggregate.to_dict()

        right_matrix_aggregate = matrix_pd.max(axis=0)
        right_matrix_aggregate.to_csv(
            "live_test/github_pairwise_java/sort/right_aggregate_attention.csv",
            sep=",")
        right_matrix_max_dict = right_matrix_aggregate.to_dict()

        left_scaled_attention_map = scale_attention(left_matrix_max_dict)
        right_scaled_attention_map = scale_attention(right_matrix_max_dict)

        left_attention_path = "live_test/github_pairwise_java/sort/left_attention_scaled.csv"
        left_pb_path = "github_java_sort_function_pb/5/3.java.pb"
        left_normal_html_path = "live_test/github_pairwise_java/sort/left_normal.html"
        with open(
                "live_test/github_pairwise_java/sort/left_attention_scaled.csv",
                "w") as f1:
            for key, score in enumerate(left_scaled_attention_map):
                line = str(key) + "," + str(score)
                f1.write("%s\n" % line)

        normal_cmd = "docker run --rm -v $(pwd):/e -it yijun/fast -H 0 -t -x " + left_attention_path + " " + left_pb_path + " > " + left_normal_html_path
        print(normal_cmd)
        os.system(normal_cmd)

        right_attention_path = "live_test/github_pairwise_java/sort/right_attention_scaled.csv"
        right_pb_path = "github_java_sort_function_pb/5/105.java.pb"
        right_normal_html_path = "live_test/github_pairwise_java/sort/right_normal.html"
        with open(
                "live_test/github_pairwise_java/sort/right_attention_scaled.csv",
                "w") as f1:
            for key, score in enumerate(right_scaled_attention_map):
                line = str(key) + "," + str(score)
                f1.write("%s\n" % line)

        normal_cmd = "docker run --rm -v $(pwd):/e -it yijun/fast -H 0 -t -x " + right_attention_path + " " + right_pb_path + " > " + right_normal_html_path
        print(normal_cmd)
        os.system(normal_cmd)

        print(output)
        print(labels_one_hot)
        correct = np.argmax(labels_one_hot, axis=1)
        predicted = np.argmax(output, axis=1)

        correct_labels.extend(correct)
        predictions.extend(predicted)

    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(classification_report(correct_labels, predictions))
    print(confusion_matrix(correct_labels, predictions))
Code Example #11
def train_model(train_trees, val_trees, labels, embeddings, embedding_lookup,
                opt):
    """Train a classifier to label ASTs"""

    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    random.shuffle(train_trees)
    random.shuffle(val_trees)
    # random.shuffle(test_trees)

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    ckpt = tf.train.get_checkpoint_state(logdir)

    initializer = tf.contrib.layers.xavier_initializer()
    weights = {
        "w_t":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_t"),
        "w_l":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_l"),
        "w_r":
        tf.Variable(initializer([node_embedding_size, opt.feature_size]),
                    name="w_r"),
        "w_attention":
        tf.Variable(initializer([opt.feature_size, 1]), name="w_attention")
    }

    biases = {
        "b_conv": tf.Variable(initializer([
            opt.feature_size,
        ]), name="b_conv"),
    }

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        node_embedding_size, len(labels), opt.feature_size, weights, biases,
        opt.aggregation, opt.distributed_function)

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    saver = tf.train.Saver(save_relative_paths=True, max_to_keep=5)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    if opt.training:
        print("Begin training..........")

        with tf.Session() as sess:

            sess.run(init)

            if ckpt and ckpt.model_checkpoint_path:
                print("Continue training with old model")
                print("Checkpoint path : " + str(ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
                for i, var in enumerate(saver._var_list):
                    print('Var {}: {}'.format(i, var))

            # saved_model.loader.load(sess, [tag_constants.TRAINING], savedmodel_path)

            num_batches = len(train_trees) // batch_size + (
                1 if len(train_trees) % batch_size != 0 else 0)
            for epoch in range(1, epochs + 1):
                for i, batch in enumerate(
                        sampling.batch_samples(
                            sampling.gen_samples(train_trees, labels,
                                                 embeddings, embedding_lookup),
                            batch_size)):
                    nodes, children, batch_labels = batch
                    # print(len(batch_labels))
                    # print(len(batch_labels[0]))
                    step = (epoch - 1) * num_batches + i * BATCH_SIZE

                    if not nodes:
                        continue  # don't try to train on an empty batch
                    # print(batch_labels)
                    _, err, out = sess.run(
                        [train_step, loss_node, out_node],
                        feed_dict={
                            nodes_node: nodes,
                            children_node: children,
                            labels_node: batch_labels
                        })

                    print('Epoch:', epoch, 'Step:', step, 'Loss:', err,
                          'Max nodes:', len(nodes[0]))

                    if step % CHECKPOINT_EVERY == 0:
                        # save state so we can resume later
                        saver.save(sess, checkfile)
                        # shutil.rmtree(savedmodel_path)

                        print('Checkpoint saved, epoch:' + str(epoch) +
                              ', step: ' + str(step) + ', loss: ' + str(err) +
                              '.')

                correct_labels = []
                predictions = []
                for batch in sampling.batch_samples(
                        sampling.gen_samples(val_trees, labels, embeddings,
                                             embedding_lookup), 1):
                    nodes, children, batch_labels = batch
                    output = sess.run([out_node],
                                      feed_dict={
                                          nodes_node: nodes,
                                          children_node: children,
                                      })
                    # print(output)
                    correct_labels.append(np.argmax(batch_labels))
                    predictions.append(np.argmax(output))

                target_names = list(labels)
                print('Accuracy:', accuracy_score(correct_labels, predictions))
                print(
                    classification_report(correct_labels,
                                          predictions,
                                          target_names=target_names))
                print(confusion_matrix(correct_labels, predictions))

            print("Finish all iters, storring the whole model..........")
            saver.save(sess, checkfile)
Code Example #12
def test_model(logdir,
               inputs,
               embeddings_list_url,
               node_map_url,
               epochs=EPOCHS):
    """Train a classifier to label ASTs"""

    n_classess = 2

    print("Loading embedding list.....")
    with open(embeddings_list_url, "rb") as embeddings_list_fh:
        embeddings_list = pickle.load(embeddings_list_fh)

    num_feats = len(embeddings_list[0])
    print("number of features : " + str(num_feats))

    print("Loading node map for looking up.....")
    with open(node_map_url, "rb") as node_map_fh:
        # all_1_pairs, all_0_pairs = pickle.load(fh)
        node_map = pickle.load(node_map_fh)

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)

    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)

    hidden_node = network.hidden_layer(merge_node, 600, 300)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 300, 100)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 100, n_classess)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # tf.summary.scalar('loss', loss_node)

    ### init the graph
    sess = tf.Session()  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('Checkpoint not found.')

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')

    with open(inputs, "r") as csvfile:
        # with codecs.open("data/test.csv", "r", encoding = "utf-8", errors = 'replace') as csvfile:
        test_data_reader = csv.DictReader(csvfile, delimiter=',')
        for row in test_data_reader:

            print("----------------------")
            print(smart_str(row['test_id']))
            print(smart_str(row['question1']))
            print(smart_str(row['question2']))
            try:
                left_tree, right_tree = get_trees(smart_str(row['question1']),
                                                  smart_str(row['question2']))
                left_nodes, left_children, right_nodes, right_children = sampling.patch_data(
                    left_tree, right_tree, embeddings_list, node_map)
                # for left_nodes, left_children, right_nodes, right_children in sampling.patch_data(left_tree, right_tree, embeddings_list, node_map):

                # left_nodes, left_children = left_gen_batch

                # right_nodes, right_children = right_gen_batch

                output = sess.run(
                    [out_node],
                    feed_dict={
                        left_nodes_node: left_nodes,
                        left_children_node: left_children,
                        right_nodes_node: right_nodes,
                        right_children_node: right_children
                    })
                print(output)
                predicted = np.argmax(output[0])
                print(predicted)
                with open("data/predict_proba2.csv", "a") as f2:
                    f2.write(row['test_id'] + "," +
                             str(format(output[0][0][1], "f")) + "\n")
            except Exception as e:
                print "Error : " + str(e)
                with open("data/predict_proba2.csv", "a") as f2:
                    f2.write(row['test_id'] + "," + "0" + "\n")
Code Example #13
def train_model(train_trees, val_trees, labels, embeddings, embedding_lookup,
                opt):
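    """Train a TreeCaps classifier, keeping the checkpoint with the best validation accuracy"""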
    max_acc = 0.0
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    num_feats = len(embeddings[0])

    random.shuffle(train_trees)

    nodes_node, children_node, codecaps_node = network.init_net_treecaps(
        num_feats, len(labels))

    codecaps_node = tf.identity(codecaps_node, name="codecaps_node")

    out_node = network.out_layer(codecaps_node)
    labels_node, loss_node = network.loss_layer(codecaps_node, len(labels))

    optimizer = RAdamOptimizer(opt.lr)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'tree_network.ckpt')

    print("Begin training..........")
    num_batches = len(train_trees) // batch_size + (
        1 if len(train_trees) % batch_size != 0 else 0)
    for epoch in range(1, epochs + 1):
        bar = progressbar.ProgressBar(maxval=len(train_trees),
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage()
                                      ])
        bar.start()
        for i, batch in enumerate(
                sampling.batch_samples(
                    sampling.gen_samples(train_trees, labels, embeddings,
                                         embedding_lookup), batch_size)):
            nodes, children, batch_labels = batch
            step = (epoch - 1) * num_batches + i * batch_size

            if not nodes:
                continue
            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict={
                    nodes_node: nodes,
                    children_node: children,
                    labels_node: batch_labels
                })
            bar.update(i + 1)
        bar.finish()

        correct_labels = []
        predictions = []
        logits = []
        for batch in sampling.batch_samples(
                sampling.gen_samples(val_trees, labels, embeddings,
                                     embedding_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children
                              })
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))
            logits.append(output)

        target_names = list(labels)
        acc = accuracy_score(correct_labels, predictions)
        if (acc > max_acc):
            max_acc = acc
            saver.save(sess, checkfile)
            np.save(opt.model_path + '/logits', np.array(logits))
            np.save(opt.model_path + '/correct', np.array(correct_labels))

        print('Epoch', str(epoch), 'Accuracy:', acc, 'Max Acc: ', max_acc)
        csv_log.write(str(epoch) + ',' + str(acc) + ',' + str(max_acc) + '\n')

    print("Finish all iters, storring the whole model..........")
Code example #14
0
File: main.py Project: jhpenger/tbcnn-attention
def train_model(train_trees, val_trees, labels, embeddings, embedding_lookup,
                opt):
    """Train a classifier to label ASTs"""

    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    num_feats = len(embeddings[0])

    random.shuffle(train_trees)
    random.shuffle(val_trees)

    nodes_node, children_node, hidden_node, attention_score_node = network.init_net(
        num_feats, len(labels), opt.aggregation)
    hidden_node = tf.identity(hidden_node, name="hidden_node")

    out_node = network.out_layer(hidden_node)
    labels_node, loss_node = network.loss_layer(hidden_node, len(labels))

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    ### init the graph
    sess = tf.Session()  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')

    print("Begin training..........")
    num_batches = len(train_trees) // batch_size + (
        1 if len(train_trees) % batch_size != 0 else 0)
    for epoch in range(1, epochs + 1):
        for i, batch in enumerate(
                sampling.batch_samples(
                    sampling.gen_samples(train_trees, labels, embeddings,
                                         embedding_lookup), batch_size)):
            nodes, children, batch_labels = batch
            step = (epoch - 1) * num_batches + i * BATCH_SIZE

            if not nodes:
                continue  # don't try to train on an empty batch
            # print(batch_labels)
            _, err, out = sess.run(
                [train_step, loss_node, out_node],
                feed_dict={
                    nodes_node: nodes,
                    children_node: children,
                    labels_node: batch_labels
                })

            print('Epoch:', epoch, 'Step:', step, 'Loss:', err, 'Max nodes:',
                  len(nodes[0]))
            # print(attention_score[0])
            # print(len(attention_score[0]))
            # print(pooling_output.shape)

            if step % CHECKPOINT_EVERY == 0:
                # save state so we can resume later
                saver.save(sess, checkfile)
                # shutil.rmtree(savedmodel_path)

                print('Checkpoint saved, epoch:' + str(epoch) + ', step: ' +
                      str(step) + ', loss: ' + str(err) + '.')

        correct_labels = []
        predictions = []
        for batch in sampling.batch_samples(
                sampling.gen_samples(val_trees, labels, embeddings,
                                     embedding_lookup), 1):
            nodes, children, batch_labels = batch
            output = sess.run([out_node],
                              feed_dict={
                                  nodes_node: nodes,
                                  children_node: children,
                              })
            # print(output)
            correct_labels.append(np.argmax(batch_labels))
            predictions.append(np.argmax(output))

        target_names = list(labels)
        print('Accuracy:', accuracy_score(correct_labels, predictions))
        print(
            classification_report(correct_labels,
                                  predictions,
                                  target_names=target_names))
        print(confusion_matrix(correct_labels, predictions))

    print("Finish all iters, storring the whole model..........")
    saver.save(sess, checkfile)
Code example #15
0
def train_model(infile, embeddings, epochs=EPOCHS):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    num_feats = 100
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_nofinetune(
        num_feats)
    labels_node, loss_node = network.loss_layer(res)

    optimizer = tf.train.GradientDescentOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(
        config=config)  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())
    dictt = {}
    listrec = []
    f = open("flistPOJ.txt", 'r')
    line = f.readline().rstrip('\t')
    l = line.split('\t')
    for ll in l:
        if not os.path.exists(ll):
            listrec.append(ll)
            continue
        tree = pycparser.parse_file(ll)
        sample, size = _traverse_tree_noast(tree)
        dictt[ll] = sample
    f.close()
    for epoch in range(1, epochs + 1):
        f = open(infile, 'r')
        line = "123"
        aaa = 0
        while line:
            line = f.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            if l[0] in listrec:
                continue
            if l[1] in listrec:
                continue
            nodes1, children1, nodes2, children2, batch_labels = getData_nofinetune(
                l, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                    labels_node: [batch_labels]
                })
            maxnodes = max(len(nodes1[0]), len(nodes2[0]))
            if aaa % 1000 == 0:
                print('Epoch:', epoch, 'Step:', aaa, 'Loss:', err, 'R:', r,
                      'Max nodes:', maxnodes)
            aaa += 1
        f.close()
        correct_labels_dev = []
        predictions_dev = []
        for reci in range(0, 15):
            predictions_dev.append([])
        ff = open("./datasetForVariantsTBCCD/POJ/devdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1

            nodes1, children1, nodes2, children2, batch_labels = getData_nofinetune(
                l, dictt, embeddings)

            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_dev.append(int(l[2]))
            threaholder = -0.7
            for i in range(0, 15):
                if output[0] >= threaholder:
                    predictions_dev[i].append(1)
                else:
                    predictions_dev[i].append(-1)
                threaholder += 0.1
        maxf1value = -1.0
        for i in range(0, 15):
            f1score = f1_score(correct_labels_dev,
                               predictions_dev[i],
                               average='binary')
            if f1score > maxf1value:
                maxf1value = f1score
                maxstep = i
        ff.close()
        correct_labels_test = []
        predictions_test = []
        ff = open("./datasetForVariantsTBCCD/POJ/testdata.txt", 'r')
        line = "123"
        k = 0

        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1

            if (l[0] in listrec) or (l[1] in listrec):
                continue
            nodes1, children1, nodes2, children2, batch_labels = getData_nofinetune(
                l, dictt, embeddings)

            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_test.append(int(l[2]))
            threaholder = -0.7 + maxstep * 0.1
            if output[0] >= threaholder:
                predictions_test.append(1)
            else:
                predictions_test.append(-1)
        ff.close()

        print("starttest:\n")
        print("threaholder:")
        print(threaholder)
        p = precision_score(correct_labels_test,
                            predictions_test,
                            average='binary')
        r = recall_score(correct_labels_test,
                         predictions_test,
                         average='binary')
        f1score = f1_score(correct_labels_test,
                           predictions_test,
                           average='binary')
        print("recall_test:" + str(r))
        print("precision_test:" + str(p))
        print("f1score_test:" + str(f1score))
Code example #16
0
File: test_bitbcnn.py Project: stjordanis/bi-tbcnn
def test_model(logdir, inputs, left_embedfile, right_embedfile, epochs=EPOCHS):
    """Train a classifier to label ASTs"""

    n_classess = 2
    # left_algo_labels = ['mergesort', 'linkedlist', 'quicksort', 'bfs', 'bubblesort', 'knapsack']
    # right_algo_labels = ['mergesort', 'linkedlist', 'quicksort', 'bfs', 'bubblesort', 'knapsack']
    left_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]
    right_algo_labels = [
        "bfs", "bubblesort", "knapsack", "linkedlist", "mergesort",
        "quicksort", "heap", "dfs", "stack", "queue"
    ]
    # with open(left_inputs, 'rb') as fh:
    #     _, left_trees, left_algo_labels = pickle.load(fh)

    # with open(right_inputs, 'rb') as fh:
    #     _, right_trees, right_algo_labels = pickle.load(fh)
    with open(inputs, "rb") as fh:
        testing_pairs = pickle.load(fh)
    print "Loading embdding vectors...."
    with open(left_embedfile, 'rb') as fh:
        left_embeddings, left_embed_lookup = pickle.load(fh)

    with open(right_embedfile, 'rb') as fh:
        right_embeddings, right_embed_lookup = pickle.load(fh)

    num_feats = len(left_embeddings[0])

    # build the inputs and outputs of the network
    left_nodes_node, left_children_node, left_pooling_node = network.init_net_for_siamese(
        num_feats)

    right_nodes_node, right_children_node, right_pooling_node = network.init_net_for_siamese(
        num_feats)

    merge_node = tf.concat([left_pooling_node, right_pooling_node], -1)

    hidden_node = network.hidden_layer(merge_node, 200, 200)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 200, 200)
    # hidden_node = tf.layers.dropout(hidden_node, rate=0.2, training=False)

    hidden_node = network.hidden_layer(hidden_node, 200, n_classess)

    out_node = network.out_layer(hidden_node)

    labels_node, loss_node = network.loss_layer(hidden_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    # tf.summary.scalar('loss', loss_node)

    ### init the graph
    sess = tf.Session()  #config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('Checkpoint not found.')

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    left_trees, right_trees = get_trees_from_pairs(testing_pairs)

    using_vector_lookup_left = False
    if os.path.isfile("/input/config.json"):
        file_handler = open("/input/config.json", 'r')
        contents = json.load(file_handler)
        using_vector_lookup_left = contents[
            'using_vector_lookup_left'] == "false"

    correct_labels = []
    predictions = []
    print('Computing testing accuracy...')
    for left_gen_batch, right_gen_batch in sampling.batch_random_samples_2_sides(
            left_trees, left_algo_labels, right_trees, right_algo_labels,
            left_embeddings, left_embed_lookup, right_embeddings,
            right_embed_lookup, using_vector_lookup_left, False,
            TEST_BATCH_SIZE):
        left_nodes, left_children, left_labels_one_hot, left_labels = left_gen_batch

        right_nodes, right_children, right_labels_one_hot, right_labels = right_gen_batch
        sim_labels, _ = get_one_hot_similarity_label(left_labels, right_labels)
        print("sim labels : " + str(sim_labels))
        output = sess.run(
            [out_node],
            feed_dict={
                left_nodes_node: left_nodes,
                left_children_node: left_children,
                right_nodes_node: right_nodes,
                right_children_node: right_children,
                labels_node: sim_labels
            })
        correct = np.argmax(sim_labels[0])
        predicted = np.argmax(output[0])
        check = (correct == predicted)
        print('Out:', output, "Status:", check)
        correct_labels.append(np.argmax(sim_labels[0]))
        predictions.append(np.argmax(output[0]))

    target_names = ["0", "1"]
    print('Accuracy:', accuracy_score(correct_labels, predictions))
    print(
        classification_report(correct_labels,
                              predictions,
                              target_names=target_names))
    print(confusion_matrix(correct_labels, predictions))
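
get_one_hot_similarity_label is not included in this snippet. Judging only from how its return value is used above (a batch of two-class one-hot rows fed to labels_node and compared via argmax), one plausible reconstruction is sketched below; this is an assumption about the helper, not the actual bi-tbcnn implementation (in particular, which index means "similar" is guessed):

import numpy as np

def get_one_hot_similarity_label(left_labels, right_labels):
    # Hypothetical reconstruction: one-hot [different, similar] per pair.
    sim_labels = []
    sim_indices = []
    for left, right in zip(left_labels, right_labels):
        index = 1 if left == right else 0  # assume index 1 means "similar"
        one_hot = [0.0, 0.0]
        one_hot[index] = 1.0
        sim_labels.append(one_hot)
        sim_indices.append(index)
    return np.array(sim_labels), sim_indices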
Code example #17
0
def train_model(train_dataloader, val_dataloader, embeddings, embedding_lookup,
                opt):

    logdir = opt.model_path
    epochs = opt.niter
    node_embedding_size = len(embeddings[0])

    train_left_trees = train_dataloader.left_trees
    train_right_trees = train_dataloader.right_trees
    train_labels = train_dataloader.labels

    val_left_trees = val_dataloader.left_trees
    val_right_trees = val_dataloader.right_trees
    val_labels = val_dataloader.labels

    n_classess = 2

    num_feats = len(embeddings[0])

    initializer = tf.contrib.layers.xavier_initializer()

    left_nodes_node, left_children_node, right_nodes_node, right_children_node, logits_node, left_mask_nodes, right_mask_nodes, attention_matrix_nodes = network.init_net_for_siamese_3(
        num_feats, opt.feature_size)

    out_node = network.out_layer(logits_node)

    labels_node, loss_node = network.loss_layer(logits_node, n_classess)

    optimizer = tf.train.AdamOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    sess = tf.Session()

    sess.run(tf.global_variables_initializer())

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))
        # else:
        #     raise 'Checkpoint not found.'

    checkfile = os.path.join(logdir, 'cnn_tree.ckpt')
    steps = 0

    print("Begin training....")

    # with tf.device(device):
    # temp_precision = 0.0
    # temp_recall = 0.0
    temp_accuracy = 0.0
    for epoch in range(1, epochs + 1):
        for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
                train_left_trees, train_right_trees, train_labels, embeddings,
                embedding_lookup, opt.train_batch_size, opt.batch_type):

            left_nodes, left_children, left_masks = batch_left_trees
            right_nodes, right_children, right_masks = batch_right_trees

            labels_one_hot = convert_labels_to_one_hot(batch_labels)

            _, err, left_nodes_out, out = sess.run(
                [train_step, loss_node, left_nodes_node, out_node],
                feed_dict={
                    left_nodes_node: left_nodes,
                    left_children_node: left_children,
                    right_nodes_node: right_nodes,
                    right_children_node: right_children,
                    labels_node: labels_one_hot,
                    left_mask_nodes: left_masks,
                    right_mask_nodes: right_masks,
                })

            print('Epoch:', epoch, 'Steps:', steps, 'Loss:', err,
                  "Val Accuracy:", temp_accuracy)
            print(left_nodes_out.shape)

            if steps % CHECKPOINT_EVERY == 0:
                print("Checkpoint, validating.....")

                correct_labels = []
                predictions = []

                for batch_left_trees, batch_right_trees, batch_labels in sampling.batch_random_samples_2_sides(
                        val_left_trees, val_right_trees, val_labels,
                        embeddings, embedding_lookup, opt.train_batch_size,
                        opt.batch_type):

                    left_nodes, left_children, left_masks = batch_left_trees
                    right_nodes, right_children, right_masks = batch_right_trees

                    labels_one_hot = convert_labels_to_one_hot(batch_labels)

                    output = sess.run(
                        [out_node],
                        feed_dict={
                            left_nodes_node: left_nodes,
                            left_children_node: left_children,
                            right_nodes_node: right_nodes,
                            right_children_node: right_children,
                            labels_node: labels_one_hot,
                            left_mask_nodes: left_masks,
                            right_mask_nodes: right_masks,
                        })

                    correct = np.argmax(labels_one_hot, axis=1)
                    predicted = np.argmax(output[0], axis=1)

                    correct_labels.extend(correct)
                    predictions.extend(predicted)

                accuracy = float(accuracy_score(correct_labels, predictions))
                precision = float(precision_score(correct_labels, predictions))
                recall = float(recall_score(correct_labels, predictions))
                print('Accuracy:', accuracy)
                print(classification_report(correct_labels, predictions))
                print(confusion_matrix(correct_labels, predictions))

                if accuracy > temp_accuracy:
                    temp_accuracy = accuracy
                    with open("no_tbcnn_validation.txt", "w") as f:
                        f.write(str(temp_accuracy))
                    # save state so we can resume later
                    saver.save(sess, checkfile, global_step=steps)
                    print('Checkpoint saved.')

            steps += 1
        steps = 0
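
convert_labels_to_one_hot is likewise not shown here. Since n_classess is 2 and the result is consumed with np.argmax(..., axis=1) and fed to labels_node, a plausible sketch is the following; treat it as an illustrative assumption rather than the project's actual helper:

import numpy as np

def convert_labels_to_one_hot(batch_labels, num_classes=2):
    # Hypothetical helper: map integer labels (0/1) to one-hot rows.
    one_hot = np.zeros((len(batch_labels), num_classes), dtype=np.float32)
    for row, label in enumerate(batch_labels):
        one_hot[row, int(label)] = 1.0
    return one_hot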
Code example #18
0
def train_model(infile, embeddings):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    num_feats = len(getWordEmd('ForStatement'))
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_nofinetune(
        num_feats)
    labels_node, loss_node = network.loss_layer(res)
    optimizer = tf.train.GradientDescentOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(
        config=config)  # config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())
    dictt = {}
    listrec = []
    f = open("flistBCB.txt", 'r')
    line = f.readline().rstrip('\t')
    l = line.split('\t')
    z = 0
    for ll in l:
        if not os.path.exists(ll):
            listrec.append(ll)
            continue
        faa = open(ll, 'r', encoding="utf-8")
        fff = faa.read()
        tree = javalang.parse.parse_member_signature(fff)
        sample, size = _traverse_treewithid(tree)
        if size > 3000 or size < 10:
            z += 1
            listrec.append(ll)
            continue
        dictt[ll] = sample
    f.close()
    for epoch in range(1, EPOCHS + 1):
        f = open(infile, 'r')
        line = "123"
        k = 0
        while line:
            line = f.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            batch_labels = []
            nodes1, children1, nodes2, children2, la = getData_nofinetune(
                l, dictt, embeddings)
            batch_labels.append(la)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                    labels_node: batch_labels
                })
            maxnodes = max(len(nodes1[0]), len(nodes2[0]))
            if k % 1000 == 0:
                print('Epoch:', epoch, 'Step:', k, 'Loss:', err, 'R:', r,
                      'Max nodes:', maxnodes)
        f.close()
        correct_labels_dev = []
        predictions_dev = []
        for reci in range(0, 15):
            predictions_dev.append([])
        ff = open("./datasetForVariantsTBCCD/BCB/devdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            batch_labels = []
            nodes1, children1, nodes2, children2, la = getData_nofinetune(
                l, dictt, embeddings)
            batch_labels.append(la)
            k += 1
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            correct_labels_dev.append(int(batch_labels[0]))
            threaholder = -0.7
            for i in range(0, 15):
                if output[0] >= threaholder:
                    predictions_dev[i].append(1)
                else:
                    predictions_dev[i].append(-1)
                threaholder += 0.1
        maxstep = 0
        maxf1value = 0
        for i in range(0, 15):
            f1score = f1_score(correct_labels_dev,
                               predictions_dev[i],
                               average='binary')
            if f1score > maxf1value:
                maxf1value = f1score
                maxstep = i
        ff.close()
        correct_labels_test = []
        predictions_test = []
        ff = open("./datasetForVariantsTBCCD/BCB/testdata.txt", 'r')
        line = "123"
        k = 0
        print("starttest:")
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            batch_labels = []
            nodes1, children1, nodes2, children2, la = getData_nofinetune(
                l, dictt, embeddings)
            batch_labels.append(la)
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes1,
                    children_node1: children1,
                    nodes_node2: nodes2,
                    children_node2: children2,
                })
            k += 1
            correct_labels_test.append(int(batch_labels[0]))
            threaholderr = -0.7 + maxstep * 0.1
            if output[0] >= threaholderr:
                predictions_test.append(1)
            else:
                predictions_test.append(-1)
        ff.close()
        print("testdata\n")
        print("threa:")
        print(threaholderr)
        p = precision_score(correct_labels_test,
                            predictions_test,
                            average='binary')
        r = recall_score(correct_labels_test,
                         predictions_test,
                         average='binary')
        f1score = f1_score(correct_labels_test,
                           predictions_test,
                           average='binary')
        print("recall_test:" + str(r))
        print("precision_test:" + str(p))
        print("f1score_test:" + str(f1score))
        ff.close()
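
The training, dev, and test loops above all read tab-separated triples (left path, right path, label) and stop at the first malformed line. A minimal reader sketch of that format (read_pair_file is an illustrative name, not part of the original code):

def read_pair_file(path):
    # Yield (left_path, right_path, label) triples from a tab-separated file.
    with open(path, 'r') as handle:
        for line in handle:
            parts = line.rstrip('\n').split('\t')
            if len(parts) != 3:
                break  # the original loops also stop at the first malformed line
            yield parts[0], parts[1], int(parts[2])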
Code example #19
0
File: main.py Project: bdqnghi/treecaps
def train_model(train_trees, val_trees, labels, embedding_lookup, opt):
    max_acc = 0.0
    logdir = opt.model_path
    batch_size = opt.train_batch_size
    epochs = opt.niter
    
    random.shuffle(train_trees)
    
    nodes_node, children_node, codecaps_node = network.init_net_treecaps(50, embedding_lookup, len(labels))

    codecaps_node = tf.identity(codecaps_node, name="codecaps_node")

    out_node = network.out_layer(codecaps_node)
    labels_node, loss_node = network.loss_layer(codecaps_node, len(labels))

    optimizer = RAdamOptimizer(opt.lr)
    train_point = optimizer.minimize(loss_node)
    
     ### init the graph
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    with tf.name_scope('saver'):
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(logdir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Continue training with old model")
            saver.restore(sess, ckpt.model_checkpoint_path)
            for i, var in enumerate(saver._var_list):
                print('Var {}: {}'.format(i, var))

    checkfile = os.path.join(logdir, 'tree_network.ckpt')

    print("Begin training..........")
    num_batches = len(train_trees) // batch_size + (1 if len(train_trees) % batch_size != 0 else 0)
    max_acc = 0.0
    for epoch in range(1, epochs+1):
       
        for train_step, train_batch in enumerate(sampling.batch_samples(
            sampling.gen_samples(train_trees, labels), batch_size
        )):
            nodes, children, batch_labels = train_batch
            # step = (epoch - 1) * num_batches + train_step * batch_size

            if not nodes:
                continue
            _, err, out = sess.run(
                [train_point, loss_node, out_node],
                feed_dict={
                    nodes_node: nodes,
                    children_node: children,
                    labels_node: batch_labels
                }
            )
         
            print("Epoch : ", str(epoch), "Step : ", train_step, "Loss : ", err, "Max Acc: ",max_acc)


            if train_step % 1000 == 0 and train_step > 0:
                correct_labels = []
                predictions = []
                # logits = []
                for test_batch in sampling.batch_samples(
                    sampling.gen_samples(val_trees, labels), batch_size
                ):
                    print("---------------")
                    nodes, children, batch_labels = test_batch
                    print(batch_labels)
                    output = sess.run([out_node],
                        feed_dict={
                            nodes_node: nodes,
                            children_node: children
                        }
                    )

                    batch_correct_labels = np.argmax(batch_labels, axis=1)
                    batch_predictions = np.argmax(output[0], axis=1)
                    correct_labels.extend(batch_correct_labels)
                    predictions.extend(batch_predictions)
                    # logits.append(output)

                    print(batch_correct_labels)
                    print(batch_predictions)

                acc = accuracy_score(correct_labels, predictions)
                if (acc>max_acc):
                    max_acc = acc
                    saver.save(sess, checkfile)
                    print("Saved checkpoint....")

                print('Epoch',str(epoch),'Accuracy:', acc, 'Max Acc: ',max_acc)
                csv_log.write(str(epoch)+','+str(acc)+','+str(max_acc)+'\n')

    print("Finish all iters, storring the whole model..........")