Example #1

import json
import os
import random

import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix

# data_type, feature_size, KERNEL, LEARN_RATE, EPOCHS, network, embeddingg
# and getData_id_type are module-level definitions assumed to exist elsewhere
# in the project.
def train_model(embeddings):
    dictt = {}
    listrec = []
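    # Load the precomputed feature files; remember missing or empty ones in
    # listrec so the corresponding pairs can be skipped later.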
    file_list = os.listdir('dataset/features/features2')
    z = 0
    for file in file_list:
        file_path = 'dataset/features/features2' + '/' + file
        if not os.path.exists(file_path):
            listrec.append(file)
            continue
        with open(file_path, 'r', encoding="utf-8") as faa:
            sample = json.loads(faa.read())
        if sample == "" or sample == " " or sample == "\n":
            z += 1
            listrec.append(file)
            continue
        dictt[file] = sample
    print("length of feature: " + str(len(dictt)))
    # print("invalid file number:", z)

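    # Read the training pair list; each line holds two file ids and a label,
    # separated by tabs.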
    TrainDatalist = []
    with open("dataset/dataset/train/" + data_type + ".txt", 'r') as train_file:
        for line in train_file:
            if line == "" or line == " ":
                continue
            TrainDatalist.append(line)
    print('len(TrainDatalist) =', len(TrainDatalist))

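    # Pin the process to one GPU, then build the two-input network, its loss,
    # and an SGD optimizer with an exponentially decaying learning rate.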
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_finetune(
        feature_size, embeddingg, KERNEL)
    labels_node, loss_node = network.loss_layer(res)

    aaa = 0  # global step counter (only incremented, never read)
    global_step = tf.Variable(0, trainable=False)
    learn_rate = tf.train.exponential_decay(LEARN_RATE, global_step,
                                            len(TrainDatalist), 0.9,
                                            staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learn_rate)
    train_step = optimizer.minimize(loss_node, global_step=global_step)

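    # Create the session; allow_growth keeps TensorFlow from reserving all GPU
    # memory up front.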
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(
        config=config)  # config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())

    # Note: do not name this loop variable global_step, or it will shadow the
    # tf.Variable of the same name defined above.
    for epoch in range(1, EPOCHS + 1):
        k = 0
        random.shuffle(TrainDatalist)
        for line in TrainDatalist:
            line = line.rstrip('\n')
            train_data = line.split('\t')
            if len(train_data) != 3:
                continue
            k += 1
            if (train_data[0] in listrec) or (train_data[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, batch_labels = getData_id_type(
                train_data, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                    labels_node: batch_labels,
                })
            learn_rate_var = sess.run(learn_rate)  # current decayed rate, used only by the commented print below
            aaa += 1

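    # Evaluate per project, sweeping 20 thresholds and writing the raw scores
    # out for the later clustering step.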
    test_list = ['argouml', 'gwt', 'jruby', 'xstream', 'all']
    print('------------Model 4' + data_type + '------------')
    for name in test_list:
        print('start test: ' + name)
        correct_labels_test = []
        predictions_test = []
        for _ in range(0, 20):
            predictions_test.append([])

        ff = open("dataset/dataset/test/" + name + "/" + data_type + ".txt",
                  'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            test_data = line.split('\t')
            if len(test_data) != 3:
                continue
            if (test_data[0] in listrec) or (test_data[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, _ = getData_id_type(
                test_data, dictt, embeddings)
            label = test_data[2]
            k += 1
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                })
            correct_labels_test.append(int(label))
            # Sweep 20 candidate thresholds from -1.0 to 0.9; output[0] holds
            # the score of a single-pair batch.
            threshold = -1.0
            for i in range(0, 20):
                if output[0] >= threshold:
                    predictions_test[i].append(1)
                else:
                    predictions_test[i].append(-1)
                threshold += 0.1

            with open(
                    "dataset/cluster/4/" + name + "/" +
                    test_data[0].split('_')[0] + '_' +
                    test_data[1].split('_')[0] + '.txt', 'a') as fout:
                fout.writelines(test_data[2] + '\t' + test_data[0] + '\t' +
                                test_data[1] + '\t' + str(output[0]) + '\n')

        # The choice of threshold does not affect the clustering results; we
        # only investigate the maximum accuracy of the model's predictions of
        # the relationship between chunks.
        maxstep = 0
        maxaccuracy = 0
        for i in range(0, 20):
            accuracy = accuracy_score(correct_labels_test, predictions_test[i])
            if accuracy > maxaccuracy:
                maxaccuracy = accuracy
                maxstep = i
        threshold = -1.0 + maxstep * 0.1
        cm = confusion_matrix(correct_labels_test,
                              predictions_test[maxstep],
                              labels=[-1, 1])
        tn, fp, fn, tp = cm.ravel()
        accuracy = accuracy_score(correct_labels_test,
                                  predictions_test[maxstep])
        print("threaholder: " + str(threaholder))
        print("combine precision_test:", tp / (tp + fp))
        print("combine recall_test:", tp / (tp + fn))
        print("seperate precision_test:", tn / (tn + fn))
        print("seperate recall_test:", tn / (tn + fp))
        print("accuracy_test:" + str(accuracy))
        print("tp:" + str(tp) + " tn:" + str(tn) + " fp:" + str(fp) + " fn:" +
              str(fn))
        # print(learn_rate_var)
        if name == "all":
            with open("dataset/cluster/4/total_result.txt",
                      'a') as total_result:
                total_result.writelines(
                    str(tp) + '\t' + str(tn) + '\t' + str(fp) + '\t' +
                    str(fn) + '\n')
        ff.close()
    print('---------------------------------------------')
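
Example #1 is the only variant with a learning-rate schedule: with staircase=True, tf.train.exponential_decay multiplies the base rate by 0.9 after every decay_steps global steps, so with decay_steps = len(TrainDatalist) the rate drops by 10% per full pass over the training pairs. A plain-Python sketch of that schedule (decayed_lr is a hypothetical helper name, not part of the original code):

def decayed_lr(base_lr, global_step, decay_steps, decay_rate=0.9):
    # Staircase exponential decay: base_lr * decay_rate ** floor(step / decay_steps).
    return base_lr * decay_rate ** (global_step // decay_steps)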

Example #2

import os

import javalang
import tensorflow as tf
from sklearn.metrics import f1_score, precision_score, recall_score

# LEARN_RATE, EPOCHS, network, embeddingg, getData_finetune and
# _traverse_treewithid are module-level definitions assumed to exist elsewhere
# in the project.
def train_model(infile, embeddings):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    num_feats = 100
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_finetune(
        num_feats, embeddingg)
    aaa = 1  # global step counter for the periodic loss print below
    labels_node, loss_node = network.loss_layer(res)
    optimizer = tf.train.GradientDescentOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)  # config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())
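    # Parse every Java method listed in flistBCB.txt with javalang and cache
    # its AST; drop files that are missing or whose tree size falls outside
    # [10, 3000].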
    dictt = {}
    listrec = []
    f = open("flistBCB.txt", 'r')
    line = f.readline().rstrip('\t')
    l = line.split('\t')
    z = 0
    for ll in l:
        if not os.path.exists(ll):
            listrec.append(ll)
            continue
        with open(ll, 'r', encoding="utf-8") as faa:
            fff = faa.read()
        tree = javalang.parse.parse_member_signature(fff)
        sample, size = _traverse_treewithid(tree)
        if size > 3000 or size < 10:
            z += 1
            listrec.append(ll)
            continue
        dictt[ll] = sample
    f.close()
    print("wuxiaogeshu:", z)
    for epoch in range(1, EPOCHS + 1):
        f = open(infile, 'r')
        line = "123"
        k = 0
        while line:
            line = f.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            k += 1

            nodes11, children1, nodes22, children2, batch_labels = getData_finetune(
                l, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                    labels_node: batch_labels
                }
            )
            if aaa % 1000 == 0:
                print('Epoch:', epoch,
                      'Step:', aaa,
                      'Loss:', err,
                      'R:', r,
                      )
            aaa += 1
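        # After each epoch, sweep 15 thresholds from -0.7 to 0.7 on the dev
        # split and keep the one with the best F1.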
        correct_labels_dev = []
        predictions_dev = []
        for i in range(0, 15):
            predictions_dev.append([])
        ff = open("./datasetForCompareWithCDLH/BCB/devdata1.txt", 'r')
        line = "123"
        k = 0
        maxf1value = -1.0
        maxstep = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            label = l[2]
            k += 1
            nodes11, children1, nodes22, children2, _ = getData_finetune(l, dictt, embeddings)
            output = sess.run([res],
                              feed_dict={
                                  nodes_node1: nodes11,
                                  children_node1: children1,
                                  nodes_node2: nodes22,
                                  children_node2: children2,
                              }
                              )
            correct_labels_dev.append(int(label))
            threshold = -0.7
            for i in range(0, 15):
                if output[0] >= threshold:
                    predictions_dev[i].append(1)
                else:
                    predictions_dev[i].append(-1)
                threshold += 0.1
        for i in range(0, 15):
            f1score = f1_score(correct_labels_dev, predictions_dev[i], average='binary')
            if f1score > maxf1value:
                maxf1value = f1score
                maxstep = i

        ff.close()
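        # Apply the threshold chosen on the dev split to the test split.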
        correct_labels_test = []
        predictions_test = []
        ff = open("./datasetForCompareWithCDLH/BCB/testdata1.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            label = l[2]

            if (l[0] in listrec) or (l[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, _ = getData_finetune(l, dictt, embeddings)
            output = sess.run([res],
                              feed_dict={
                                  nodes_node1: nodes11,
                                  children_node1: children1,
                                  nodes_node2: nodes22,
                                  children_node2: children2,
                              }
                              )
            correct_labels_test.append(int(label))
            threshold = -0.7 + maxstep * 0.1
            if output[0] >= threshold:
                predictions_test.append(1)
            else:
                predictions_test.append(-1)
        print("start test:")
        print("threshold:", threshold)
        p = precision_score(correct_labels_test, predictions_test, average='binary')
        r = recall_score(correct_labels_test, predictions_test, average='binary')
        f1score = f1_score(correct_labels_test, predictions_test, average='binary')
        print("recall_test:" + str(r))
        print("precision_test:" + str(p))
        print("f1score_test:" + str(f1score))
        ff.close()
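
Each example repeats the same scan over tab-separated pair files. A minimal sketch of that pattern as a reusable helper (read_pairs is a hypothetical name, not part of the original code):

def read_pairs(path, skip=()):
    # Return (file1, file2, label) triples, skipping malformed lines and pairs
    # whose files were recorded as invalid.
    pairs = []
    with open(path, 'r') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) != 3:
                continue
            if parts[0] in skip or parts[1] in skip:
                continue
            pairs.append(parts)
    return pairs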

Example #3

import os

import pycparser
import tensorflow as tf
from sklearn.metrics import f1_score, precision_score, recall_score

# LEARN_RATE, EPOCHS, network, embeddingg, getData_finetune and
# _traverse_tree_withid are module-level definitions assumed to exist
# elsewhere in the project.
def train_model(infile, embeddings, epochs=EPOCHS):
    os.environ["CUDA_VISIBLE_DEVICES"] = "4"
    num_feats = 100
    # build the inputs and outputs of the network
    nodes_node1, children_node1, nodes_node2, children_node2, res = network.init_net_finetune(
        num_feats, embeddingg)
    labels_node, loss_node = network.loss_layer(res)

    optimizer = tf.train.GradientDescentOptimizer(LEARN_RATE)
    train_step = optimizer.minimize(loss_node)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # init the session
    sess = tf.Session(
        config=config)  # config=tf.ConfigProto(device_count={'GPU':0}))
    sess.run(tf.global_variables_initializer())
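    # Parse every C file listed in flistPOJ.txt with pycparser and cache its
    # AST; files that do not exist are recorded in listrec and skipped.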
    dictt = {}
    listrec = []
    f = open("flistPOJ.txt", 'r')
    line = f.readline().rstrip('\t')
    l = line.split('\t')
    print(len(l))

    for ll in l:
        if not os.path.exists(ll):
            listrec.append(ll)
            continue
        tree = pycparser.parse_file(ll)
        sample, size = _traverse_tree_withid(tree)
        dictt[ll] = sample
    f.close()
    for epoch in range(1, epochs + 1):
        f = open(infile, 'r')
        line = "123"
        k = 0
        aaa = 0
        while line:
            line = f.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, batch_labels = getData_finetune(
                l, dictt, embeddings)
            _, err, r = sess.run(
                [train_step, loss_node, res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                    labels_node: batch_labels
                })
            if aaa % 1000 == 0:
                print(
                    'Epoch:',
                    epoch,
                    'Step:',
                    aaa,
                    'Loss:',
                    err,
                    'R:',
                    r,
                )
            aaa += 1
        f.close()
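        # Dev sweep: 15 thresholds from -0.7 to 0.7, pick the best by F1.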
        correct_labels_dev = []
        predictions_dev = []
        for _ in range(0, 15):
            predictions_dev.append([])
        ff = open("./datasetForVariantsTBCCD/POJ/devdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            label = l[2]
            if (l[0] in listrec) or (l[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, _ = getData_finetune(
                l, dictt, embeddings)
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                })
            correct_labels_dev.append(int(label))
            threshold = -0.7
            for i in range(0, 15):
                if output[0] >= threshold:
                    predictions_dev[i].append(1)
                else:
                    predictions_dev[i].append(-1)
                threshold += 0.1
        maxstep = 0
        maxf1value = 0
        for i in range(0, 15):
            f1score = f1_score(correct_labels_dev,
                               predictions_dev[i],
                               average='binary')
            if f1score > maxf1value:
                maxf1value = f1score
                maxstep = i
        ff.close()
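        # Test with the threshold chosen on the dev split.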
        correct_labels_test = []
        predictions_test = []
        ff = open("./datasetForVariantsTBCCD/POJ/testdata.txt", 'r')
        line = "123"
        k = 0
        while line:
            line = ff.readline().rstrip('\n')
            l = line.split('\t')
            if len(l) != 3:
                break
            k += 1
            label = l[2]

            if (l[0] in listrec) or (l[1] in listrec):
                continue
            nodes11, children1, nodes22, children2, _ = getData_finetune(
                l, dictt, embeddings)
            output = sess.run(
                [res],
                feed_dict={
                    nodes_node1: nodes11,
                    children_node1: children1,
                    nodes_node2: nodes22,
                    children_node2: children2,
                })
            correct_labels_test.append(int(label))
            threshold = -0.7 + maxstep * 0.1
            if output[0] >= threshold:
                predictions_test.append(1)
            else:
                predictions_test.append(-1)
        ff.close()
        print("start test:")
        print("threshold:", threshold)
        p = precision_score(correct_labels_test,
                            predictions_test,
                            average='binary')
        r = recall_score(correct_labels_test,
                         predictions_test,
                         average='binary')
        f1score = f1_score(correct_labels_test,
                           predictions_test,
                           average='binary')
        print("recall_test:" + str(r))
        print("precision_test:" + str(p))
        print("f1score_test:" + str(f1score))