Exemple #1
0
def get_commit_satisfy_condition(path_data_, nfile, nhunk, nline, nleng):
    """Copy the commits that pass the size filter into ``<path>.satisfy``.

    The structured commits parsed from ``path_data_`` are filtered with
    ``filtering_commit_union``. The raw file content is then cut into
    per-commit slices at the positions returned by ``commits_index``; every
    slice whose ``commit_id`` is contained in the filter result is appended,
    in file order, to the output written to ``path_data_ + ".satisfy"``.

    :param path_data_: path of the commit file to read; also the prefix of
        the output path.
    :param nfile: forwarded to ``filtering_commit_union`` as ``num_file``.
    :param nhunk: forwarded to ``filtering_commit_union`` as ``num_hunk``.
    :param nline: forwarded to ``filtering_commit_union`` as ``num_loc``.
    :param nleng: forwarded to ``filtering_commit_union`` as ``size_line``.
    """
    parsed = extract_commit_july(path_file=path_data_)
    # Previously tried limits: nfile, nhunk, nline, nleng = 1, 8, 10, 120
    kept = filtering_commit_union(commits=parsed,
                                  num_file=nfile,
                                  num_hunk=nhunk,
                                  num_loc=nline,
                                  size_line=nleng)
    print("%s %s" % (len(parsed), len(kept)))

    raw_lines = load_file(path_data_)
    starts = commits_index(commits=raw_lines)
    last_pos = len(starts) - 1
    selected = list()
    for pos, begin in enumerate(starts):
        # A commit runs from its own start to the start of the next one;
        # the final commit runs to the end of the file (open-ended slice).
        end = None if pos == last_pos else starts[pos + 1]
        chunk = raw_lines[begin:end]
        key = commit_id(commit=chunk)
        if key in kept:
            selected.extend(chunk)
        print("%s %s" % (pos, key))
    write_file(path_data_ + ".satisfy", selected)
Exemple #2
0
def creating_sasha_data(path_data_, folds, random_state):
    """Write tab-separated ``id<TAB>label`` listings for every K-fold split.

    Each commit parsed from ``path_data_`` is labelled ``"stable"`` when its
    ``"stable"`` field equals ``"true"`` and ``"nonstable"`` otherwise. For
    each split produced by ``KFold`` the train and test listings are written
    to ``./sasha_data/fold<k>/train.txt`` and ``.../test.txt``, with ``k``
    counting from 1.

    :param path_data_: path of the commit file to read.
    :param folds: passed to ``KFold`` as ``n_splits``.
    :param random_state: passed to ``KFold`` as ``random_state``.
    """
    parsed = extract_commit_july(path_file=path_data_)
    commits_id = list()
    commits_label = list()
    commits_id_label = list()
    for entry in parsed:
        ident = entry["id"]
        tag = "stable" if entry["stable"] == "true" else "nonstable"
        commits_id.append(ident)
        commits_label.append(tag)
        commits_id_label.append(ident + "\t" + tag)

    # NOTE(review): assuming this is scikit-learn's KFold, shuffle is left at
    # its default here, so random_state may have no effect - confirm.
    splitter = KFold(n_splits=folds, random_state=random_state)
    for fold_no, (train_index, test_index) in enumerate(splitter.split(parsed), 1):
        train_id = get_elements(commits=commits_id, indexes=train_index)
        train_label = get_elements(commits=commits_label, indexes=train_index)
        test_id = get_elements(commits=commits_id, indexes=test_index)
        test_label = get_elements(commits=commits_label, indexes=test_index)
        train_file = get_elements(commits=commits_id_label, indexes=train_index)
        test_file = get_elements(commits=commits_id_label, indexes=test_index)

        # Size report only; the id/label lists are not used beyond this.
        print("%s %s" % (len(train_id), len(train_label)))
        print("%s %s" % (len(test_id), len(test_label)))
        print("%s %s" % (len(train_file), len(test_file)))

        fold_dir = "./sasha_data/fold" + str(fold_no) + "/"
        write_file(path_file=fold_dir + "train.txt", data=train_file)
        write_file(path_file=fold_dir + "test.txt", data=test_file)
Exemple #3
0
def load_data_type(path, FLAGS):
    """Load commits from ``path`` and build the padded model inputs.

    :param path: path of the commit file handed to ``extract_commit_july``.
    :param FLAGS: settings object; ``msg_length``, ``code_hunk``,
        ``code_line`` and ``code_length`` are read from it.
    :return: tuple ``(pad_msg, pad_added_code, pad_removed_code, labels,
        dict_msg_, dict_code_)`` - the mapped messages, the mapped "added"
        and "removed" code, the commit labels, and the two dictionaries built
        from the messages and the code.
    """
    commits_ = extract_commit_july(path_file=path)
    msgs_ = extract_msg(commits=commits_)
    codes_ = extract_code(commits=commits_)
    dict_msg_ = dictionary(data=msgs_)
    dict_code_ = dictionary(data=codes_)
    print("%s %s %s" % (len(commits_), len(dict_msg_), len(dict_code_)))

    pad_msg = mapping_commit_msg(msgs=msgs_,
                                 max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)

    # The "added" and "removed" mappings differ only in their ``type``.
    code_options = dict(commits=commits_,
                        max_hunk=FLAGS.code_hunk,
                        max_code_line=FLAGS.code_line,
                        max_code_length=FLAGS.code_length,
                        dict_code=dict_code_)
    pad_added_code = mapping_commit_code(type="added", **code_options)
    pad_removed_code = mapping_commit_code(type="removed", **code_options)

    labels = load_label_commits(commits=commits_)
    return pad_msg, pad_added_code, pad_removed_code, labels, dict_msg_, dict_code_

def print_false_negative(id, y_pred, threshold, y_true):
    """Print the number of false negatives and write their ids to a file.

    A prediction counts as positive when ``float(value) > threshold``. Ids
    whose prediction is negative while the true label equals ``1`` are
    written to ``./sasha_results/false_neg_<threshold>.txt``.

    :param id: sequence of ids, aligned with ``y_pred`` and ``y_true``.
    :param y_pred: raw prediction values (anything ``float()`` accepts).
    :param threshold: cut-off compared against each prediction.
    :param y_true: true labels; compared against ``1``.
    """
    binarized = [int(float(score) > threshold) for score in y_pred]
    missed = [sample
              for sample, guess, truth in zip(id, binarized, y_true)
              if guess == 0 and truth == 1]
    print(len(missed))
    path_write = "./sasha_results/false_neg_%s.txt" % (str(threshold))
    write_file(path_file=path_write, data=missed)


if __name__ == "__main__":
    # Commit ids, in the order the commits are parsed from the data file.
    path_data = "./newres_funcalls_jul28.out.sorted.satisfy"
    commits_structure = extract_commit_july(path_file=path_data)
    commits_id = [c["id"] for c in commits_structure]

    # Ground-truth labels, converted to int.
    path_true = "./statistical_test_prob_ver3/true_label.txt"
    y_true = load_file(path_file=path_true)
    y_true = [int(y) for y in y_true]

    # Raw prediction values and the cut-off handed to the report helpers.
    path_pred, threshold = "./statistical_test_prob_ver3/sasha_results.txt", 50
    y_pred = load_file(path_file=path_pred)

    # NOTE(review): print_true_positive is not defined in the visible part of
    # this file - presumably the counterpart of print_false_negative; confirm.
    print_true_positive(id=commits_id,
                        y_pred=y_pred,
                        threshold=threshold,
                        y_true=y_true)
    # NOTE(review): the call below is cut off in this copy of the file (its
    # remaining arguments and the closing parenthesis are missing).
    print_false_negative(id=commits_id,
                         y_pred=y_pred,
def checking_performance(id_label, true_label, model_label, model_name):
    """Print precision and recall of a model over a sweep of thresholds.

    For every ``i`` in ``range(5, 90)`` the scores in ``model_label`` are
    binarised (``1`` when ``score >= threshold``, else ``0``) and one line
    ``threshold precision recall`` is printed. For ``"patchNet"`` the
    threshold is ``1 - i / 100``; for ``"sasha"`` it is always ``0``.

    After the sweep the function calls ``exit()``, so it does not return to
    its caller.

    :param id_label: not used by the function; kept so existing callers that
        pass it keep working.
    :param true_label: ground-truth labels handed to the metric functions.
    :param model_label: model scores compared against the threshold.
    :param model_name: ``"patchNet"`` or ``"sasha"``.
    :raises ValueError: if ``model_name`` is neither of the two supported
        names (previously this surfaced as an unbound-variable error on the
        first iteration).
    """
    if model_name not in ("patchNet", "sasha"):
        raise ValueError("unsupported model_name: %r" % (model_name,))

    for i in range(5, 90, 1):
        if model_name == "patchNet":
            threshold = 1 - i / float(100)
        else:
            # "sasha": the same fixed cut-off on every iteration.
            threshold = 0
        threshold_label = [1 if m >= threshold else 0 for m in model_label]
        prc = precision_score(y_true=true_label, y_pred=threshold_label)
        rc = recall_score(y_true=true_label, y_pred=threshold_label)
        print("%s %s %s" % (threshold, prc, rc))
    exit()


if __name__ == "__main__":
    # Sweep thresholds over the stored PatchNet scores and print the
    # precision/recall obtained against the stored true labels.
    path_data = "./satisfy_typediff_sorted.out"
    commits_ = extract_commit_july(path_file=path_data)
    print("%s %s" % (len(commits_), type(commits_)))
    commits_id = [c["id"] for c in commits_]
    print(len(commits_id))

    # Both files are read through load_file and converted entry by entry.
    path_file = "./statistical_test_prob_ver2/true_label.txt"
    true_label = [float(t) for t in load_file(path_file=path_file)]
    path_file = "./statistical_test_prob_ver2/PatchNet.txt"
    patchNet = [float(t) for t in load_file(path_file=path_file)]

    checking_performance(
        id_label=commits_id,
        true_label=true_label,
        model_label=patchNet,
        model_name="patchNet",
    )
Exemple #6
0

if __name__ == "__main__":
    # Result files of the five folds, listed explicitly in fold order.
    files_path = [
        "./sasha_results/fold1.txt",
        "./sasha_results/fold2.txt",
        "./sasha_results/fold3.txt",
        "./sasha_results/fold4.txt",
        "./sasha_results/fold5.txt",
    ]

    # Ids and "stable" fields of the ground-truth commits.
    root_gt = "./satisfy_typediff_sorted.out"
    commits_ = extract_commit_july(path_file=root_gt)
    commits_id_ = [c["id"] for c in commits_]
    commits_label_ = [c["stable"] for c in commits_]

    dl_labels = [float(l) for l in load_file("./statistical_test/lstm_cnn_all.txt")]

    fold_index = split_train_test(data=commits_, folds=5, random_state=None)

    accuracy_, precision_, recall_, f1_, auc_ = [], [], [], [], []
    probs, true_pos, false_neg = [], [], []
    threshold = 0.5
    for f in files_path:
        print(f)
        # check_fold presumably returns the 1-based fold number encoded in
        # the file name (hence the "- 1") - confirm against its definition.
        test_fold = fold_index[check_fold(string=f) - 1]["test"]
Exemple #7
0
def loading_baseline_july(tf, folds, random_state):
    """Run saved baseline models over the K-fold test splits and dump scores.

    The commits at ``FLAGS.path`` are split with ``KFold``. Depending on
    which of ``"msg"``, ``"all"`` or ``"code"`` occurs in ``FLAGS.model``
    (checked in that order) the model input is the commit messages, the
    result of ``add_two_list`` on messages and code, or the code. For each
    fold a model is loaded from ``./keras_model/test_<model>_<n>.h5``, the
    fold's test rows are predicted, and the flattened predictions of all
    folds are written to ``./statistical_test_prob/<model>_checking.txt``.

    Prints a message and calls ``exit()`` when ``FLAGS.model`` is not
    recognised.

    :param tf: object exposing ``flags.FLAGS``; ``path``, ``model``,
        ``msg_length`` and ``batch_size`` are read from the flags.
    :param folds: passed to ``KFold`` as ``n_splits``.
    :param random_state: passed to ``KFold`` as ``random_state``.
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    splitter = KFold(n_splits=folds, random_state=random_state)
    idx_folds = list()
    for train_index, test_index in splitter.split(filter_commits):
        idx_folds.append({"train": train_index, "test": test_index})

    # Decide the text source first so nothing is extracted before the
    # unknown-model exit.
    if "msg" in FLAGS.model:
        text_source = "msg"
    elif "all" in FLAGS.model:
        text_source = "all"
    elif "code" in FLAGS.model:
        text_source = "code"
    else:
        print("You need to type correct model")
        exit()

    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if text_source == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif text_source == "code":
        msgs_ = codes_

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_,
                                 max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)
    labels = convert_to_binary(load_label_commits(commits=filter_commits))

    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    known_models = ("lstm_cnn_all", "lstm_cnn_msg", "lstm_cnn_code",
                    "cnn_all", "cnn_msg", "cnn_code")
    # NOTE(review): cntfold is never incremented, so every fold loads the
    # "_0" model file - confirm this is intended.
    cntfold = 0
    pred_dict = dict()
    pred_dict_list = list()
    for idx in idx_folds:
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train = np.array(get_items(items=labels, indexes=train_index))
        Y_test = np.array(get_items(items=labels, indexes=test_index))

        if FLAGS.model not in known_models:
            print("You need to give correct model name")
            exit()
        path_model = "./keras_model/test_%s_%s.h5" % (FLAGS.model,
                                                      str(cntfold))
        model = load_model(path_model)

        y_pred = np.ravel(
            model.predict(X_test_msg, batch_size=FLAGS.batch_size))
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        pred_dict_list += y_pred.tolist()

    path_file = "./statistical_test_prob/" + FLAGS.model + "_checking.txt"
    write_file(path_file=path_file, data=pred_dict_list)
Exemple #8
0
def running_baseline_july(tf, folds, random_state):
    """Fit a baseline model on the first fold, save it and print its metrics.

    The commits at ``FLAGS.path`` are split with ``KFold``. Depending on
    which of ``"msg"``, ``"all"`` or ``"code"`` occurs in ``FLAGS.model``
    (checked in that order) the model input is the commit messages, the
    result of ``add_two_list`` on messages and code, or the code. The model
    returned by ``lstm_cnn`` or ``cnn_model`` is saved under
    ``./keras_model/newres_funcalls_<model>_<n>.h5``; the raw test scores
    are written as ``index<TAB>score`` lines to
    ``./statistical_test/newres_funcalls_<model>_fold_<n>.txt``; then the
    scores are thresholded at 0.5 and accuracy, precision, recall and f1 are
    printed.

    The fold loop ends with ``break``, so only the first fold is processed.
    Prints a message and calls ``exit()`` when ``FLAGS.model`` is not
    recognised. Returns nothing.

    :param tf: object exposing ``flags.FLAGS``; ``path``, ``model``,
        ``msg_length`` and ``batch_size`` are read here, and the flags are
        also handed to the model builder.
    :param folds: passed to ``KFold`` as ``n_splits``.
    :param random_state: passed to ``KFold`` as ``random_state``.
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    kf = KFold(n_splits=folds, random_state=random_state)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx_folds.append({"train": train_index, "test": test_index})

    # Decide the text source first so nothing is extracted before the
    # unknown-model exit.
    if "msg" in FLAGS.model:
        text_source = "msg"
    elif "all" in FLAGS.model:
        text_source = "all"
    elif "code" in FLAGS.model:
        text_source = "code"
    else:
        print("You need to type correct model")
        exit()

    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if text_source == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif text_source == "code":
        msgs_ = codes_

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = convert_to_binary(load_label_commits(commits=filter_commits))
    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    cntfold = 0
    pred_dict = dict()
    for idx in idx_folds:
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train = np.array(get_items(items=labels, indexes=train_index))
        Y_test = np.array(get_items(items=labels, indexes=test_index))

        if FLAGS.model in ("lstm_cnn_msg", "lstm_cnn_code", "lstm_cnn_all"):
            build_model = lstm_cnn
        elif FLAGS.model in ("cnn_msg", "cnn_code", "cnn_all"):
            build_model = cnn_model
        else:
            print("You need to give correct model name")
            exit()
        model = build_model(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                            y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)

        model.save("./keras_model/newres_funcalls_" + FLAGS.model + "_" + str(cntfold) + ".h5")

        y_pred = np.ravel(model.predict(X_test_msg, batch_size=FLAGS.batch_size))

        # Dump the raw scores before they are thresholded in place below.
        # Distinct names are used because a Python 2 list comprehension
        # rebinds its loop variables in the enclosing function scope.
        data_fold = [str(row) + "\t" + str(score) for row, score in zip(test_index, y_pred)]
        path_file = "./statistical_test/newres_funcalls_%s_fold_%s.txt" % (FLAGS.model, str(cntfold))
        write_file(path_file=path_file, data=data_fold)

        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0

        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))

        # Each metric is evaluated once and reused for both the running list
        # and the printed report.
        fold_accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
        fold_precision = precision_score(y_true=Y_test, y_pred=y_pred)
        fold_recall = recall_score(y_true=Y_test, y_pred=y_pred)
        fold_f1 = f1_score(y_true=Y_test, y_pred=y_pred)
        accuracy.append(fold_accuracy)
        precision.append(fold_precision)
        recall.append(fold_recall)
        f1.append(fold_f1)
        # NOTE(review): auc_score receives the thresholded 0/1 predictions,
        # not the raw scores - confirm this is what is wanted.
        auc.append(auc_score(y_true=Y_test, y_pred=y_pred))
        print("accuracy %s" % (fold_accuracy,))
        print("precision %s" % (fold_precision,))
        print("recall %s" % (fold_recall,))
        print("f1 %s" % (fold_f1,))

        cntfold += 1
        # Only the first fold is run.
        break
# sys.path.append(path_working)

from init_params import model_parameters
from keras_lstm import print_params
from ultis import extract_commit_july
from keras_lstm import lstm_cnn, cnn_model
from baselines import extract_msg, extract_code, add_two_list
from data_helpers import dictionary, mapping_commit_msg, load_label_commits
from data_helpers import convert_to_binary

tf_ = model_parameters()
FLAGS_ = tf_.flags.FLAGS
print_params(tf_)
FLAGS = FLAGS_

commits_ = extract_commit_july(path_file=FLAGS.path)
filter_commits = commits_

if "msg" in FLAGS.model:
    msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
        commits=filter_commits)
elif "all" in FLAGS.model:
    msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
        commits=filter_commits)
    all_lines = add_two_list(list1=msgs_, list2=codes_)
    msgs_ = all_lines
elif "code" in FLAGS.model:
    msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
        commits=filter_commits)
    msgs_ = codes_
else: