Example #1
def qualitative_looking(path_correctId, path_label):
    id_root = load_file(path_file=path_correctId)
    print len(id_root)

    data_label = load_file(path_file=path_label)
    id_label = [d.split("\t")[0] for d in data_label]
    print len(data_label)

    index_id_root = [id_label.index(i) for i in id_root]

    for id_ in id_root:
        path_id = "./qualitative_analysis/cosine_sim/" + id_ + ".txt"
        cosine_data = load_file(path_file=path_id)
        print len(cosine_data)
        cosine_data = map(float, cosine_data)
        # Scores sorted descending, so a score's index here is its rank - 1.
        order_cosine = sorted(cosine_data, reverse=True)
        write_data = dict()
        for jid in index_id_root:
            name_id = id_label[jid]
            cosine_score = cosine_data[jid]
            position_ = order_cosine.index(cosine_score)
            # print name_id + "\t" + str(cosine_score) + "\t" + str(position_ + 1)
            # write_data.append(name_id + "\t" + str(cosine_score) + "\t" + str(position_ + 1))
            write_data[name_id] = position_ + 1

        new_write_data = list()
        for w in sorted(write_data, key=write_data.get):
            print w, write_data[w]
            new_write_data.append(w + "\t" + str(write_data[w]))

        path_write = "./qualitative_analysis/cosine_sim_order/" + id_ + ".txt"
        write_file(path_file=path_write, data=new_write_data)
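
None of these examples define load_file or write_file themselves. Judging from the call sites (a path_file keyword argument, lists of lines going in and out), they are presumably thin I/O wrappers along the following lines; this is a minimal sketch, not the project's actual implementation:

def load_file(path_file):
    # Read a text file and return its lines, trailing newlines stripped.
    with open(path_file, "r") as f:
        return [line.strip() for line in f]


def write_file(path_file, data):
    # Write each item on its own line.
    with open(path_file, "w") as f:
        for item in data:
            f.write(str(item) + "\n")
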
Example #2
def loading_testing_data():
    data, paths = list(), list()
    paths.append("./data/test_data/features_markusinfo.txt")
    # paths.append("./data/test_data/features_nicholaskinfo.txt")
    # paths.append("./data/test_data/features_sashainfo.txt")
    for p in paths:
        data_ = load_file(path_file=p)
        data += data_
    ids_, X_ = load_data_ICSE_new(data=data)
    print len(ids_), X_.shape

    data_gt, path_gt = list(), list()
    # path_gt.append("./data/test_data/markus_translated.out")
    # path_gt.append("./data/test_data/nicholask_translated.out")
    path_gt.append("./data/test_data/sasha_translated.out")
    print path_gt

    for p in path_gt:
        p_data = load_file(path_file=p)
        data_gt += p_data
    commits = extract_commit_new(commits=data_gt)
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    commits_ = get_commits(commits=filtering_commit(commits=commits,
                                                    num_file=nfile,
                                                    num_hunk=nhunk,
                                                    num_loc=nline,
                                                    size_line=nleng), ids=ids_)
    ids_index = [ids_.index(c["id"]) for c in commits_]
    ids_ = [ids_[i] for i in ids_index]
    X_ = X_[ids_index, :]
    y_ = [1 if c["stable"] == "true" else 0 for c in commits_]
    return commits_, ids_, X_, np.array(y_)
Example #3
def collect_labels(path_data, path_label):
    valid_ids = get_ids(
        [f for f in listdir(path_data) if isfile(join(path_data, f))])
    data_label = load_file(path_file=path_label)
    ids = [l.split('\t')[0] for l in data_label]
    labels = [l.split('\t')[1] for l in data_label]
    # Keep only ids present in the label file so both returned lists
    # stay aligned.
    valid_ids = [v_id for v_id in valid_ids if v_id in ids]
    labels_valid_ids = [labels[ids.index(v_id)] for v_id in valid_ids]
    return valid_ids, labels_valid_ids
Example #4
def finding_id(path_label, path_root):
    data_label = load_file(path_file=path_label)
    id_label = [d.split("\t")[0] for d in data_label]
    gt_label = [1 if d.split("\t")[1] == "true" else 0 for d in data_label]

    data_pred = load_file(path_file=path_root)
    label_pred = [float(d) for d in data_pred]

    id_correct = list()
    for i in xrange(len(id_label)):
        if gt_label[i] == label_pred[i] and gt_label[i] == 0:
            id_correct.append(id_label[i])
    return id_correct
Example #5
def load_probability_score(model, threshold):
    path_file = "./statistical_test_prob_ver3/%s.txt" % model
    if model == "sasha_results":
        y_pred = load_sasha_results_ver2(path_file=path_file,
                                         threshold=threshold)
    elif model == "true_label":
        y_pred = load_file(path_file=path_file)
        y_pred = np.array([float(y) for y in y_pred])
    else:
        y_pred = load_file(path_file=path_file)
        y_pred = np.array([float(y) for y in y_pred])
        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0
    return y_pred
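
The else branch binarizes scores in place with two boolean masks; the order matters, since values already set to 1 stay above 0.5 and are left alone by the second mask. np.where expresses the same cut in one step; a small equivalent sketch:

import numpy as np

scores = np.array([0.93, 0.40, 0.51, 0.07])
print(np.where(scores > 0.5, 1, 0))  # [1 0 1 0]
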
Example #6
def get_all_checkpoints(checkpoint_dir):
    files = load_file(checkpoint_dir + "/checkpoint")
    # The first line names the latest checkpoint; the remaining lines
    # list every saved checkpoint, one per line.
    files = files[1:]
    dirs = []
    for f in files:
        dirs.append(get_checkpoint_directory(directory=f))
    return dirs
Example #7
def get_commit_satisfy_condition(path_data_, nfile, nhunk, nline, nleng):
    commits_structure = extract_commit_july(path_file=path_data_)
    # nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit_union(commits=commits_structure,
                                            num_file=nfile,
                                            num_hunk=nhunk,
                                            num_loc=nline,
                                            size_line=nleng)
    print len(commits_structure), len(filter_commits)

    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    new_commits = list()
    for i in xrange(0, len(indexes)):
        if i == len(indexes) - 1:
            id_ = commit_id(commit=commits[indexes[i]:])
            if id_ in filter_commits:
                new_commits += commits[indexes[i]:]
        else:
            id_ = commit_id(commit=commits[indexes[i]:indexes[i + 1]])
            if id_ in filter_commits:
                new_commits += commits[indexes[i]:indexes[i + 1]]
        print i, id_
    # write_file("./satisfy_typediff_sorted.out", new_commits)
    write_file(path_data_ + ".satisfy", new_commits)
Example #8
def load_sasha_results(path_file, threshold):
    y_pred = load_file(path_file=path_file)
    y_pred = [float(y) for y in y_pred]
    max_value = sorted(y_pred,
                       reverse=True)[int(len(y_pred) * (threshold - 0.05))]
    y_pred = [1 if y > max_value else 0 for y in y_pred]
    return np.array(y_pred)
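
The cutoff in load_sasha_results is rank-based rather than score-based: threshold - 0.05 gives the fraction of predictions to flag, and max_value is the score sitting at that rank. A toy run with made-up scores:

scores = [0.9, 0.8, 0.7, 0.6, 0.5]  # made-up scores
threshold = 0.45                    # hypothetical: flag the top 40%
cut = sorted(scores, reverse=True)[int(len(scores) * (threshold - 0.05))]
# int(5 * 0.40) = 2, so cut = 0.7; only scores strictly above it get a 1.
print([1 if s > cut else 0 for s in scores])  # [1, 1, 0, 0, 0]
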
Example #9
def draw_prc_recall_curve(y_true, path_file, point):
    data = load_file(path_file=path_file)
    data = np.array([float(y) for y in data])
    prc, rc, threshold = metrics.precision_recall_curve(y_true, data)
    new_prc, new_rc = list(), list()
    step = max(1, int(len(prc) / point))  # avoid a zero step when the curve has fewer points than requested
    for i in xrange(0, len(prc), step):
        new_prc.append(prc[i])
        new_rc.append(rc[i])
    return new_prc[:point], new_rc[:point]
Example #10
def load_data_ICSE(path):
    data = load_file(path_file=path)
    ids, ftrs, labels = list(), list(), list()
    for d in data:
        split_ = d.split(",")
        # Line format: <id>,<feature_1>,...,<feature_k>,<true|false>
        id_, ftr_ = split_[0], map(int, split_[1:-1])
        label_ = split_[-1]
        ids.append(id_)
        ftrs.append(np.array(ftr_))
        labels.append(label_)
    labels = [1 if v.strip() == "true" else 0 for v in labels]
    return ids, np.array(ftrs), np.array(labels)
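
Each feature line is expected to look like <id>,<f1>,...,<fk>,<true|false>. A toy line (hypothetical values) parsed the same way:

line = "commit_abc,0,1,3,true"  # hypothetical feature line
split_ = line.split(",")
id_, ftr_, label_ = split_[0], list(map(int, split_[1:-1])), split_[-1]
print(id_, ftr_, label_.strip() == "true")  # commit_abc [0, 1, 3] True
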
Example #11
def balance_data_ICSE(path):
    data = load_file(path_file=path)
    new_data, cnt = list(), 0
    for d in data:
        # Keep every "false" row but cap the "true" rows so the two
        # classes end up roughly balanced.
        if "true" in d and cnt <= 11165:
            new_data.append(d.strip())
            cnt += 1
        elif "false" in d:
            new_data.append(d.strip())
    shuffle(new_data)
    write_file(path_file="./data/3_mar7/new_features_ver1.txt", data=new_data)
    exit()
Example #12
def evaluation_metrics(path, labels):
    pred_score = load_file(path_file=path)
    pred_score = np.array([float(score) for score in pred_score])
    labels = labels[:pred_score.shape[0]]

    y_pred = convert_to_binary(pred_score)
    acc = accuracy_score(y_true=labels, y_pred=y_pred)
    prc = precision_score(y_true=labels, y_pred=y_pred)
    rc = recall_score(y_true=labels, y_pred=y_pred)
    f1 = f1_score(y_true=labels, y_pred=y_pred)
    auc = roc_auc_score(y_true=labels, y_score=pred_score)

    print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' %
          (acc, prc, rc, f1, auc))
Example #13
def load_commit_code(path_file):
    code = load_file(path_file=path_file)
    indexes = diff_file_index(code=code)
    diffs = list()
    for i in range(0, len(indexes)):
        # One entry per changed file: its name and its diff lines.
        entry = {}
        if i == len(indexes) - 1:
            file_name, diff = diff_code(code[indexes[i]:])
        else:
            file_name, diff = diff_code(code[indexes[i]:indexes[i + 1]])
        entry['file'] = file_name
        entry['diff'] = diff
        diffs.append(entry)
    return diffs
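
Each element of the returned list describes one changed file (its name plus its diff lines), so callers can iterate over it directly; a usage sketch with a hypothetical path:

for entry in load_commit_code(path_file="./commit.diff"):  # hypothetical path
    print(entry['file'], len(entry['diff']))
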
Example #14
def cosine_similarity(path_root, id_commit, data):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    data_root = load_file(path_file=path_root)
    for id_root in data_root:
        results = list()
        index_ = id_commit.index(id_root)
        X_root = X[index_, :].toarray().flatten()
        for i in xrange(len(id_commit)):
            results.append(
                1 -
                spatial.distance.cosine(X_root, X[i, :].toarray().flatten()))
        write_file(path_file="./qualitative_analysis/cosine_sim/" + id_root +
                   ".txt",
                   data=results)
    return None
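
Densifying one row per pair inside the loop is slow for large matrices; sklearn.metrics.pairwise.cosine_similarity computes the whole row against the sparse matrix in a single call. A sketch of an equivalent inner computation, reusing the X and index_ names from above:

from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

# One (1, n) row of similarities between the root commit and every
# commit, computed directly on the sparse matrix.
results = sk_cosine(X[index_], X).flatten().tolist()

Note the two disagree on all-zero rows: spatial.distance.cosine yields nan there, while scikit-learn returns 0.
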
Example #15
def load_df_yasu_data(path_data, path_file):
    data = pd.read_csv(path_data)
    data = replace_value_dataframe(df=data)
    ids, labels, features = get_ids(data=data), get_label(data=data), get_features(data=data)
    indexes = list()
    cnt_noexits = 0
    # Keep only the rows whose <id>.diff file exists on disk.
    for i in range(0, len(ids)):
        try:
            load_file(path_file=path_file + '/' + ids[i] + '.diff')
            indexes.append(i)
        except FileNotFoundError:
            print('No diff file for commit id', ids[i], cnt_noexits)
            cnt_noexits += 1
    ids = [ids[i] for i in indexes]
    labels = [labels[i] for i in indexes]
    features = features[indexes]
    return (ids, np.array(labels), features)
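
Opening every diff just to probe for its existence works, but os.path.exists states the intent directly; a sketch of the same filtering under the same <id>.diff layout:

import os

indexes = [i for i in range(len(ids))
           if os.path.exists(os.path.join(path_file, ids[i] + '.diff'))]
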
Example #16
def load_results(id_gt, label_gt, single_file, threshold):
    lines = load_file(single_file)
    patches = dict()
    for l in lines:
        # patch = dict()
        split_l = l.split()
        # patch["id"], patch["score"] = split_l[0], float(split_l[1])
        # patches.append(patch)
        patches[split_l[0]] = float(split_l[1])

    patches = sorted(patches.items(), key=lambda x: x[1], reverse=True)
    # acc, prc, rc, f1, auc = checking_performance(id_gt=id_gt, label_gt=label_gt, patches=patches)
    # acc, prc, rc, f1, auc, prob, true_positive, false_negative = checking_performance_v2(id_gt=id_gt, label_gt=label_gt,
    #                                                                                      patches=patches)
    acc, prc, rc, f1, auc, prob, true_positive, false_negative = checking_performance_v3(
        id_gt=id_gt, label_gt=label_gt, patches=patches, threshold=threshold)
    return acc, prc, rc, f1, auc, prob, true_positive, false_negative
Example #17
def restruct_root(roots, path, type):
    min_files = min([len(r) for r in roots])
    new_roots = [r[:min_files] for r in roots]
    print len(new_roots), len(new_roots[0])

    for i in xrange(0, len(new_roots[0])):
        model = list()
        for j in xrange(0, len(new_roots)):
            print path + "/" + new_roots[j][i]
            model += load_file(path + "/" + new_roots[j][i])
        # All roots presumably share the same "-<n>.txt" suffix at
        # position i, so take the model name from the last one.
        model_name = "model-" + new_roots[-1][i].split("-")[-1].replace(
            ".txt", "")
        print type, model_name
        # exit()
        path_write = "./patchNet_mergeResults/%s_%s.txt" % (type, model_name)
        write_file(path_file=path_write, data=model)
    return None
Example #18
def get_commit_id_and_date(path_data_):
    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    dicts = {}
    for i in xrange(0, len(indexes)):
        if i == len(indexes) - 1:
            date = commit_date_july(commit=commits[indexes[i]:])
        else:
            date = commit_date_july(commit=commits[indexes[i]:indexes[i + 1]])
        dicts[i] = int(date)
    sort_dicts = sorted(dicts.items(), key=operator.itemgetter(1))
    new_commits = list()
    # Rebuild the raw commit text in chronological (ascending date) order.
    for d in sort_dicts:
        index, date = d[0], d[1]
        print index, date
        if index == len(sort_dicts) - 1:
            new_commits += commits[indexes[index]:]
        else:
            new_commits += commits[indexes[index]:indexes[index + 1]]
    # write_file("./typediff_sorted.out", new_commits)
    write_file(path_data_ + ".sorted", new_commits)
Example #19
def collect_labels_ver2(path_label):
    data_label = load_file(path_file=path_label)
    ids = [l.split('\t')[0] for l in data_label]
    labels = [l.split('\t')[1] for l in data_label]
    return ids, labels
Example #20
def checking_performance(id_label, true_label, model_label, model_name):
    # Sweep the decision threshold from 0.95 down to 0.11 and report
    # precision/recall at each step.
    for i in range(5, 90, 1):
        if model_name == "patchNet":
            threshold = 1 - i / float(100)
        elif model_name == "sasha":
            threshold = 0
        threshold_label = [1 if m >= threshold else 0 for m in model_label]
        prc = precision_score(y_true=true_label, y_pred=threshold_label)
        rc = recall_score(y_true=true_label, y_pred=threshold_label)
        print threshold, prc, rc
    exit()


if __name__ == "__main__":
    path_data = "./satisfy_typediff_sorted.out"
    commits_ = extract_commit_july(path_file=path_data)
    print len(commits_), type(commits_)
    commits_id = [c["id"] for c in commits_]
    print len(commits_id)

    path_file = "./statistical_test_prob_ver2/true_label.txt"
    true_label = load_file(path_file=path_file)
    true_label = [float(t) for t in true_label]
    path_file = "./statistical_test_prob_ver2/PatchNet.txt"
    patchNet = load_file(path_file=path_file)
    patchNet = [float(t) for t in patchNet]
    checking_performance(id_label=commits_id,
                         true_label=true_label,
                         model_label=patchNet,
                         model_name="patchNet")
Example #21
def load_commit_msg(path_file):
    msg = ' '.join(load_file(path_file=path_file))
    new_msg = dict()
    new_msg['title'], new_msg['desc'] = re.compile('<title>(.*?)</title>', re.DOTALL).findall(msg)[0], \
                                        re.compile('<message>(.*?)</message>', re.DOTALL).findall(msg)[0]
    return new_msg
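
The message file is assumed to wrap the subject in <title> tags and the body in <message> tags; a toy string pushed through the same regexes:

import re

msg = "<title>Fix null deref</title> <message>Guard the pointer.</message>"
title = re.compile('<title>(.*?)</title>', re.DOTALL).findall(msg)[0]
desc = re.compile('<message>(.*?)</message>', re.DOTALL).findall(msg)[0]
print(title, '|', desc)  # Fix null deref | Guard the pointer.
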
Example #22

if __name__ == "__main__":
    path_data = "./data/test_data/merging_markus_sasha.txt"
    commits_ = extract_commit(path_file=path_data)
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit(commits=commits_,
                                      num_file=nfile,
                                      num_hunk=nhunk,
                                      num_loc=nline,
                                      size_line=nleng)
    ids_ = [c["id"] for c in filter_commits]
    labels_ = [1 if c["stable"] == "true" else 0 for c in filter_commits]

    path_ftr = "./data/test_data/features_merging_markus_sasha.txt"
    ftr = load_file(path_file=path_ftr)
    new_ftr = clean_merging_data(ids=ids_, ftrs=ftr)

    commits_test, ids_test, X_ftr_test, y_test = loading_testing_data(
        ftr_data=new_ftr, commit_data=filter_commits)
    commits_train, ids_train, X_ftr_train, y_train = loading_training_data()

    # type = "msg"
    # type = "code"
    type = "msg_code"
    print type
    # if type == "msg":
    #     X_msg_train, X_msg_test = create_features_ICSE_new(commits_train=commits_train, ids_train=ids_train,
    #                                                        commits_test=commits_test, ids_test=ids_test, type=type)
    #     X_train = X_msg_train
    #     X_test = X_msg_test
Example #23
    print len(filter_commits), type(filter_commits)
    commits_id = [c["id"] for c in commits_]
    print len(commits_id)
    # load_model_labels(id=commits_id)

    msgs = extract_msg(commits=filter_commits)
    labels = extract_label(commits=filter_commits)
    codes = extract_code(commits=filter_commits)
    all_lines = add_two_list(list1=msgs, list2=codes)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_lines)
    print X.shape
    # exit()

    path_good_commits = "./statistical_test_prob_ver3/good_commits.txt"
    good_commits = load_file(path_file=path_good_commits)
    print "Leng of good commits: %s" % (str(len(good_commits)))

    write_data = []
    for g in good_commits:
        write_data += similarity_good_commit(id=commits_id,
                                             root=g,
                                             all=X,
                                             top_k=50)
        # break
    path_write = "./statistical_test_prob_ver2/good_commits_results.txt"
    write_file(path_file=path_write, data=write_data)
    # exit()
    ####################################################################################
    ####################################################################################
    path_bad_commits = "./statistical_test_prob_ver2/bad_commits.txt"
Example #24
from ultis import load_file, extract_commit_new
from baselines import extract_msg, extract_code, add_two_list, extract_label
import numpy as np

if __name__ == "__main__":
    path_test = list()
    path_test.append("./data/test_data/markus_translated.out")
    path_test.append("./data/test_data/nicholask_translated.out")
    path_test.append("./data/test_data/sasha_translated.out")

    path_dict = "./data/3_mar7/newres.simplified.dict"
    dict_index = load_file(path_file=path_dict)
    new_dict = {}
    for d in dict_index:
        split_d = d.strip().split(":")
        new_dict[int(split_d[0])] = split_d[1]

    data = list()
    for p in path_test:
        p_data = load_file(path_file=p)
        data += p_data
    commits_ = extract_commit_new(commits=data)
    msgs = extract_msg(commits=commits_)
    codes = extract_code(commits=commits_)
    all_lines = add_two_list(list1=msgs, list2=codes)
    labels = extract_label(commits=commits_)

    # pos_label = len([1 for l in labels if l == 1])
    # neg_label = len([0 for l in labels if l == 0])
    print len(labels), np.count_nonzero(np.array(labels))


def draw_roc_curve(y_true, path_file):
    data = load_file(path_file=path_file)
    data = np.array([float(y) for y in data])
    fpr, tpr, threshold = metrics.roc_curve(y_true, data)
    roc_auc = metrics.auc(fpr, tpr)
    return fpr, tpr, roc_auc
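
The returned points feed straight into the usual matplotlib ROC plot; a self-contained sketch with toy labels and scores:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

y_toy = np.array([0, 0, 1, 1])            # toy ground truth
scores = np.array([0.1, 0.4, 0.35, 0.8])  # toy prediction scores
fpr, tpr, _ = metrics.roc_curve(y_toy, scores)
plt.plot(fpr, tpr, label='AUC = %0.3f' % metrics.auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()
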

def draw_prc_recall_curve(y_true, path_file, point):
    data = load_file(path_file=path_file)
    data = np.array([float(y) for y in data])
    prc, rc, threshold = metrics.precision_recall_curve(y_true, data)
    new_prc, new_rc = list(), list()
    step = max(1, int(len(prc) / point))  # avoid a zero step when the curve has fewer points than requested
    for i in xrange(0, len(prc), step):
        new_prc.append(prc[i])
        new_rc.append(rc[i])
    return new_prc[:point], new_rc[:point]


if __name__ == "__main__":
    path_true = "./statistical_test_prob/true_label.txt"
    y_true = load_file(path_file=path_true)
    y_true = np.array([int(y) for y in y_true])

    path_sasha = "./statistical_test_prob_ver3/sasha_results.txt"
    fpr_sasha, tpr_sasha, roc_auc_sasha = draw_roc_curve(
        y_true=y_true, path_file=path_sasha)

    path_PatchNet = "./statistical_test_prob_ver3/PatchNet.txt"
    fpr_PatchNet, tpr_PatchNet, roc_auc_PatchNet = draw_roc_curve(
        y_true=y_true, path_file=path_PatchNet)

    # path_lstm = "./statistical_test_prob/lstm_cnn_all.txt"
    # fpr_lstm, tpr_lstm, roc_auc_lstm = draw_roc_curve(path_file=path_lstm)
    #
    # path_cnn = "./statistical_test_prob/cnn_all.txt"
    # fpr_cnn, tpr_cnn, roc_auc_cnn = draw_roc_curve(path_file=path_cnn)
    #
def print_false_negative(id, y_pred, threshold, y_true):
    y_pred = [1 if float(y) > threshold else 0 for y in y_pred]
    false_negative = []
    # Ids labeled 1 in the ground truth but predicted 0 by the model.
    for i, p, t in zip(id, y_pred, y_true):
        if p == 0 and t == 1:
            false_negative.append(i)
    print len(false_negative)
    path_write = "./sasha_results/false_neg_%s.txt" % (str(threshold))
    write_file(path_file=path_write, data=false_negative)


if __name__ == "__main__":
    path_data = "./newres_funcalls_jul28.out.sorted.satisfy"
    commits_structure = extract_commit_july(path_file=path_data)
    commits_id = [c["id"] for c in commits_structure]

    path_true = "./statistical_test_prob_ver3/true_label.txt"
    y_true = load_file(path_file=path_true)
    y_true = [int(y) for y in y_true]

    path_pred, threshold = "./statistical_test_prob_ver3/sasha_results.txt", 50
    y_pred = load_file(path_file=path_pred)

    print_true_positive(id=commits_id,
                        y_pred=y_pred,
                        threshold=threshold,
                        y_true=y_true)
    print_false_negative(id=commits_id,
                         y_pred=y_pred,
                         threshold=threshold,
                         y_true=y_true)
Example #28
    new_dict = {}
    for i in index_:
        new_dict[i] = dictionary[i]
    new_list = list()
    for key, value in sorted(new_dict.iteritems()):
        new_list.append(str(key) + ": " + value)
    return new_list


if __name__ == "__main__":
    path_data = "./data/3_mar7/typediff.out"
    commits_ = extract_commit(path_file=path_data)
    msgs = extract_msg(commits=commits_)
    codes = extract_code(commits=commits_)
    all_lines = add_two_list(list1=msgs, list2=codes)
    print len(all_lines), len(commits_), len(msgs), len(codes)
    index = create_dict(all_lines)
    print len(index)

    path_dict = "./data/3_mar7/newres.dict"
    dict_index = load_file(path_file=path_dict)
    new_dict = {}
    for d in dict_index:
        split_d = d.strip().split(":")
        new_dict[int(split_d[0])] = split_d[1]
    print len(new_dict)
    new_dict = mapping_dict(index_=index, dictionary=new_dict)
    path_write = "./data/3_mar7/newres.simplified.dict"
    write_file(path_file=path_write, data=new_dict)

Example #29
    dict_msg = tokenize_commit_msg(data=data)
    labels_ = np.array([1 if w in d.split() else 0 for d in data for w in dict_msg])
    labels_ = np.reshape(labels_, (int(labels_.shape[0] / len(dict_msg)), len(dict_msg)))
    return labels_, dict_msg
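
The reshape turns the flat occurrence list into an (n_messages, vocab_size) binary matrix: one row per message, one column per dictionary word. A toy equivalent:

import numpy as np

dict_msg = ['fix', 'bug', 'leak']          # toy vocabulary
data = ['fix the bug', 'fix memory leak']  # toy messages
flat = np.array([1 if w in d.split() else 0 for d in data for w in dict_msg])
print(flat.reshape(len(data), len(dict_msg)))
# [[1 1 0]
#  [1 0 1]]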


if __name__ == '__main__':
    # create padding for commit code
    ##################################################################################
    ##################################################################################
    # path_train_diff = './data/2017_ASE_Jiang/train.26208.diff'
    # data_train_diff = load_Jiang_code_data(pfile=path_train_diff)
    # path_test_diff = './data/2017_ASE_Jiang/test.3000.diff'
    # data_test_diff = load_Jiang_code_data(pfile=path_test_diff)
    # data_diff = data_train_diff + data_test_diff
    # print(len(data_diff))
    # max_line, max_length = 15, 40
    # padding_commit_code(data=data_diff, max_line=max_line, max_length=max_length)

    # create label using the commit message
    ##################################################################################
    ##################################################################################
    path_train_msg = './data/2017_ASE_Jiang/train.26208.msg'
    data_train_msg = load_file(path_file=path_train_msg)
    path_test_msg = './data/2017_ASE_Jiang/test.3000.msg'
    data_test_msg = load_file(path_file=path_test_msg)
    print(len(data_train_msg + data_test_msg))
    data = data_train_msg + data_test_msg
    label, dict_msg = commit_msg_label(data=data)
    print(label.shape, len(dict_msg))
def finding_element(data, indexes):
    new_data = [data[i] for i in indexes]
    return new_data


def evaluation_metrics(root, target):
    print "Accuracy: %f" % (accuracy_score(y_true=root, y_pred=target))
    print "Precision: %f" % (precision_score(y_true=root, y_pred=target))
    print "Recall: %f" % (recall_score(y_true=root, y_pred=target))
    print "F1: %f" % (f1_score(y_true=root, y_pred=target))
    print "AUC: %f" % (auc_score(y_true=root, y_pred=target))


if __name__ == "__main__":
    path_gt = "./data/3_mar7/typediff_test_ver2.out"
    data_gt = load_file(path_gt)
    id_gt, lbl_gt = processing_gt(data=data_gt)

    path_bf = "./data/typediff_bug_and_fix.txt"
    data_bf = load_file(path_bf)
    id_bf, lbl_bf = processing_bug_fix(data=data_bf)
    indexes_ = finding_index(ids_root=id_gt, ids_target=id_bf)
    print len(indexes_)

    id_bf, lbl_bf = finding_element(
        data=id_bf, indexes=indexes_), finding_element(data=lbl_bf,
                                                       indexes=indexes_)
    evaluation_metrics(root=lbl_gt, target=lbl_bf)

    # path_write = "./data/typediff_bug_and_fix_ver2.txt"
    # write_file(path_file=path_write, data=lbl_bf)