def get_commit_satisfy_condition(path_data_, nfile, nhunk, nline, nleng):
    """Write the commits that pass the size filter to ``path_data_ + ".satisfy"``.

    The parsed commits are filtered with ``filtering_commit_union``; then the
    raw file is walked commit by commit and the raw lines of every commit
    whose id (as returned by ``commit_id``) is contained in the filter result
    are collected and written out.

    :param path_data_: path of the commit file to read.
    :param nfile: forwarded to the filter as ``num_file``.
    :param nhunk: forwarded to the filter as ``num_hunk``.
    :param nline: forwarded to the filter as ``num_loc``.
    :param nleng: forwarded to the filter as ``size_line``.
    """
    parsed = extract_commit_july(path_file=path_data_)
    # Example limits used before: nfile, nhunk, nline, nleng = 1, 8, 10, 120
    kept = filtering_commit_union(commits=parsed, num_file=nfile, num_hunk=nhunk,
                                  num_loc=nline, size_line=nleng)
    print("%s %s" % (len(parsed), len(kept)))

    raw_lines = load_file(path_data_)
    starts = commits_index(commits=raw_lines)
    last_pos = len(starts) - 1
    selected = []
    for pos, begin in enumerate(starts):
        # A commit spans up to the start of the next one; the last commit
        # spans to the end of the file.
        end = None if pos == last_pos else starts[pos + 1]
        chunk = raw_lines[begin:end]
        cid = commit_id(commit=chunk)
        if cid in kept:
            selected += chunk
        print("%s %s" % (pos, cid))
    # Previously written to "./satisfy_typediff_sorted.out".
    write_file(path_data_ + ".satisfy", selected)
def creating_sasha_data(path_data_, folds, random_state):
    """Split the commits into K folds and write ``id<TAB>label`` lists per fold.

    Each commit is labelled ``"stable"`` when its ``"stable"`` field equals
    ``"true"`` and ``"nonstable"`` otherwise.  For fold ``k`` (counted from 1)
    the train and test rows are written to ``./sasha_data/fold<k>/train.txt``
    and ``./sasha_data/fold<k>/test.txt``.

    :param path_data_: path of the commit file to read.
    :param folds: number of splits given to ``KFold``.
    :param random_state: random state given to ``KFold``.
    """
    parsed = extract_commit_july(path_file=path_data_)
    ids = [commit["id"] for commit in parsed]
    labels = ["stable" if commit["stable"] == "true" else "nonstable" for commit in parsed]
    rows = [cid + "\t" + lab for cid, lab in zip(ids, labels)]

    splitter = KFold(n_splits=folds, random_state=random_state)
    for fold_no, (train_index, test_index) in enumerate(splitter.split(parsed), 1):
        train_id = get_elements(commits=ids, indexes=train_index)
        train_label = get_elements(commits=labels, indexes=train_index)
        test_id = get_elements(commits=ids, indexes=test_index)
        test_label = get_elements(commits=labels, indexes=test_index)
        train_file = get_elements(commits=rows, indexes=train_index)
        test_file = get_elements(commits=rows, indexes=test_index)

        # Sanity output: the sizes of each split.
        print("%s %s" % (len(train_id), len(train_label)))
        print("%s %s" % (len(test_id), len(test_label)))
        print("%s %s" % (len(train_file), len(test_file)))

        fold_dir = "./sasha_data/fold" + str(fold_no) + "/"
        write_file(path_file=fold_dir + "train.txt", data=train_file)
        write_file(path_file=fold_dir + "test.txt", data=test_file)
def load_data_type(path, FLAGS):
    """Load commits from ``path`` and build the padded model inputs.

    :param path: path of the commit file to read.
    :param FLAGS: object providing ``msg_length``, ``code_hunk``,
        ``code_line`` and ``code_length``.
    :return: tuple ``(pad_msg, pad_added_code, pad_removed_code, labels,
        dict_msg, dict_code)``.
    """
    commits = extract_commit_july(path_file=path)
    messages = extract_msg(commits=commits)
    code = extract_code(commits=commits)
    msg_vocab = dictionary(data=messages)
    code_vocab = dictionary(data=code)
    print("%s %s %s" % (len(commits), len(msg_vocab), len(code_vocab)))

    def pad_code(kind):
        # Added and removed code share every setting except the change type.
        return mapping_commit_code(type=kind, commits=commits,
                                   max_hunk=FLAGS.code_hunk,
                                   max_code_line=FLAGS.code_line,
                                   max_code_length=FLAGS.code_length,
                                   dict_code=code_vocab)

    pad_msg = mapping_commit_msg(msgs=messages, max_length=FLAGS.msg_length,
                                 dict_msg=msg_vocab)
    pad_added_code = pad_code("added")
    pad_removed_code = pad_code("removed")
    labels = load_label_commits(commits=commits)
    return pad_msg, pad_added_code, pad_removed_code, labels, msg_vocab, code_vocab
# print_false_negative(id, y_pred, threshold, y_true):
#   Binarises y_pred (1 when float(y) > threshold, otherwise 0), collects the
#   entries of `id` whose prediction is 0 while the true label is 1, prints
#   the size of that collection and passes it to write_file with the path
#   "./sasha_results/false_neg_<threshold>.txt".
# NOTE(review): the parameter `id` shadows the builtin of the same name.
# The __main__ section loads the commit ids, the true labels (as ints) and the
# predictions, then reports true positives and false negatives at threshold 50.
# NOTE(review): the final print_false_negative(...) call is cut off in this
# view of the file; its remaining arguments are not visible here.
def print_false_negative(id, y_pred, threshold, y_true): y_pred = [1 if float(y) > threshold else 0 for y in y_pred] false_negative = [] for i, p, t in zip(id, y_pred, y_true): if p == 0 and t == 1: false_negative.append(i) print len(false_negative) path_write = "./sasha_results/false_neg_%s.txt" % (str(threshold)) write_file(path_file=path_write, data=false_negative) if __name__ == "__main__": path_data = "./newres_funcalls_jul28.out.sorted.satisfy" commits_structure = extract_commit_july(path_file=path_data) commits_id = [c["id"] for c in commits_structure] path_true = "./statistical_test_prob_ver3/true_label.txt" y_true = load_file(path_file=path_true) y_true = [int(y) for y in y_true] path_pred, threshold = "./statistical_test_prob_ver3/sasha_results.txt", 50 y_pred = load_file(path_file=path_pred) print_true_positive(id=commits_id, y_pred=y_pred, threshold=threshold, y_true=y_true) print_false_negative(id=commits_id, y_pred=y_pred,
def checking_performance(id_label, true_label, model_label, model_name):
    """Print precision and recall of ``model_label`` over a threshold sweep.

    For ``i`` in ``range(5, 90)`` the scores in ``model_label`` are turned
    into 0/1 labels (1 when the score is >= the threshold) and one line
    ``threshold precision recall`` is printed.  For ``"patchNet"`` the
    threshold is ``1 - i / 100``; for ``"sasha"`` it is always 0.

    The process is terminated with ``exit()`` once the sweep has finished.

    :param id_label: commit ids; not used by the computation.
    :param true_label: ground-truth labels passed to the metric functions.
    :param model_label: per-commit scores produced by the model.
    :param model_name: ``"patchNet"`` or ``"sasha"``.
    :raises ValueError: if ``model_name`` is not one of the supported names
        (previously this surfaced as an unbound ``threshold`` inside the loop).
    """
    if model_name not in ("patchNet", "sasha"):
        raise ValueError("unknown model_name %r; expected 'patchNet' or 'sasha'"
                         % (model_name,))

    for i in range(5, 90, 1):
        if model_name == "patchNet":
            threshold = 1 - i / float(100)
        else:  # "sasha"
            threshold = 0
        threshold_label = [1 if m >= threshold else 0 for m in model_label]
        prc = precision_score(y_true=true_label, y_pred=threshold_label)
        rc = recall_score(y_true=true_label, y_pred=threshold_label)
        # Single-string form prints the same text on Python 2 and Python 3.
        print("%s %s %s" % (threshold, prc, rc))
    # The script stops here on purpose; nothing after this call runs.
    exit()


if __name__ == "__main__":
    path_data = "./satisfy_typediff_sorted.out"
    commits_ = extract_commit_july(path_file=path_data)
    print("%s %s" % (len(commits_), type(commits_)))
    commits_id = [c["id"] for c in commits_]
    print(len(commits_id))

    path_file = "./statistical_test_prob_ver2/true_label.txt"
    true_label = load_file(path_file=path_file)
    true_label = [float(t) for t in true_label]

    path_file = "./statistical_test_prob_ver2/PatchNet.txt"
    patchNet = load_file(path_file=path_file)
    patchNet = [float(t) for t in patchNet]

    checking_performance(id_label=commits_id, true_label=true_label,
                         model_label=patchNet, model_name="patchNet")
if __name__ == "__main__":
    # The result files are listed explicitly; an earlier variant built the
    # list from os.listdir("./sasha_results/") instead.
    files_path = [
        "./sasha_results/fold1.txt",
        "./sasha_results/fold2.txt",
        "./sasha_results/fold3.txt",
        "./sasha_results/fold4.txt",
        "./sasha_results/fold5.txt",
    ]

    # Ground-truth commits with their ids and "stable" fields.
    root_gt = "./satisfy_typediff_sorted.out"
    commits_ = extract_commit_july(path_file=root_gt)
    commits_id_ = [c["id"] for c in commits_]
    commits_label_ = [c["stable"] for c in commits_]

    # Scores of the deep-learning baseline, one float per line.
    dl_labels = [float(l) for l in load_file("./statistical_test/lstm_cnn_all.txt")]

    fold_index = split_train_test(data=commits_, folds=5, random_state=None)
    accuracy_, precision_, recall_, f1_, auc_ = [], [], [], [], []
    probs, true_pos, false_neg = [], [], []
    threshold = 0.5
    for f in files_path:
        print(f)
        # check_fold yields a 1-based fold number; fold_index is 0-based.
        test_fold = fold_index[check_fold(string=f) - 1]["test"]
def loading_baseline_july(tf, folds, random_state):
    """Load saved Keras baseline models and write their test-fold predictions.

    The commits at ``FLAGS.path`` are split with ``KFold``; the text fed to
    the model is chosen from ``FLAGS.model`` (message only, message plus
    code, or code only).  For every fold a saved model is loaded and asked to
    predict on that fold's test items; all predictions are concatenated and
    written to ``./statistical_test_prob/<FLAGS.model>_checking.txt``.

    :param tf: object exposing ``flags.FLAGS`` with ``path``, ``model``,
        ``msg_length`` and ``batch_size``.
    :param folds: number of splits given to ``KFold``.
    :param random_state: random state given to ``KFold``.
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    kf = KFold(n_splits=folds, random_state=random_state)
    idx_folds = [{"train": train_part, "test": test_part}
                 for train_part, test_part in kf.split(filter_commits)]

    # Decide which text the model reads before extracting anything; the
    # substring checks are ordered msg -> all -> code.
    if "msg" in FLAGS.model:
        text_source = "msg"
    elif "all" in FLAGS.model:
        text_source = "all"
    elif "code" in FLAGS.model:
        text_source = "code"
    else:
        print("You need to type correct model")
        exit()

    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if text_source == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif text_source == "code":
        msgs_ = codes_

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = convert_to_binary(load_label_commits(commits=filter_commits))
    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    supported_models = ("lstm_cnn_all", "lstm_cnn_msg", "lstm_cnn_code",
                        "cnn_all", "cnn_msg", "cnn_code")
    cntfold = 0
    pred_dict = {}
    pred_dict_list = []
    for fold in idx_folds[cntfold:]:
        train_index, test_index = fold["train"], fold["test"]
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train = np.array(get_items(items=labels, indexes=train_index))
        Y_test = np.array(get_items(items=labels, indexes=test_index))

        if FLAGS.model not in supported_models:
            print("You need to give correct model name")
            exit()
        # NOTE(review): cntfold is never incremented in this function, so
        # every fold loads the "_0" model file - confirm this is intended.
        path_model = "./keras_model/test_%s_%s.h5" % (FLAGS.model, str(cntfold))
        model = load_model(path_model)

        y_pred = np.ravel(model.predict(X_test_msg, batch_size=FLAGS.batch_size))
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        pred_dict_list += y_pred.tolist()

    path_file = "./statistical_test_prob/" + FLAGS.model + "_checking.txt"
    write_file(path_file=path_file, data=pred_dict_list)
def running_baseline_july(tf, folds, random_state):
    """Train a baseline Keras model on the first fold and report its metrics.

    The commits at ``FLAGS.path`` are split with ``KFold``; the text fed to
    the model is chosen from ``FLAGS.model`` (message only, message plus
    code, or code only).  A model is trained, saved to
    ``./keras_model/newres_funcalls_<model>_<fold>.h5``, and its raw
    predictions are written as ``index<TAB>score`` lines to
    ``./statistical_test/newres_funcalls_<model>_fold_<fold>.txt``.
    Predictions are then thresholded at 0.5 and accuracy, precision, recall
    and F1 are printed.

    The loop stops after its first iteration, so only one fold is processed.

    :param tf: object exposing ``flags.FLAGS`` with ``path``, ``model``,
        ``msg_length`` and ``batch_size``.
    :param folds: number of splits given to ``KFold``.
    :param random_state: random state given to ``KFold``.
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    kf = KFold(n_splits=folds, random_state=random_state)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx_folds.append({"train": train_index, "test": test_index})

    # The substring checks are ordered msg -> all -> code.
    if "msg" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
    elif "all" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif "code" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        msgs_ = codes_
    else:
        print("You need to type correct model")
        exit()

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)
    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    cntfold = 0
    pred_dict = dict()
    for fold_pos in range(cntfold, len(idx_folds)):
        idx = idx_folds[fold_pos]
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train = np.array(get_items(items=labels, indexes=train_index))
        Y_test = np.array(get_items(items=labels, indexes=test_index))

        if FLAGS.model in ("lstm_cnn_msg", "lstm_cnn_code", "lstm_cnn_all"):
            model = lstm_cnn(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                             y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        elif FLAGS.model in ("cnn_msg", "cnn_code", "cnn_all"):
            model = cnn_model(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                              y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        else:
            print("You need to give correct model name")
            exit()

        model.save("./keras_model/newres_funcalls_" + FLAGS.model + "_" + str(cntfold) + ".h5")

        y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size)
        y_pred = np.ravel(y_pred)
        # Raw scores are written out before they are thresholded below.
        # Distinct names keep the comprehension from overwriting the fold
        # index on Python 2, where comprehension variables leak.
        data_fold = [str(item_idx) + "\t" + str(score)
                     for item_idx, score in zip(test_index, y_pred)]
        path_file = "./statistical_test/newres_funcalls_%s_fold_%s.txt" % (FLAGS.model, str(cntfold))
        write_file(path_file=path_file, data=data_fold)

        # In-place thresholding: scores above 0.5 become 1 first, so the
        # second assignment only touches the remaining scores.
        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))

        # Each metric is computed once and reused for both storing and printing.
        fold_accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
        fold_precision = precision_score(y_true=Y_test, y_pred=y_pred)
        fold_recall = recall_score(y_true=Y_test, y_pred=y_pred)
        fold_f1 = f1_score(y_true=Y_test, y_pred=y_pred)
        # NOTE(review): AUC is computed on the thresholded labels, not on the
        # raw scores - confirm that is what is wanted.
        fold_auc = auc_score(y_true=Y_test, y_pred=y_pred)

        accuracy.append(fold_accuracy)
        precision.append(fold_precision)
        recall.append(fold_recall)
        f1.append(fold_f1)
        auc.append(fold_auc)

        print("accuracy %s" % (fold_accuracy,))
        print("precision %s" % (fold_precision,))
        print("recall %s" % (fold_recall,))
        print("f1 %s" % (fold_f1,))

        cntfold += 1
        # Only the first fold is trained; the remaining folds are skipped.
        break
# sys.path.append(path_working) from init_params import model_parameters from keras_lstm import print_params from ultis import extract_commit_july from keras_lstm import lstm_cnn, cnn_model from baselines import extract_msg, extract_code, add_two_list from data_helpers import dictionary, mapping_commit_msg, load_label_commits from data_helpers import convert_to_binary tf_ = model_parameters() FLAGS_ = tf_.flags.FLAGS print_params(tf_) FLAGS = FLAGS_ commits_ = extract_commit_july(path_file=FLAGS.path) filter_commits = commits_ if "msg" in FLAGS.model: msgs_, codes_ = extract_msg(commits=filter_commits), extract_code( commits=filter_commits) elif "all" in FLAGS.model: msgs_, codes_ = extract_msg(commits=filter_commits), extract_code( commits=filter_commits) all_lines = add_two_list(list1=msgs_, list2=codes_) msgs_ = all_lines elif "code" in FLAGS.model: msgs_, codes_ = extract_msg(commits=filter_commits), extract_code( commits=filter_commits) msgs_ = codes_ else: