Example 1
def kfold_results(y_true, y_pred, nfolds):
    # shuffle=True is required for random_state to have any effect in KFold.
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=0)
    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    for train_index, test_index in kf.split(y_true):
        y_true_test_index, y_pred_test_index = get_items(items=y_true, indexes=test_index), \
                                               get_items(items=y_pred, indexes=test_index)
        accuracy.append(
            accuracy_score(y_true=y_true_test_index, y_pred=y_pred_test_index))
        precision.append(
            precision_score(y_true=y_true_test_index,
                            y_pred=y_pred_test_index))
        recall.append(
            recall_score(y_true=y_true_test_index, y_pred=y_pred_test_index))
        f1.append(f1_score(y_true=y_true_test_index, y_pred=y_pred_test_index))
        auc.append(
            auc_score(y_true=y_true_test_index, y_pred=y_pred_test_index))

    algorithm = ""  # the caller's algorithm name is not passed in, so this label stays empty
    print "Accuracy and std of %s: %f %f" % (
        algorithm, np.mean(np.array(accuracy)), np.std(np.array(accuracy)))
    print "Precision and std of %s: %f %f" % (
        algorithm, np.mean(np.array(precision)), np.std(np.array(precision)))
    print "Recall and std of %s: %f %f" % (
        algorithm, np.mean(np.array(recall)), np.std(np.array(recall)))
    print "F1 and std of %s: %f %f" % (
        algorithm, np.mean(np.array(f1)), np.std(np.array(f1)))
    print "AUC and std of %s: %f %f" % (
        algorithm, np.mean(np.array(auc)), np.std(np.array(auc)))
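This example (and the ones that follow) leans on two helpers that are not part of the listing. A minimal sketch consistent with how they are called, assuming get_items selects elements by position and auc_score is a thin wrapper around scikit-learn's roc_auc_score; neither is the repository's actual implementation:

from sklearn.metrics import roc_auc_score

def get_items(items, indexes):
    # Return the elements of `items` at the given positions (hypothetical helper).
    return [items[i] for i in indexes]

def auc_score(y_true, y_pred):
    # Assumed to delegate to scikit-learn's ROC-AUC on the given scores.
    return roc_auc_score(y_true=y_true, y_score=y_pred)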
Example 2
def get_predict(name, X, y, algorithm, folds):
    # shuffle=True is required for random_state to have any effect in KFold.
    kf = KFold(n_splits=folds, shuffle=True, random_state=0)
    kf.get_n_splits(X=X)
    auc, accuracy, precision, recall, f1 = list(), list(), list(), list(), list()
    pred_dict = dict()
    for train_index, test_index in kf.split(X):
        X_train, y_train = get_items(items=X, indexes=train_index), get_items(items=y, indexes=train_index)
        X_test, y_test = get_items(items=X, indexes=test_index), get_items(items=y, indexes=test_index)

        vectorizer = CountVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        if algorithm == "svm":
            clf = LinearSVC(random_state=0)
        elif algorithm == "lr":
            clf = LogisticRegression()
        elif algorithm == "dt":
            clf = DecisionTreeClassifier()
        else:
            print "Unknown algorithm name -- please try again"
            exit()

        clf.fit(X=X_train, y=y_train)
        y_pred = clf.predict(X_test)
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        # path_file = "./statistical_test/" + name + "_" + algorithm + ".txt"
        # path_file = "./statistical_test/3_mar7/" + name + "_" + algorithm + ".txt"
        # path_file = "./statistical_test_ver2/3_mar7" + name + "_" + algorithm + ".txt"
        # write_file(path_file, y_pred)
        accuracy.append(accuracy_score(y_true=y_test, y_pred=y_pred))
        precision.append(precision_score(y_true=y_test, y_pred=y_pred))
        recall.append(recall_score(y_true=y_test, y_pred=y_pred))
        f1.append(f1_score(y_true=y_test, y_pred=y_pred))
        auc.append(auc_score(y_true=y_test, y_pred=y_pred))

    path_file = "./statistical_test_ver2/3_mar7/" + name + "_" + algorithm + ".txt"
    write_file(path_file=path_file, data=sorted_dict(dict=pred_dict))
    print "Accuracy and std of %s: %f %f" % (algorithm, np.mean(np.array(accuracy)), np.std(np.array(accuracy)))
    print "Precision and std of %s: %f %f" % (algorithm, np.mean(np.array(precision)), np.std(np.array(precision)))
    print "Recall and std of %s: %f %f" % (algorithm, np.mean(np.array(recall)), np.std(np.array(recall)))
    print "F1 and std of %s: %f %f" % (algorithm, np.mean(np.array(f1)), np.std(np.array(f1)))
    print "AUC and std of %s: %f %f" % (algorithm, np.mean(np.array(auc)), np.std(np.array(auc)))
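Example 2 additionally calls make_dictionary and sorted_dict, also not shown. Judging from the call sites, they map each test index back to its prediction and then emit the predictions in original sample order; a hedged sketch:

def make_dictionary(y_pred, y_index):
    # Map each original sample index to its prediction (hypothetical helper).
    return dict(zip(y_index, y_pred))

def sorted_dict(dict):
    # The parameter is named `dict` only to match the keyword argument used
    # at the call site above; it shadows the builtin of the same name.
    return [dict[k] for k in sorted(dict.keys())]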
Example 3
def loading_data_lstm(FLAGS):
    print FLAGS.model
    if "msg" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
        pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
        pad_added_code = mapping_commit_code(type="added", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                             max_code_line=FLAGS.code_line,
                                             max_code_length=FLAGS.code_length, dict_code=dict_code_)
        pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                               max_code_line=FLAGS.code_line,
                                               max_code_length=FLAGS.code_length, dict_code=dict_code_)
        labels = load_label_commits(commits=filter_commits)
    elif "all" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines
        dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
        pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
        pad_added_code = mapping_commit_code(type="added", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                             max_code_line=FLAGS.code_line,
                                             max_code_length=FLAGS.code_length, dict_code=dict_code_)
        pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                               max_code_line=FLAGS.code_line,
                                               max_code_length=FLAGS.code_length, dict_code=dict_code_)
        labels = load_label_commits(commits=filter_commits)
    elif "code" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        msgs_ = codes_
        dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
        pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
        pad_added_code = mapping_commit_code(type="added", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                             max_code_line=FLAGS.code_line,
                                             max_code_length=FLAGS.code_length, dict_code=dict_code_)
        pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                               max_code_line=FLAGS.code_line,
                                               max_code_length=FLAGS.code_length, dict_code=dict_code_)
        labels = load_label_commits(commits=filter_commits)
    else:
        print "You need to provide a correct model name"
        exit()

    # shuffle=True is required for random_state (FLAGS.seed) to take effect.
    kf = KFold(n_splits=FLAGS.folds, shuffle=True, random_state=FLAGS.seed)
    # NOTE: the return statement below fires on the first iteration, so only
    # the first fold's test split is ever produced (the train split is unused).
    for train_index, test_index in kf.split(filter_commits):
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        X_train_added_code, X_test_added_code = np.array(get_items(items=pad_added_code, indexes=train_index)), \
                                                np.array(get_items(items=pad_added_code, indexes=test_index))
        X_train_removed_code, X_test_removed_code = np.array(get_items(items=pad_removed_code, indexes=train_index)), \
                                                    np.array(get_items(items=pad_removed_code, indexes=test_index))
        y_train, y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))
        return X_test_msg, X_test_added_code, X_test_removed_code, y_test
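The dictionary helper used above to build the message and code vocabularies is not included in the listing either. A plausible sketch, assuming it assigns every whitespace-separated token an integer id:

def dictionary(data):
    # Build a token -> integer-id vocabulary from a list of text lines
    # (an assumed reconstruction, not the original implementation).
    vocab = {}
    for line in data:
        for token in line.split():
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab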
Example 4
def eval_PatchNet_fold(tf, checkpoint_dir, fold_num, fold_index, pad_msg,
                       pad_added_code, pad_removed_code, labels):
    FLAGS = tf.flags.FLAGS
    allow_soft_placement = True  # "Allow device soft device placement"
    log_device_placement = False  # "Log placement of ops on devices"

    train_index, test_index = fold_index["train"], fold_index["test"]
    X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                              np.array(get_items(items=pad_msg, indexes=test_index))
    X_train_added_code, X_test_added_code = np.array(get_items(items=pad_added_code, indexes=train_index)), \
                                            np.array(get_items(items=pad_added_code, indexes=test_index))
    X_train_removed_code, X_test_removed_code = np.array(get_items(items=pad_removed_code, indexes=train_index)), \
                                                np.array(get_items(items=pad_removed_code, indexes=test_index))
    y_train, y_test = np.array(get_items(items=labels, indexes=train_index)), \
                      np.array(get_items(items=labels, indexes=test_index))
    print X_train_msg.shape, X_test_msg.shape
    print X_train_added_code.shape, X_test_added_code.shape
    print X_train_removed_code.shape, X_test_removed_code.shape
    print y_train.shape, y_test.shape

    dirs = get_all_checkpoints(checkpoint_dir=checkpoint_dir)
    for checkpoint_file in dirs:
        # Use a fresh graph per checkpoint so repeated import_meta_graph
        # calls do not pile duplicate ops into a single graph.
        graph = tf.Graph()
        # checkpoint_file = checkpoint_file.replace("thonghoang", "jameshoang")
        # checkpoint_file = checkpoint_file.replace("runs", "runs_all_512_[1, 2]_32")
        # print checkpoint_file
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=allow_soft_placement,
                log_device_placement=log_device_placement)
            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_msg = graph.get_operation_by_name("input_msg").outputs[0]
                input_addedcode = graph.get_operation_by_name(
                    "input_addedcode").outputs[0]
                input_removedcode = graph.get_operation_by_name(
                    "input_removedcode").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]

                # Tensors we want to evaluate
                predictions = graph.get_operation_by_name(
                    "output/predictions").outputs[0]
                scores = graph.get_operation_by_name(
                    "output/scores").outputs[0]

                # Generate batches for one epoch
                batches = mini_batches(X_msg=X_test_msg,
                                       X_added_code=X_test_added_code,
                                       X_removed_code=X_test_removed_code,
                                       Y=y_test,
                                       mini_batch_size=FLAGS.batch_size)

                # Collect the predictions here
                all_predictions, all_scores = [], []

                for batch in batches:
                    batch_input_msg, batch_input_added_code, batch_input_removed_code, batch_input_labels = batch
                    batch_predictions = sess.run(
                        predictions, {
                            input_msg: batch_input_msg,
                            input_addedcode: batch_input_added_code,
                            input_removedcode: batch_input_removed_code,
                            dropout_keep_prob: 1.0
                        })
                    # print batch_predictions.shape
                    all_predictions = np.concatenate(
                        [all_predictions, batch_predictions])

                    batch_scores = sess.run(
                        scores, {
                            input_msg: batch_input_msg,
                            input_addedcode: batch_input_added_code,
                            input_removedcode: batch_input_removed_code,
                            dropout_keep_prob: 1.0
                        })
                    batch_scores = np.ravel(softmax(batch_scores)[:, [1]])
                    # print batch_scores.shape
                    all_scores = np.concatenate([all_scores, batch_scores])
        split_checkpoint_file = checkpoint_file.split("/")
        path_write = "./patchNet_results/%s_%s.txt" % (
            split_checkpoint_file[-3], split_checkpoint_file[-1])
        write_file(path_file=path_write, data=all_scores)

        print checkpoint_file, "Accuracy:", accuracy_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "Precision:", precision_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "Recall:", recall_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "F1:", f1_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "AUC:", auc_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print "\n"
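Example 4 depends on softmax and convert_to_binary. Given that the scores tensor has one column per class and the labels appear to be one-hot with two columns, plausible sketches (assumptions, not the original code) are:

import numpy as np

def softmax(x):
    # Row-wise softmax; subtracting the row max keeps the exponentials stable.
    e = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e / np.sum(e, axis=1, keepdims=True)

def convert_to_binary(labels):
    # Assuming (n, 2) one-hot labels, return a flat 0/1 vector.
    return np.argmax(labels, axis=1)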
Example 5
    commits_label_ = [c["stable"] for c in commits_]

    dl_labels = load_file("./statistical_test/lstm_cnn_all.txt")
    dl_labels = [float(l) for l in dl_labels]

    fold_index = split_train_test(data=commits_, folds=5, random_state=None)

    accuracy_, precision_, recall_, f1_, auc_ = list(), list(), list(), list(), list()
    probs, true_pos, false_neg = list(), list(), list()
    threshold = 0.5
    for f in files_path:
        print f
        test_fold = fold_index[check_fold(string=f) - 1]["test"]
        test_commit_id = get_items(items=commits_id_, indexes=test_fold)
        test_commit_label = get_items(items=commits_label_, indexes=test_fold)
        test_dl_labels = get_items(items=dl_labels, indexes=test_fold)
        print len(test_commit_id), len(test_commit_label), len(test_dl_labels)

        # acc, prc, rc, f1, auc, prob, tp, fn = load_results(id_gt=test_commit_id, label_gt=test_commit_label,
        #                                                    single_file=f)
        acc, prc, rc, f1, auc, prob, tp, fn = load_results(
            id_gt=test_commit_id,
            label_gt=test_commit_label,
            single_file=f,
            threshold=threshold)
        accuracy_.append(acc)
        precision_.append(prc)
        recall_.append(rc)
    # path_pred = "./statistical_test_prob/lstm_cnn_all_fold_0.txt"
    # path_pred = "./statistical_test_prob/lstm_cnn_all_check_fold_0.txt"
    # path_pred = "./statistical_test_prob/lstm_cnn_all_checking.txt"
    # path_pred = "./statistical_test/lstm_cnn_all_ver2.txt"
    # path_pred = "./statistical_test_prob_ver3/PatchNet.txt"
    # path_pred = "./statistical_test_prob_ver3/LPU-SVM.txt"
    path_pred = "./statistical_test_prob_ver3/LS-CNN.txt"
    # path_pred, threshold = "./statistical_test_prob_ver3/sasha_results.txt", 50
    y_pred = load_file(path_file=path_pred)
    if "sasha" in path_pred:
        y_pred = np.array([1 if float(y) > threshold else 0 for y in y_pred])
    else:
        y_pred = np.array([1 if float(y) > 0.5 else 0 for y in y_pred])
        # y_pred = np.array([float(y) for y in y_pred])
        # y_pred[y_pred > 0.5] = 1
        # y_pred[y_pred <= 0.5] = 0

    split_data = split_train_test(data=y_true, folds=folds, random_state=None)

    for i in xrange(len(split_data)):
        train_index, test_index = split_data[i]["train"], split_data[i]["test"]
        y_true_ = get_items(items=y_true, indexes=test_index)
        y_pred_ = get_items(items=y_pred, indexes=test_index)
        acc = accuracy_score(y_true=y_true_, y_pred=y_pred_)
        prc = precision_score(y_true=y_true_, y_pred=y_pred_)
        rc = recall_score(y_true=y_true_, y_pred=y_pred_)
        f1 = f1_score(y_true=y_true_, y_pred=y_pred_)
        auc = auc_score(y_true=y_true_, y_pred=y_pred_)
        print acc, prc, rc, f1, auc
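split_train_test, called twice in this snippet, evidently returns one {"train": ..., "test": ...} dict per fold. A sketch matching that access pattern, assuming it wraps scikit-learn's KFold:

from sklearn.model_selection import KFold

def split_train_test(data, folds, random_state):
    # One {"train": ..., "test": ...} dict per fold (hypothetical helper);
    # shuffle only when a seed is supplied, since KFold requires shuffle=True
    # for random_state to take effect.
    kf = KFold(n_splits=folds, shuffle=random_state is not None,
               random_state=random_state)
    return [{"train": tr, "test": te} for tr, te in kf.split(data)]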
Example 7
def training_model(tf, timestamp, fold_num, fold_index, pad_msg,
                   pad_added_code, pad_removed_code, labels, dict_msg,
                   dict_code):
    FLAGS = tf.flags.FLAGS
    train_index, test_index = fold_index["train"], fold_index["test"]
    X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                              np.array(get_items(items=pad_msg, indexes=test_index))
    X_train_added_code, X_test_added_code = np.array(get_items(items=pad_added_code, indexes=train_index)), \
                                            np.array(get_items(items=pad_added_code, indexes=test_index))
    X_train_removed_code, X_test_removed_code = np.array(get_items(items=pad_removed_code, indexes=train_index)), \
                                                np.array(get_items(items=pad_removed_code, indexes=test_index))
    y_train, y_test = np.array(get_items(items=labels, indexes=train_index)), \
                      np.array(get_items(items=labels, indexes=test_index))
    # y_train, y_test = convert_to_binary(labels), convert_to_binary(labels)
    # y_train, y_test = y_train.reshape((len(labels), 1)), y_test.reshape((len(labels), 1))

    print X_train_msg.shape, X_test_msg.shape
    print X_train_added_code.shape, X_test_added_code.shape
    print X_train_removed_code.shape, X_test_removed_code.shape
    print y_train.shape, y_test.shape

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = PatchNet(max_msg_length=FLAGS.msg_length,
                             max_code_length=FLAGS.code_length,
                             max_code_line=FLAGS.code_line,
                             max_code_hunk=FLAGS.code_hunk,
                             vocab_size_text=len(dict_msg),
                             vocab_size_code=len(dict_code),
                             embedding_size_text=FLAGS.embedding_dim_text,
                             filter_sizes=list(
                                 map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             num_classes=y_train.shape[1],
                             hidden_units=FLAGS.hidden_units)
            model.build_graph(model=FLAGS.model)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            test_step = 0
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(model.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", "fold_" + str(fold_num) + "_" + timestamp))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs",
                             timestamp + "_" + "fold_" + str(fold_num)))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", model.loss)
            acc_summary = tf.summary.scalar("accuracy", model.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev Summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(input_msg, input_added_code, input_removed_code,
                           input_labels):
                """
                A training step
                """
                feed_dict = {
                    model.input_msg: input_msg,
                    model.input_addedcode: input_added_code,
                    model.input_removedcode: input_removed_code,
                    model.input_y: input_labels,
                    model.dropout_keep_prob: FLAGS.dropout_keep_prob
                }

                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, model.loss,
                    model.accuracy
                ], feed_dict)

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(input_msg, input_added_code, input_removed_code,
                         input_labels, step, len_train_batch):
                """
                A testing step
                """
                mini_batches = random_mini_batch(
                    X_msg=input_msg,
                    X_added_code=input_added_code,
                    X_removed_code=input_removed_code,
                    Y=input_labels,
                    mini_batch_size=FLAGS.batch_size)
                slope = len_train_batch / float(len(mini_batches))
                accs, losses = list(), list()
                for batch in mini_batches:
                    test_input_msg, test_input_added_code, test_input_removed_code, test_input_labels = batch
                    feed_dict = {
                        model.input_msg: test_input_msg,
                        model.input_addedcode: test_input_added_code,
                        model.input_removedcode: test_input_removed_code,
                        model.input_y: test_input_labels,
                        model.dropout_keep_prob: 1.0
                    }

                    summaries, loss, accuracy = sess.run(
                        [dev_summary_op, model.loss, model.accuracy],
                        feed_dict)
                    accs.append(accuracy)
                    losses.append(loss)
                    # step == 0 is equivalent to the original test
                    # step * FLAGS.folds == 0, since FLAGS.folds is positive.
                    if step == 0:
                        dev_summary_writer.add_summary(summaries, 1)
                        # print "step {}".format(1)
                    else:
                        dev_summary_writer.add_summary(summaries,
                                                       step * slope + 1)
                        # print "step {}".format(step * slope)
                    step += 1

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step,
                    sum(losses) / float(len(losses)),
                    sum(accs) / float(len(accs))))
                return step

        for i in xrange(0, FLAGS.num_epochs):
            # Generate batches
            mini_batches = random_mini_batch(
                X_msg=X_train_msg,
                X_added_code=X_train_added_code,
                X_removed_code=X_train_removed_code,
                Y=y_train,
                mini_batch_size=FLAGS.batch_size)
            saving_step = int(len(mini_batches) / 3)
            for j in xrange(len(mini_batches)):
                batch = mini_batches[j]
                input_msg, input_added_code, input_removed_code, input_labels = batch
                train_step(input_msg, input_added_code, input_removed_code,
                           input_labels)
                current_step = tf.train.global_step(sess, global_step)
                if j == (len(mini_batches) - 1):
                    print "\nEpoch: %i" % i
                    print("\nEvaluation:")
                    test_step = dev_step(
                        input_msg=X_test_msg,
                        input_added_code=X_test_added_code,
                        input_removed_code=X_test_removed_code,
                        input_labels=y_test,
                        step=test_step,
                        len_train_batch=len(mini_batches))
                    print("")
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print "Saved model checkpoint to {}\n".format(path)
                elif (j + 1) % saving_step == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print "Saved model checkpoint to {}\n".format(path)
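The training loop above calls random_mini_batch (Example 4 uses its deterministic sibling mini_batches). A sketch of the shuffled variant, assuming the four arrays are parallel along their first axis:

import numpy as np

def random_mini_batch(X_msg, X_added_code, X_removed_code, Y, mini_batch_size):
    # Shuffle the samples once, then slice them into consecutive mini-batches
    # (an assumed implementation of the helper used above).
    perm = np.random.permutation(Y.shape[0])
    X_msg, X_added_code = X_msg[perm], X_added_code[perm]
    X_removed_code, Y = X_removed_code[perm], Y[perm]
    batches = []
    for start in range(0, Y.shape[0], mini_batch_size):
        end = start + mini_batch_size
        batches.append((X_msg[start:end], X_added_code[start:end],
                        X_removed_code[start:end], Y[start:end]))
    return batches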
Example 8
def loading_baseline_july(tf, folds, random_state):
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print len(commits_)

    # random_state only takes effect when shuffle=True, so shuffle whenever
    # a seed is supplied.
    kf = KFold(n_splits=folds, shuffle=random_state is not None,
               random_state=random_state)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx = dict()
        idx["train"], idx["test"] = train_index, test_index
        idx_folds.append(idx)

    if "msg" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
    elif "all" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines
    elif "code" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
        msgs_ = codes_
    else:
        print "You need to provide a correct model name"
        exit()
    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_,
                                 max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)

    # path_file = "./statistical_test_prob/true_label.txt"
    # write_file(path_file=path_file, data=labels)
    # exit()

    print pad_msg.shape, labels.shape, len(dict_msg_)
    cntfold = 0
    pred_dict = dict()
    pred_dict_list = list()
    for i in xrange(cntfold, len(idx_folds)):
        idx = idx_folds[i]
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train, Y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))
        if FLAGS.model in ("lstm_cnn_all", "lstm_cnn_msg", "lstm_cnn_code",
                           "cnn_all", "cnn_msg", "cnn_code"):
            # path_model = "./keras_model/%s_%s.h5" % (FLAGS.model, str(cntfold))
            # NOTE: cntfold stays 0 on every pass through this loop; if one
            # model was saved per fold, str(i) is probably intended here.
            path_model = "./keras_model/test_%s_%s.h5" % (FLAGS.model,
                                                          str(cntfold))
            # path_model = "./keras_model/%s_%s_testing.h5" % (FLAGS.model, str(cntfold))
            model = load_model(path_model)
        else:
            print "You need to give the correct model name"
            exit()
        y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size)
        y_pred = np.ravel(y_pred)

        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))

        y_pred = y_pred.tolist()
        pred_dict_list += y_pred
    # print len(pred_dict_list)
    # exit()
    # path_file = "./statistical_test_prob/" + FLAGS.model + ".txt"
    # write_file(path_file=path_file, data=sorted_dict(dict=pred_dict))
    path_file = "./statistical_test_prob/" + FLAGS.model + "_checking.txt"
    write_file(path_file=path_file, data=pred_dict_list)
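write_file shows up throughout these examples with a path and a list of values. A minimal sketch, assuming one value per line and that missing parent directories should be created:

import os

def write_file(path_file, data):
    # Write one item per line (hypothetical reconstruction of the helper).
    dir_ = os.path.dirname(path_file)
    if dir_ and not os.path.exists(dir_):
        os.makedirs(dir_)
    with open(path_file, "w") as f:
        for item in data:
            f.write(str(item) + "\n")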
Example 9
    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_,
                                 max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)
    print pad_msg.shape, labels.shape, len(dict_msg_)
    folds = 10
    # shuffle=True is required for random_state (FLAGS.seed) to take effect.
    kf = KFold(n_splits=folds, shuffle=True, random_state=FLAGS.seed)
    cntfold = 0
    timestamp = str(int(time.time()))
    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    pred_dict = dict()
    for train_index, test_index in kf.split(filter_commits):
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train, Y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))

        if FLAGS.model == "lstm_msg" or FLAGS.model == "lstm_code" or FLAGS.model == "lstm_all":
            model = lstm_model(x_train=X_train_msg,
                               y_train=Y_train,
                               x_test=X_test_msg,
                               y_test=Y_test,
                               dictionary_size=len(dict_msg_),
                               FLAGS=FLAGS)
        elif FLAGS.model == "bi_lstm_msg" or FLAGS.model == "bi_lstm_code" or FLAGS.model == "bi_lstm_all":
            model = bi_lstm_model(x_train=X_train_msg,
                                  y_train=Y_train,
                                  x_test=X_test_msg,
Example 10
def running_baseline_july(tf, folds, random_state):
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print len(commits_)
    # random_state only takes effect when shuffle=True, so shuffle whenever
    # a seed is supplied.
    kf = KFold(n_splits=folds, shuffle=random_state is not None,
               random_state=random_state)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx = dict()
        idx["train"], idx["test"] = train_index, test_index
        idx_folds.append(idx)

    if "msg" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
    elif "all" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines
    elif "code" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        msgs_ = codes_
    else:
        print "You need to provide a correct model name"
        exit()

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)
    print pad_msg.shape, labels.shape, len(dict_msg_)
    # exit()

    timestamp = str(int(time.time()))
    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    cntfold = 0
    pred_dict, pred_dict_prob = dict(), dict()
    for i in xrange(cntfold, len(idx_folds)):
        idx = idx_folds[i]
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train, Y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))
        if FLAGS.model in ("lstm_cnn_msg", "lstm_cnn_code", "lstm_cnn_all"):
            model = lstm_cnn(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                             y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        elif FLAGS.model in ("cnn_msg", "cnn_code", "cnn_all"):
            model = cnn_model(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                              y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        else:
            print "You need to give correct model name"
            exit()

        # model.save("./keras_model/" + FLAGS.model + "_" + str(cntfold) + ".h5")
        # model.save("./keras_model/" + FLAGS.model + "_" + str(cntfold) + "_testing.h5")
        # model.save("./keras_model/test_" + FLAGS.model + "_" + str(cntfold) + ".h5")
        model.save("./keras_model/newres_funcalls_" + FLAGS.model + "_" + str(cntfold) + ".h5")

        y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size)
        y_pred = np.ravel(y_pred)

        y_pred_tolist = y_pred.tolist()
        # Use a distinct name so the comprehension does not clobber the outer
        # loop variable i (list comprehensions leak scope in Python 2).
        data_fold = [str(idx) + "\t" + str(l) for idx, l in zip(test_index, y_pred)]
        path_file = "./statistical_test/newres_funcalls_%s_fold_%s.txt" % (FLAGS.model, str(cntfold))
        write_file(path_file=path_file, data=data_fold)

        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0

        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        accuracy.append(accuracy_score(y_true=Y_test, y_pred=y_pred))
        precision.append(precision_score(y_true=Y_test, y_pred=y_pred))
        recall.append(recall_score(y_true=Y_test, y_pred=y_pred))
        f1.append(f1_score(y_true=Y_test, y_pred=y_pred))
        auc.append(auc_score(y_true=Y_test, y_pred=y_pred))
        print "accuracy", accuracy_score(y_true=Y_test, y_pred=y_pred)
        print "precision", precision_score(y_true=Y_test, y_pred=y_pred)
        print "recall", recall_score(y_true=Y_test, y_pred=y_pred)
        print "f1", f1_score(y_true=Y_test, y_pred=y_pred)

        cntfold += 1
        break  # NOTE: stops after the first fold
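One caveat on the metrics above: auc_score is fed the thresholded 0/1 predictions, whereas ROC-AUC is normally computed on raw scores. Assuming auc_score wraps roc_auc_score, a variant that keeps the probabilities would look like:

# Compute AUC on the raw probabilities, before the 0.5 threshold is applied
# (a sketch; auc_score is assumed to accept continuous scores).
y_prob = np.ravel(model.predict(X_test_msg, batch_size=FLAGS.batch_size))
auc.append(auc_score(y_true=Y_test, y_pred=y_prob))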
Example 11
def cross_validation_ver2(id, X, y, algorithm, folds):
    kf = KFold(n_splits=folds, random_state=None)
    kf.get_n_splits(X=X)
    accuracy, precision, recall, f1 = list(), list(), list(), list()
    probs = list()
    for train_index, test_index in kf.split(X):
        X_train, y_train = get_items(items=X, indexes=train_index), \
                           get_items(items=y, indexes=train_index)
        X_test, y_test = get_items(items=X, indexes=test_index), \
                         get_items(items=y, indexes=test_index)
        id_train, id_test = get_items(items=id, indexes=train_index), \
                            get_items(items=id, indexes=test_index)

        vectorizer = CountVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        # X = vectorizer.transform(X)

        # eval_train, eval_labels = loading_data("./data/3_mar7/typeaddres.out")
        # eval_train = vectorizer.transform(eval_train)

        if algorithm == "svm":
            clf = LinearSVC(random_state=0)
        elif algorithm == "lr":
            clf = LogisticRegression()
        elif algorithm == "dt":
            clf = DecisionTreeClassifier()
        else:
            print "Unknown algorithm name -- please try again"
            exit()

        clf.fit(X=X_train, y=y_train)
        y_pred = clf.predict(X_test)  # predict once and reuse for every metric
        accuracy.append(accuracy_score(y_true=y_test, y_pred=y_pred))
        precision.append(precision_score(y_true=y_test, y_pred=y_pred))
        recall.append(recall_score(y_true=y_test, y_pred=y_pred))
        f1.append(f1_score(y_true=y_test, y_pred=y_pred))
        # print accuracy, precision, recall, f1

        # print X_test.shape
        # y_pred = clf.predict(X_test)
        # y_pred_proba = clf.predict_proba(X_test)[:, 1]
        # y_pred_log_proba = clf.predict_log_proba(X_test)
        # print clf.predict_proba(X_test).shape
        # print clf.predict_log_proba(X_test).shape
        # exit()
        # probs += clf.predict_proba(X_test)[:, 1]
        # NOTE: LinearSVC (the "svm" branch) has no predict_proba; a
        # probability-capable wrapper is sketched after this example.
        probs = np.concatenate((probs, clf.predict_proba(X_test)[:, 1]),
                               axis=0)

        # accuracy.append(accuracy_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # precision.append(precision_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # recall.append(recall_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # f1.append(f1_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # break

    print accuracy, "Accuracy of %s: %f" % (algorithm, avg_list(accuracy))
    print precision, "Precision of %s: %f" % (algorithm, avg_list(precision))
    print recall, "Recall of %s: %f" % (algorithm, avg_list(recall))
    print f1, "F1 of %s: %f" % (algorithm, avg_list(f1))

    path_write = "./statistical_test_prob/%s.txt" % (algorithm)
    write_file(path_file=path_write, data=probs)
    print len(probs)
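As noted in Example 11, LinearSVC implements no predict_proba, so the "svm" branch fails at the predict_proba call. One common workaround, sketched here as an assumption about the intended fix rather than code from the original repository, is to wrap the classifier in CalibratedClassifierCV:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

# Probability calibration gives the linear SVM a predict_proba method.
clf = CalibratedClassifierCV(LinearSVC(random_state=0))
clf.fit(X_train, y_train)
fold_probs = clf.predict_proba(X_test)[:, 1]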