Example #1
def file_stats():
    # Per-file accuracy/precision/recall for every classifier and window
    # size, emitted as one Markdown table per window.
    mls = ['dt', 'rf']  # , 'svm', 'nb']
    for window in windows:
        writer = pytablewriter.MarkdownTableWriter()
        writer.table_name = 'File Accuracy for {}s'.format(window)
        writer.header_list = ['File', 'Decision Tree', 'Random Forest',
                              'Tensorflow']
        value_matrix = []
        for name in binet_files:
            values = [name]
            feature, label = get_feature_labels(
                get_saved_data(window, name, v2=True), v2=True)
            # feature = mask_features(feature)
            feat_train, feat_test, label_train, label_test = train_test_split(
                feature, label, test_size=0.3, random_state=42)
            for ml in mls:
                r = train_and_test_with(feat_train, label_train, ml, feat_test,
                                        label_test)
                values.append('{0:.4f}, {1:.4f}, {2:.4f}'.format(
                              r['accuracy'],
                              r['precision'],
                              r['recall']))
                print(values)
            correctness, precision, recall = \
                keras_train_and_test(feat_train, label_train,
                                     feat_test, label_test, dimension=22)
            values.append('{0:.4f}, {1:.4f}, {2:.4f}'.format(correctness,
                                                               precision,
                                                               recall))
            print(values)
            value_matrix.append(values)

        writer.value_matrix = value_matrix
        writer.write_table()
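
A hedged sketch of what `train_and_test_with` might look like: it is assumed to fit one classifier keyed by the short `ml` name and return a dict with `accuracy`, `precision`, and `recall` keys, which is how the caller above uses it. The project's actual implementation may differ.

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

def train_and_test_with(feat_train, label_train, ml, feat_test, label_test):
    # Hypothetical factory: map the short name to a scikit-learn estimator.
    classifiers = {
        'dt': DecisionTreeClassifier(),
        'rf': RandomForestClassifier(n_estimators=100),
    }
    clf = classifiers[ml]
    clf.fit(feat_train, label_train)
    predictions = clf.predict(feat_test)
    return {
        'accuracy': accuracy_score(label_test, predictions),
        'precision': precision_score(label_test, predictions),
        'recall': recall_score(label_test, predictions),
    }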
Example #2
def get_balance():
    # Report the class balance (attacks vs. non-attacks) of every capture.
    for binet in binet_files:
        summary = get_saved_data(0.15, binet)
        _, label = get_feature_labels(summary)
        attacks = sum(label)
        nonattacks = len(label) - attacks
        print("{} | {} ".format(attacks, nonattacks))
Example #3
def shuffle_data_test():
    # Measure how accuracy, precision and recall degrade as a growing
    # share of feature rows is replaced with Gaussian noise.
    binet = binet_files[-1]
    feature, label = get_feature_labels(get_saved_data(0.15, binet))
    scores = []
    precs = []
    rec = []

    # do normal scoring
    # TODO: do same analysis with dt and rf
    acc, p, r = keras_train_and_test(feature, label)
    scores.append(acc)
    precs.append(p)
    rec.append(r)
    mstd = list(get_mean_std(feature))
    for i in range(1, 5):
        # pick (i * 10)% of the rows at random (with possible repeats)
        indices = [random.randrange(len(feature)) for _ in range(
            int(len(feature) * (i * 10) / 100))]
        # real copy: a bare `feature[:]` slice of a NumPy array is only a view
        f = [list(row) for row in feature]
        for index in indices:
            # resample every column of the row from that column's (mean, std)
            f[index] = [np.random.normal(*mstd[j]) for j in range(len(f[index]))]
        acc, p, r = keras_train_and_test(f, label)
        scores.append(acc)
        precs.append(p)
        rec.append(r)

    plt.figure()
    plt.plot(scores, color='lightblue', label='Accuracy')
    plt.plot(precs, color='red', label='Precision')
    plt.plot(rec, color='green', label='Recall')
    plt.ylabel("Score")
    plt.xlabel("% of features randomized (each x step = 10%)")
    plt.title("Scores under feature randomization")
    plt.legend(loc='best')
    plt.show()
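
`get_mean_std` is assumed to yield one `(mean, std)` pair per feature column, which is what `np.random.normal(*mstd[j])` unpacks above. A plausible sketch under that assumption:

import numpy as np

def get_mean_std(feature):
    # Hypothetical helper: one (mean, std) tuple per feature column, so
    # column j can be re-sampled as np.random.normal(mean_j, std_j).
    arr = np.asarray(feature, dtype=float)
    for column in arr.T:
        yield column.mean(), column.std()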
Example #4
def feature_plotting():
    # Scatter-plot feature column 9 against sample index, highlighting
    # the samples labeled as attacks.
    feature, label = get_feature_labels(get_saved_data(0.15, binet_files[12]))
    plt.figure()
    # (sample index, value) pairs for every row of column 9
    zeroes = set(zip(range(len(feature)), feature[:, 9]))
    ones = set(z for z in zeroes if label[z[0]] == 1)
    del label
    del feature
    zeroes = zeroes.difference(ones)
    plt.scatter(*zip(*zeroes), s=1, c='gray')
    del zeroes
    plt.scatter(*zip(*ones), s=10, c='lightgreen')
    plt.show()
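
The set arithmetic above separates benign from attack points by sample index. With NumPy arrays the same figure can be drawn with boolean masks and no intermediate sets; a sketch, assuming `feature` is 2-D and `label` is a 0/1 vector:

import numpy as np
import matplotlib.pyplot as plt

def feature_plotting_masked(feature, label):
    feature, label = np.asarray(feature), np.asarray(label)
    x = np.arange(len(feature))
    col = feature[:, 9]                  # same column as above
    attack = label == 1
    plt.figure()
    plt.scatter(x[~attack], col[~attack], s=1, c='gray')
    plt.scatter(x[attack], col[attack], s=10, c='lightgreen')
    plt.show()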
Example #5
def stats_on_best():
    # Baseline sanity check on the best-performing captures: replace every
    # feature with uniform random noise of growing magnitude and retrain.
    best = [8, 9, 12]
    summaries = []
    for b in best:
        summaries += get_saved_data(0.15, binet_files[b])
    feature, label = get_feature_labels(summaries)
    scores = []
    for i in range(1, 5):
        feature = [[random.randrange(-(i * 10), i * 10) for _ in feat]
                   for feat in feature]
        acc, _, _ = keras_train_and_test(feature, label)
        scores.append(acc)
    print(scores)
Example #6
def kfold_test():
    # 10-fold cross-validation per capture: mean/std accuracy plus
    # mean/std precision-recall AUC for each classifier.
    mls = ['dt', 'rf']
    for window in windows:
        writer = pytablewriter.MarkdownTableWriter()
        writer.table_name = 'KFold validation'
        writer.header_list = ['File', 'Decision Tree', 'Random Forest',
                              'Tensorflow']
        value_matrix = []
        for name in binet_files:
            values = [name]
            feature, label = get_feature_labels(get_saved_data(window, name))
            # keep the first 10% of the samples so the 10-fold run stays fast
            feature = feature[:int(len(feature) * 0.10)]
            label = label[:int(len(label) * 0.10)]
            kf = KFold(n_splits=10)

            # feature = mask_features(feature)
            for ml in mls:
                scores = []
                pr_scores = []
                for train, test in kf.split(feature):
                    clf = get_classifier(ml)
                    xtrain, ytrain = feature[train], label[train]
                    xtest, ytest = feature[test], label[test]
                    clf.fit(xtrain, ytrain)
                    test_predicts = clf.predict(xtest)
                    test_score = accuracy_score(ytest, test_predicts)

                    scores.append(test_score)
                    proba = clf.predict_proba(xtest)

                    precision, recall, pr_thresholds = precision_recall_curve(
                            ytest, proba[:, 1])
                    pr_scores.append(auc(recall, precision))
                values.append('{0:.4f}, {1:.4f}, {2:.4f}, {3:.4f}'.format(
                                    np.mean(scores), np.std(scores),
                                    np.mean(pr_scores), np.std(pr_scores)))
            kf = KFold(n_splits=10)
            accuracy = []  # , precision, recall = [], [], []
            for train_index, test_index in kf.split(feature):
                x_train, x_test = feature[train_index], feature[test_index]
                y_train, y_test = label[train_index], label[test_index]
                c, p, r = \
                    keras_train_and_test(x_train, y_train, x_test, y_test,
                                         dimension=12)
                accuracy.append(c)
                # precision.append(p)
                # recall.append(r)
            values.append('{0:.4f}, {1:.4f}'.format(np.mean(accuracy),
                                                    np.std(accuracy)))
            value_matrix.append(values)
        writer.value_matrix = value_matrix
        writer.write_table()
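
`get_classifier` is assumed to return a fresh, unfitted scikit-learn estimator keyed by the short name, so each fold trains from scratch. A sketch of such a factory:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def get_classifier(ml):
    # Hypothetical factory: a new estimator per call.
    if ml == 'dt':
        return DecisionTreeClassifier()
    if ml == 'rf':
        return RandomForestClassifier(n_estimators=100)
    raise ValueError('unknown classifier key: %s' % ml)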
Example #7
def window_shift(window):
    # Evaluate each classifier with a step-wise (shifting window) train/test
    # scheme and emit one Markdown table per window size.
    writer = pytablewriter.MarkdownTableWriter()
    writer.table_name = 'Window Shift Accuracy for {}s'.format(window)
    writer.header_list = ['File', 'Decision Tree', 'Random Forest',
                          'Tensorflow']
    value_matrix = []
    for file_name in binet_files:
        values = []
        feature, label = get_feature_labels(
                get_saved_data(window, file_name))
        feature = mask_features(feature)
        values += [
            file_name,
            '{0:.4f}'.format(train_and_test_step(feature, label, 'dt', 1000)),
            '{0:.4f}'.format(train_and_test_step(feature, label, 'rf', 1000)),
            '{0:.4f}'.format(train_and_test_step(feature, label, 'tf', 1000))]
        value_matrix.append(values)
    writer.value_matrix = value_matrix
    writer.write_table()
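
`train_and_test_step` is not shown in these examples. Given the `1000` argument, one plausible reading is a walk-forward evaluation that retrains as the window slides forward; this is an assumption, not the project's confirmed implementation (the 'tf' branch is omitted here):

import numpy as np
from sklearn.metrics import accuracy_score

def train_and_test_step(feature, label, ml, step):
    # Hypothetical walk-forward scheme: train on everything seen so far,
    # predict the next `step` rows, slide forward, repeat.
    feature, label = np.asarray(feature), np.asarray(label)
    predictions, truths = [], []
    for start in range(step, len(feature), step):
        clf = get_classifier(ml)         # factory sketched earlier
        clf.fit(feature[:start], label[:start])
        end = start + step
        predictions.extend(clf.predict(feature[start:end]))
        truths.extend(label[start:end])
    return accuracy_score(truths, predictions)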
Example #8
def run_analysis_with(interval, file_name, start_time=None, use_pickle=True):
    # Load (or build and pickle) the aggregated summaries for one capture,
    # then train and evaluate each classifier, saving results to disk.
    if start_time is None:
        start_time = get_start_time_for(file_name)

    start = datetime.strptime(start_time, TIME_FORMAT)
    file_num = get_file_num(file_name)
    directory = 'runs_of_%ss/' % interval

    if not os.path.exists(directory):
        os.makedirs(directory)

    mls = ['dt', 'rf']

    print('starting %s %s' % (interval, file_name))
    if use_pickle:
        print('loading pickle')
        summaries = get_saved_data(interval, file_name)
        if summaries is not None:
            print('loaded pickle')
        else:
            print('failed to load pickle')
    else:
        summaries = None

    if summaries is None:
        print('aggregating data')
        summaries = aggregate_file(interval, file_name, start)
        print('finished aggregating, pickling data...')
        pickle_summarized_data(interval, start_time, file_name, summaries)
        print('data pickled')

    features, labels = get_feature_labels(summaries)
    for ml in mls:
        print('testing with %s' % ml)
        result = train_and_test_with(features, labels, ml)
        path = '%srun_%s_%s.txt' % (directory, file_num, ml)
        save_results(path, file_name, start_time, interval, result)
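
A typical driver loops this over every window size and capture, e.g.:

for window in windows:
    for name in binet_files:
        run_analysis_with(window, name)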