Code Example #1
def neural_net_cancer(solver):
    cancer_data = load_data_set('breastcancer')
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(
        np.array(cancer_data['train']['inputs'] +
                 cancer_data['test']['inputs'],
                 dtype=np.float32))

    clf = neural_network.MLPClassifier(solver=solver,
                                       warm_start=True,
                                       max_iter=1000)

    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(
        cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')

    with Timer() as t:
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']

    t_out = cancer_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("breastcancer.dataset (solver={})".format(solver))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Neural Net (breastcancer.dataset, solver={})".
        format(solver),
        cv=5)
    plt.savefig('out/neural_net/breastcancer-solver-{}.png'.format(solver))
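Timer is a helper that is not defined in any of these snippets; from the "with Timer() as t:" and "t.interval * 1000" usage it is presumably a small context manager that records elapsed wall-clock time in seconds. A minimal sketch under that assumption:

import time

class Timer:
    """Context manager that records the elapsed time (in seconds) in .interval."""

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.interval = time.perf_counter() - self.start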
Code Example #2
def neural_net_car(solver):
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] +
                car_data['test']['inputs'])  # encode features as one-hot

    clf = neural_network.MLPClassifier(solver=solver,
                                       warm_start=True,
                                       max_iter=1000)

    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')

    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']

    t_out = car_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("car.dataset (solver={})".format(solver))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Neural Net (car.dataset, solver={})".format(
            solver),
        cv=5)
    plt.savefig('out/neural_net/car-solver-{}.png'.format(solver))
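scikit-learn's MLPClassifier accepts the solvers 'lbfgs', 'sgd' and 'adam', so a driver that sweeps the solver for the two functions above would presumably look like the following (the loop itself is not part of the original code):

for solver in ('lbfgs', 'sgd', 'adam'):
    neural_net_cancer(solver)  # breastcancer.dataset scores and learning curve
    neural_net_car(solver)     # car.dataset scores and learning curve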
Code Example #3
def svm_car(kernel="linear"):
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] + car_data['test']['inputs'])  # encode features as one-hot

    clf = svm.SVC(
        kernel=kernel
    )

    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']), car_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'], predicted, average='micro')

    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'], predicted, average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = car_ohe.transform(car_data['train']['inputs'] + car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']

    t_out = car_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("car.dataset (kernel={})".format(kernel))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf, data_in, data_out, title="Learning Curve: SVM (car.dataset, kernel={})".format(kernel), cv=5)
    plt.savefig('out/svm/car-kernel-{}.png'.format(kernel))
Code Example #4
def svm_cancer(kernel="rbf"):
    cancer_data = load_data_set('breastcancer')
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(np.array(cancer_data['train']['inputs'] + cancer_data['test']['inputs'], dtype=np.float32))

    clf = svm.SVC(
        kernel=kernel
    )

    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']), cancer_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'], predicted, average='micro')

    with Timer() as t:
        predicted = clf.predict(cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'], predicted, average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = cancer_imp.transform(cancer_data['train']['inputs'] + cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']

    t_out = cancer_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("breastcancer.dataset (kernel={})".format(kernel))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf, data_in, data_out, title="Learning Curve: SVM (breastcancer.dataset, kernel={})".format(kernel), cv=5)
    plt.savefig('out/svm/breastcancer-kernel-{}.png'.format(kernel))
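load_data_set is another helper that is not shown; judging from how its result is indexed, it returns nested dicts with 'train'/'test' splits, each holding 'inputs' and 'outputs' lists. A rough sketch of an equivalent loader for the breast-cancer case, assuming scikit-learn's bundled dataset stands in for the original CSV files:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def load_data_set_sketch(name):
    # Only the 'breastcancer' case is sketched; the dict layout mirrors the usage above.
    bunch = load_breast_cancer()
    x_train, x_test, y_train, y_test = train_test_split(
        bunch.data, bunch.target, test_size=0.3, random_state=0)
    return {
        'train': {'inputs': x_train.tolist(), 'outputs': y_train.tolist()},
        'test': {'inputs': x_test.tolist(), 'outputs': y_test.tolist()},
    }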
Code Example #5
def load_all_data(use_hw, data_set):
    # Load Data
    loader = util.load_data_set(data_set)
    data_set_name = str(data_set)
    total_x_attack, total_y_attack = loader({
        'use_hw': use_hw,
        'traces_path': '/media/rico/Data/TU/thesis/data'
    })
    total_key_guesses = np.transpose(
        util.load_csv(
            '/media/rico/Data/TU/thesis/data/{}/Value/key_guesses_ALL.csv'.
            format(data_set_name),
            delimiter=' ',
            dtype=np.int))
    real_key = util.load_csv(
        '/media/rico/Data/TU/thesis/data/{}/secret_key.csv'.format(
            data_set_name),
        dtype=np.int)
    return total_x_attack, total_y_attack, total_key_guesses, real_key
Code Example #6
def load_data(args):
    _x_attack, _y_attack, _real_key, _dk_plain, _key_guesses = None, None, None, None, None
    ###################
    # Load the traces #
    ###################
    loader = util.load_data_set(args.data_set)
    total_x_attack, total_y_attack, plain = loader({'use_hw': args.use_hw,
                                                    'traces_path': args.traces_path,
                                                    'raw_traces': args.raw_traces,
                                                    'start': args.train_size + args.validation_size,
                                                    'size': args.attack_size,
                                                    'domain_knowledge': True,
                                                    'use_noise_data': args.use_noise_data,
                                                    'data_set': args.data_set,
                                                    'noise_level': args.noise_level})
    if plain is not None:
        _dk_plain = torch.from_numpy(plain).cuda()
    print('Loading key guesses')

    ####################################
    # Load the key guesses and the key #
    ####################################
    data_set_name = str(args.data_set)
    _key_guesses = util.load_csv('{}/{}/Value/key_guesses_ALL_transposed.csv'.format(
        args.traces_path,
        data_set_name),
        delimiter=' ',
        dtype=np.int,
        start=args.train_size + args.validation_size,
        size=args.attack_size)
    _real_key = util.load_csv('{}/{}/secret_key.csv'.format(args.traces_path, data_set_name),
                              dtype=np.int)

    _x_attack = total_x_attack
    _y_attack = total_y_attack
    return _x_attack, _y_attack, _key_guesses, _real_key, _dk_plain
Code Example #7
import os
import pickle
"""
Prepare the data: create 70% training data and 30% testing data.
"""
config.N_FEATURES = len(preprocessing.get_fetures_nm_list())
PICKLE_FILE_NAME = 'DENSE'
MODEL_FILE_NAME = 'DENSE'

if os.path.isfile('{}.pickle'.format(PICKLE_FILE_NAME)):
    f = open('{}.pickle'.format(PICKLE_FILE_NAME), 'rb')
    print('loading pickle file from disk')
    l = pickle.load(f)
    X_train, X_test, y_train, y_test, m = l[0], l[1], l[2], l[3], l[4]
else:
    X_train, X_test, y_train, y_test = util.load_data_set(30, 33)
    m = preprocessing.get_sido_onehot_map()
    f = open('{}.pickle'.format(PICKLE_FILE_NAME), 'wb')
    pickle.dump([X_train, X_test, y_train, y_test, m], f)
    print('finished dumping pickle file to disk')
"""
merge two other neural networks
https://statcompute.wordpress.com/2017/01/08/an-example-of-merge-layer-in-keras/
https://nhanitvn.wordpress.com/2016/09/27/a-keras-layer-for-one-hot-encoding/
"""
# d_features = Input(shape=(1, config.N_FEATURES, config.N_TIME_WINDOW), name="features")
# d_sido = Input(shape=(len(preprocessing.get_sido_nm_list()), ), name="sido_onehot")
d_features = Input(shape=(config.N_TIME_WINDOW, config.N_FEATURES),
                   name="features")
# d_sido = Input(shape=(len(preprocessing.get_sido_nm_list()),), name="sido_onehot")
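The commented-out second input suggests the merge described in the linked posts was not wired up in this snippet. A minimal sketch of such a merge with the Keras functional API (Input, Dense, Concatenate and Model are standard Keras pieces; the shapes, layer sizes and names below are hypothetical placeholders, not values from the original project):

from keras.layers import Concatenate, Dense, Input
from keras.models import Model

# Two independent input branches.
features_in = Input(shape=(20,), name="features")
sido_in = Input(shape=(17,), name="sido_onehot")

x1 = Dense(32, activation="relu")(features_in)
x2 = Dense(8, activation="relu")(sido_in)

# Merge the branches and attach a single output head.
merged = Concatenate()([x1, x2])
out = Dense(1, activation="sigmoid")(merged)

model = Model(inputs=[features_in, sido_in], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy")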
Code Example #8
for i in range(len(traces_indices)):
    if len(res[i]) == 1:
        index = res[i][0]
        value = traces_map.get(index)
        if value is None:
            value = []
        value.append(i)
        traces_map.update({index: value})

print("Traces map single filter")
for k, v in traces_map.items():
    print(f"{k}: {v}")

import util

loader_function = util.load_data_set(util.DataSet.RANDOM_DELAY_NORMALIZED)
traces_path = "/media/rico/Data/TU/thesis/data/"
x_attack, _, _ = loader_function({
    'use_hw': False,
    'traces_path': traces_path,
    'raw_traces': False,
    'start': 40000 + 1000,
    'size': 5000,
    'domain_knowledge': True,
    'use_noise_data':
Code Example #9
import random

import numpy as np

import k_means_clustering
import util

# Parameters
data_set_location = "datasets/Compound.csv"
total_class = 6
K = total_class

if __name__ == "__main__":

    # Load dataset
    data, labels = util.load_data_set(data_set_location, label_separated=True)
    data_set = (np.array(data), np.array(labels))

    # Visualization with no color
    util.visualize(data_set)

    # Initialize the cluster
    cluster = k_means_clustering.KMeansCluster(K, data_set[0])

    # Initialize cluster centroids with a random point from each class
    data_separated, total_class = util.separate_data_by_class(data_set)
    class_centroid = []
    for label, data in data_separated.items():
        max_index = len(data[0])
        random_centroid = random.randint(0, max_index - 1)
        class_centroid.append(
Code Example #10
def load_data(args, network_name):
    _x_attack, _y_attack, _real_key, _dk_plain, _key_guesses = None, None, None, None, None
    argz = {
        'use_hw': args.use_hw,
        'traces_path': args.traces_path,
        'raw_traces': args.raw_traces,
        'start': args.train_size + args.validation_size,
        'size': args.attack_size,
        'train_size': args.train_size,
        'validation_size': args.validation_size,
        'domain_knowledge': True,
        'use_noise_data': args.use_noise_data,
        'data_set': args.data_set,
        'sub_key_index': args.subkey_index,
        'desync': args.desync,
        'unmask': args.unmask
    }

    if args.data_set == util.DataSet.ASCAD:
        _x_attack, _y_attack, _plain, _real_key, _key_guesses = util.load_ascad_test_traces(
            argz)
    elif args.data_set == util.DataSet.ASCAD_NORMALIZED:
        _x_attack, _y_attack, _key_guesses, _real_key = util.load_ascad_normalized_test_traces(
            argz)
    elif args.data_set == util.DataSet.SIM_MASK:
        _x_attack, _y_attack, _key_guesses, _real_key = util.load_sim_mask_test_traces(
            argz)
    elif args.data_set == util.DataSet.ASCAD_KEYS or args.data_set == util.DataSet.ASCAD_KEYS_NORMALIZED:
        _x_attack, _y_attack, _key_guesses, _real_key, _dk_plain = util.load_ascad_keys_test(
            argz)
    elif args.data_set == util.DataSet.RANDOM_DELAY_LARGE:
        ###################
        # Load the traces #
        ###################
        loader = util.load_data_set(args.data_set)
        total_x_attack, total_y_attack, plain = loader({
            'use_hw': args.use_hw,
            'traces_path': args.traces_path,
            'raw_traces': args.raw_traces,
            'start': args.train_size + args.validation_size,
            'size': args.attack_size,
            'domain_knowledge': True,
            'use_noise_data': args.use_noise_data,
            'data_set': args.data_set
        })
        print('Loading key guesses')

        ####################################
        # Load the key guesses and the key #
        ####################################
        data_set_name = str(args.data_set)
        _key_guesses = util.load_random_delay_large_key_guesses(
            args.traces_path, args.train_size + args.validation_size,
            args.attack_size)
        _real_key = util.load_csv('{}/{}/secret_key.csv'.format(
            args.traces_path, data_set_name),
                                  dtype=np.int)

        _x_attack = total_x_attack
        _y_attack = total_y_attack

    else:
        ###################
        # Load the traces #
        ###################
        loader = util.load_data_set(args.data_set)
        total_x_attack, total_y_attack, plain = loader({
            'use_hw': args.use_hw,
            'traces_path': args.traces_path,
            'raw_traces': args.raw_traces,
            'start': args.train_size + args.validation_size,
            'size': args.attack_size,
            'domain_knowledge': True,
            'use_noise_data': args.use_noise_data,
            'data_set': args.data_set,
            'noise_level': args.noise_level
        })
        if plain is not None:
            _dk_plain = torch.from_numpy(plain).cuda()
        print('Loading key guesses')

        ####################################
        # Load the key guesses and the key #
        ####################################
        data_set_name = str(args.data_set)
        _key_guesses = util.load_csv(
            '{}/{}/Value/key_guesses_ALL_transposed.csv'.format(
                args.traces_path, data_set_name),
            delimiter=' ',
            dtype=np.int,
            start=args.train_size + args.validation_size,
            size=args.attack_size)
        _real_key = util.load_csv('{}/{}/secret_key.csv'.format(
            args.traces_path, data_set_name),
                                  dtype=np.int)

        _x_attack = total_x_attack
        _y_attack = total_y_attack

    return _x_attack, _y_attack, _key_guesses, _real_key, _dk_plain
Code Example #11
from sys import argv

import hierarchy_cluster
import util

data_set_location = "dataset/Hierarchical_2.csv"

if __name__ == "__main__":

    # Load dataset
    data = util.load_data_set(data_set_location)

    # Visualization with no color
    util.visualize(data)

    # Dissimilarity method for the clustering, chosen via CLI argument
    # e.g.: python3 main 1
    # 1 for single link
    # 2 for complete link
    # 3 for group average
    # 4 for centroid based
    try:
        type = int(argv[1])
    except:
        type = 1  # default 1 for no argument
    hierarchy_cluster.agglomerative_clustering(data, type=type)
Code Example #12
import util
import numpy as np
import subprocess

data_set = util.DataSet.RANDOM_DELAY
data_loader = util.load_data_set(data_set)

files = []
step = 2000
for i in range(0, 50000, step):
    print(i)
    args = {
        "raw_traces": True,
        "start": i,
        "size": step,
        "traces_path": "/media/rico/Data/TU/thesis/data/",
        "use_hw": False
    }

    path_rd = '{}/Random_Delay/traces/'.format(args['traces_path'])

    x_train = util.load_csv(
        '{}/Random_Delay/traces/traces_complete.csv'.format(
            args['traces_path']),
        delimiter=' ',
        start=args.get('start'),
        size=args.get('size'))
    mean = np.mean(x_train, axis=0)

    noise = np.random.normal(0, 7, 3500 * args['size']).reshape(
        (args['size'], 3500))
Code Example #13
File: inference.py  Project: freeskyES/help-Challenge
def inference():
    test_model = pickle.load(open(os.path.join(VOL_DIR, 'model.dat'), 'rb'))
    person_table, condition_occurrence_table, outcome_cohort_table, measurement_table = util.load_data_set(
        TEST_DIR)
    measurement_table = util.preprocess_measurement(measurement_table)
    y_pred, y_proba = util.predict(test_model, person_table,
                                   condition_occurrence_table,
                                   measurement_table, outcome_cohort_table)
    predict_result = pd.DataFrame({
        'LABEL': y_pred,
        'LABEL_PROBABILITY': y_proba
    })
    predict_result.to_csv(os.path.join(OUTPUT_DIR, 'output.csv'), index=False)
Code Example #14
import math
import util

data_set = "datasets/D31.csv"
label = util.load_data_set(data_set, label_separated=True)[1]

# Initialization data MLP Neural Network
feature_dim = 2
output_layer = len(set(label))
# hidden_layer = round(math.sqrt(feature_dim * output_layer))  # takes too long to train
hidden_layer = len(set(label))  # much faster in the training phase
learning_rate = 0.01
# learning_rate = 0.001    # 87%
Code Example #15
def run(args):

    # Save the models to this folder
    dir_name = generate_folder_name(args)

    # Arguments for loading data
    load_args = {"unmask": args.unmask,
                 "use_hw": args.use_hw,
                 "traces_path": args.traces_path,
                 "sub_key_index": args.subkey_index,
                 "raw_traces": args.raw_traces,
                 "size": args.train_size + args.validation_size,
                 "train_size": args.train_size,
                 "validation_size": args.validation_size,
                 "domain_knowledge": True,
                 "desync": args.desync,
                 "use_noise_data": args.use_noise_data,
                 "start": 0,
                 "data_set": args.data_set}

    # Load data and chop into the desired sizes
    load_function = load_data_set(args.data_set)
    print(load_args)
    x_train, y_train, plain = load_function(load_args)
    x_validation = x_train[args.train_size:args.train_size + args.validation_size]
    y_validation = y_train[args.train_size:args.train_size + args.validation_size]
    x_train = x_train[0:args.train_size]
    y_train = y_train[0:args.train_size]
    p_train = None
    p_validation = None
    if plain is not None:
        p_train = plain[0:args.train_size]
        p_validation = plain[args.train_size:args.train_size + args.validation_size]

    print('Shape x: {}'.format(np.shape(x_train)))

    # Arguments for initializing the model
    init_args = {"sf": args.spread_factor,
                 "input_shape": args.input_shape,
                 "n_classes": 9 if args.use_hw else 256,
                 "kernel_size": args.kernel_size,
                 "channel_size": args.channel_size,
                 "num_layers": args.num_layers,
                 "max_pool": args.max_pool
                 }

    # Do the runs
    for i in range(args.runs):
        # Initialize the network and the weights
        network = args.init(init_args)
        init_weights(network, args.init_weights)

        # Filename of the model + the folder
        filename = 'model_r{}_{}'.format(i, network.name())
        model_save_file = '{}/{}/{}.pt'.format(args.model_save_path, dir_name, filename)

        print('Training with learning rate: {}, desync {}'.format(args.lr, args.desync))

        if args.domain_knowledge:
            network, res = train_dk2(x_train, y_train, p_train,
                                     train_size=args.train_size,
                                     x_validation=x_validation,
                                     y_validation=y_validation,
                                     p_validation=p_validation,
                                     validation_size=args.validation_size,
                                     network=network,
                                     epochs=args.epochs,
                                     batch_size=args.batch_size,
                                     lr=args.lr,
                                     checkpoints=args.checkpoints,
                                     save_path=model_save_file,
                                     loss_function=args.loss_function,
                                     l2_penalty=args.l2_penalty,
                                     )
        else:
            network, res = train(x_train, y_train,
                                 train_size=args.train_size,
                                 x_validation=x_validation,
                                 y_validation=y_validation,
                                 validation_size=args.validation_size,
                                 network=network,
                                 epochs=args.epochs,
                                 batch_size=args.batch_size,
                                 lr=args.lr,
                                 checkpoints=args.checkpoints,
                                 save_path=model_save_file,
                                 loss_function=args.loss_function,
                                 l2_penalty=args.l2_penalty,
                                 optimizer=args.optimizer
                                 )
        # Save the results of the accuracy and loss during training
        save_loss_acc(model_save_file, filename, res)

        # Make sure we don't mess with the min/max of the spread network
        if isinstance(network, SpreadNet):
            network.training = False

        # Save the final model
        save_model(network, model_save_file)
Code Example #16
import math

import naive_bayes
import util

data_set = util.load_data_set('datasets/Compound.csv')

if __name__ == "__main__":

    # Visualize the dataset
    util.visualize(data_set)

    # Make Naive Bayes Classifier (Gaussian)
    classifier = naive_bayes.NaiveBayes(data_set)

    # Evaluate the data set & build the confusion matrix
    evaluated_data, confusion_matrix = classifier.evaluate(data_set)

    # Compare & visualize the dataset against the evaluated data
    util.compare_data(data_set, evaluated_data)

    # performance calculation (accuracy)
    # print "Accuracy : {}".format(util.performance_calculation(confusion_matrix))

    # performance calculation (f1_score)
    util.performance_calculation(confusion_matrix, mode="f1_micro_average")

    # Plot the decision boundary
    util.decision_boundary(data_set, classifier)
Code Example #17
def decision_tree_pruning_car():
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] + car_data['test']['inputs'])  # encode features as one-hot

    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random",
    )

    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']), car_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'], predicted, average='micro')

    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'], predicted, average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = car_ohe.transform(car_data['train']['inputs'] + car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']

    t_out = car_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("car.dataset (no pruning)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf, data_in, data_out, title="Learning Curve: Decision Trees (car.dataset, no pruning)", cv=5)
    plt.savefig('out/decision_tree_pruning/car-noprune-learning.png')
    export_decision_tree(clf, 'car-noprune')

    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random",
        min_samples_leaf=5,  # minimum of 5 samples at leaf nodes
        max_depth=9
    )

    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']), car_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'], predicted, average='micro')

    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'], predicted, average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = car_ohe.transform(car_data['train']['inputs'] + car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']

    t_out = car_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("car.dataset (pruned)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf, data_in, data_out, title="Learning Curve: Decision Trees (car.dataset, pruned)", cv=5)
    plt.savefig('out/decision_tree_pruning/car-prune-learning.png')
    export_decision_tree(clf, 'car-prune')
Code Example #18
import numpy as np
import os.path

import params
import util
from neural_network import MLPNeuralNetwork
from matplotlib import pyplot as plt

# Load data set & Normalization
data, labels = util.load_data_set(params.data_set, label_separated=True)
data, labels = util.normalization(np.array(data)), np.array(labels)
data_set = (data, labels)

# Make classifier
classifier = MLPNeuralNetwork(params.hidden_layer, params.output_layer,
                              params.feature_dim, params.learning_rate)

# Load weight data if it exists
if os.path.exists("training_data/W1.npy") and os.path.exists("training_data/W2.npy") \
        and os.path.exists("training_data/B1.npy") and os.path.exists("training_data/B2.npy"):
    classifier.W1 = np.load("training_data/W1.npy")
    classifier.W2 = np.load("training_data/W2.npy")
    classifier.B1 = np.load("training_data/B1.npy")
    classifier.B2 = np.load("training_data/B2.npy")

# Training classifier
minimum_error = 0.2
error = 100.0
acc = 0.0
if os.path.exists("training_data/accuracy_visual.npy") and os.path.exists("training_data/mse_visual.npy") \
    and os.path.exists("training_data/epoch.npy"):
Code Example #19
def decision_tree_pruning_cancer():
    cancer_data = load_data_set('breastcancer')
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(np.array(cancer_data['train']['inputs'] + cancer_data['test']['inputs'], dtype=np.float32))

    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random"
    )

    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']), cancer_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'], predicted, average='micro')

    with Timer() as t:
        predicted = clf.predict(cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'], predicted, average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = cancer_imp.transform(cancer_data['train']['inputs'] + cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']

    t_out = cancer_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("breastcancer.dataset (no pruning)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf, data_in, data_out, title="Learning Curve: Decision Trees (breastcancer.dataset, no pruning)", cv=5)
    plt.savefig('out/decision_tree_pruning/breastcancer-noprune-learning.png')
    export_decision_tree(clf, 'breastcancer-noprune')

    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random",
        min_samples_leaf=10,  # minimum of 10 samples at leaf nodes
        max_depth=5
    )

    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']), cancer_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'], predicted, average='micro')

    with Timer() as t:
        predicted = clf.predict(cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'], predicted, average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = cancer_imp.transform(cancer_data['train']['inputs'] + cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']

    t_out = cancer_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("breastcancer.dataset (pruned)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf, data_in, data_out, title="Learning Curve: Decision Trees (breastcancer.dataset, pruned)", cv=5)
    plt.savefig('out/decision_tree_pruning/breastcancer-prune-learning.png')
    export_decision_tree(clf, 'breastcancer-prune')
Code Example #20
def knn_cancer(k_value=1):
    cancer_data = load_data_set('breastcancer')
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(
        np.array(cancer_data['train']['inputs'] +
                 cancer_data['test']['inputs'],
                 dtype=np.float32))

    x = list()
    y_train = list()
    y_test = list()
    y_cross = list()

    # chart different k-values vs. f1 score first
    for i in range(30):
        _k = i + 1
        clf = KNeighborsClassifier(n_neighbors=_k)
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['train']['inputs']))
        train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                          predicted,
                                          average='micro')
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['test']['inputs']))
        test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                         predicted,
                                         average='micro')

        data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                       cancer_data['test']['inputs'])
        data_out = cancer_data['train']['outputs'] + cancer_data['test'][
            'outputs']
        cross_val = cross_val_score(clf, data_in, data_out, cv=5)

        x.append(_k)
        y_train.append(train_f1_score)
        y_test.append(test_f1_score)
        y_cross.append(np.mean(cross_val))

    plt.figure()
    plt.title('Scores for various k (breastcancer.dataset)')
    plt.xlabel('k value')
    plt.ylabel('Score')
    plt.plot(x, y_train, label='Training F1 score')
    plt.plot(x, y_test, label='Testing F1 score')
    plt.plot(x, y_cross, label='Cross-validation score')
    plt.legend()
    plt.savefig('out/knn/breastcancer-k-testing.png')

    # chart with given k-value for detail
    clf = KNeighborsClassifier(n_neighbors=k_value)

    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(
        cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')

    with Timer() as t:
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']

    t_out = cancer_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("breastcancer.dataset (k={})".format(k_value))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: kNN (breastcancer.dataset, k={})".format(
            k_value),
        cv=5)
    plt.savefig('out/knn/breastcancer-k-{}.png'.format(k_value))
Code Example #21
def knn_car(k_value=1):
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] +
                car_data['test']['inputs'])  # encode features as one-hot

    x = list()
    y_train = list()
    y_test = list()
    y_cross = list()

    # chart different k-values vs. f1 score first
    for i in range(30):
        _k = i + 1
        clf = KNeighborsClassifier(n_neighbors=_k)
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
        predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
        train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                          predicted,
                                          average='micro')
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
        test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                         predicted,
                                         average='micro')

        data_in = car_ohe.transform(car_data['train']['inputs'] +
                                    car_data['test']['inputs'])
        data_out = car_data['train']['outputs'] + car_data['test']['outputs']
        cross_val = cross_val_score(clf, data_in, data_out, cv=5)

        x.append(_k)
        y_train.append(train_f1_score)
        y_test.append(test_f1_score)
        y_cross.append(np.mean(cross_val))

    plt.figure()
    plt.title('Scores for various k (car.dataset)')
    plt.xlabel('k value')
    plt.ylabel('Score')
    plt.plot(x, y_train, label='Training F1 score')
    plt.plot(x, y_test, label='Testing F1 score')
    plt.plot(x, y_cross, label='Cross-validation score')
    plt.legend()
    plt.savefig('out/knn/car-k-testing.png')

    clf = KNeighborsClassifier(n_neighbors=k_value)

    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])

    time_to_fit = t.interval * 1000

    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')

    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')

    test_prediction_runtime = t.interval * 1000

    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']

    t_out = car_data['test']['outputs']

    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100

    print("car.dataset (k={})".format(k_value))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: kNN (car.dataset, k={})".format(k_value),
        cv=5)
    plt.savefig('out/knn/car-k-{}.png'.format(k_value))
Code Example #22
import tensorflow as tf
from cnn import Cnn
import config
import util

x_train_orig, y_train_orig, x_test_orig, y_test_orig, classes = util.load_data_set()
x_train = util.pre_treat(x_train_orig)
x_test = util.pre_treat(x_test_orig)
y_train = util.pre_treat(y_train_orig, is_x=False, class_num=len(classes))
y_test = util.pre_treat(y_test_orig, is_x=False, class_num=len(classes))

cnn = Cnn(config.conv_layers, config.fc_layers, config.filters,
          config.learning_rate, config.beta1, config.beta2)

(m, n_H0, n_W0, n_C0) = x_train.shape
n_y = y_train.shape[1]

# Construct the computation graph
cnn.initialize(n_H0, n_W0, n_C0, n_y)
cnn.forward()
cost = cnn.cost()
optimizer = cnn.get_optimizer(cost)
predict, accuracy = cnn.predict()

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for i in range(1, config.num_epochs + 1):
Code Example #23
def train():
    test_model = ensemble.GradientBoostingClassifier()
    person_table, condition_occurrence_table, outcome_cohort_table, measurement_table = util.load_data_set(TRAIN_DIR)
    measurement_table = util.preprocess_measurement(measurement_table)
    test_model = util.train_model(test_model, person_table, condition_occurrence_table, measurement_table, outcome_cohort_table)
    pickle.dump(test_model, open(os.path.join(VOL_DIR, 'model.dat'), 'wb'))  # save the trained model
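Code Example #13 above defines the matching inference step, so an end-to-end run presumably calls train() first and inference() afterwards; the module names in this sketch are placeholders for the actual script files:

from train import train        # hypothetical module containing train()
from inference import inference

train()       # fit the GradientBoostingClassifier and write model.dat under VOL_DIR
inference()   # reload model.dat and write output.csv under OUTPUT_DIR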