def neural_net_cancer(solver):
    """Train an MLP classifier on the breast-cancer data set and report metrics.

    Prints train/test micro-F1, fit time, prediction runtime, accuracy and
    weighted precision, then saves a learning-curve plot under out/neural_net/.

    :param solver: solver name forwarded to sklearn's MLPClassifier.
    """
    cancer_data = load_data_set('breastcancer')
    # The data set contains missing values (NaN); impute with the per-column
    # mean, fitted over train + test inputs combined.
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(
        np.array(cancer_data['train']['inputs'] +
                 cancer_data['test']['inputs'],
                 dtype=np.float32))
    clf = neural_network.MLPClassifier(solver=solver,
                                       warm_start=True,
                                       max_iter=1000)
    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(
        cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    # Learning curve is computed over the full (train + test) data.
    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']
    t_out = cancer_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("breastcancer.dataset (solver={})".format(solver))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Neural Net (breastcancer.dataset, solver={})".
        format(solver),
        cv=5)
    plt.savefig('out/neural_net/breastcancer-solver-{}.png'.format(solver))
def neural_net_car(solver):
    """Train an MLP classifier on the car data set and report metrics.

    Prints train/test micro-F1, fit time, prediction runtime, accuracy and
    weighted precision, then saves a learning-curve plot under out/neural_net/.

    :param solver: solver name forwarded to sklearn's MLPClassifier.
    """
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] +
                car_data['test']['inputs'])  # encode features as one-hot
    clf = neural_network.MLPClassifier(solver=solver,
                                       warm_start=True,
                                       max_iter=1000)
    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    # Learning curve is computed over the full (train + test) data.
    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']
    t_out = car_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("car.dataset (solver={})".format(solver))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Neural Net (car.dataset, solver={})".format(
            solver),
        cv=5)
    plt.savefig('out/neural_net/car-solver-{}.png'.format(solver))
def svm_car(kernel="linear"):
    """Train an SVM on the car data set and report metrics.

    Prints train/test micro-F1, fit time, prediction runtime, accuracy and
    weighted precision, then saves a learning-curve plot under out/svm/.

    :param kernel: kernel name forwarded to sklearn's svm.SVC.
    """
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] +
                car_data['test']['inputs'])  # encode features as one-hot
    clf = svm.SVC(kernel=kernel)
    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    # Learning curve is computed over the full (train + test) data.
    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']
    t_out = car_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("car.dataset (kernel={})".format(kernel))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: SVM (car.dataset, kernel={})".format(kernel),
        cv=5)
    plt.savefig('out/svm/car-kernel-{}.png'.format(kernel))
def svm_cancer(kernel="rbf"):
    """Train an SVM on the breast-cancer data set and report metrics.

    Prints train/test micro-F1, fit time, prediction runtime, accuracy and
    weighted precision, then saves a learning-curve plot under out/svm/.

    :param kernel: kernel name forwarded to sklearn's svm.SVC.
    """
    cancer_data = load_data_set('breastcancer')
    # Impute missing values (NaN) with the per-column mean, fitted over
    # train + test inputs combined.
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(np.array(cancer_data['train']['inputs'] +
                            cancer_data['test']['inputs'],
                            dtype=np.float32))
    clf = svm.SVC(kernel=kernel)
    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    # Learning curve is computed over the full (train + test) data.
    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']
    t_out = cancer_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("breastcancer.dataset (kernel={})".format(kernel))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: SVM (breastcancer.dataset, kernel={})".format(kernel),
        cv=5)
    plt.savefig('out/svm/breastcancer-kernel-{}.png'.format(kernel))
def load_all_data(use_hw, data_set):
    """Load all attack traces, key guesses and the real key for a data set.

    :param use_hw: passed through to the data-set loader as 'use_hw'.
    :param data_set: data-set identifier; str(data_set) names its directory.
    :return: tuple (total_x_attack, total_y_attack, total_key_guesses,
             real_key).
    """
    # Load Data
    loader = util.load_data_set(data_set)
    data_set_name = str(data_set)
    total_x_attack, total_y_attack = loader({
        'use_hw': use_hw,
        'traces_path': '/media/rico/Data/TU/thesis/data'
    })
    # np.int was deprecated in NumPy 1.20 and removed in 1.24 -- use the
    # builtin int instead (identical semantics).
    total_key_guesses = np.transpose(
        util.load_csv(
            '/media/rico/Data/TU/thesis/data/{}/Value/key_guesses_ALL.csv'.
            format(data_set_name),
            delimiter=' ',
            dtype=int))
    real_key = util.load_csv(
        '/media/rico/Data/TU/thesis/data/{}/secret_key.csv'.format(
            data_set_name),
        dtype=int)
    return total_x_attack, total_y_attack, total_key_guesses, real_key
def load_data(args):
    """Load the attack traces, key guesses, real key and optional plaintext.

    :param args: namespace with use_hw, traces_path, raw_traces, train_size,
        validation_size, attack_size, use_noise_data, data_set, noise_level.
    :return: tuple (_x_attack, _y_attack, _key_guesses, _real_key, _dk_plain);
        _dk_plain is a CUDA tensor when the loader returns plaintexts,
        otherwise None.
    """
    _x_attack, _y_attack, _real_key, _dk_plain, _key_guesses = None, None, None, None, None
    ###################
    # Load the traces #
    ###################
    loader = util.load_data_set(args.data_set)
    total_x_attack, total_y_attack, plain = loader({
        'use_hw': args.use_hw,
        'traces_path': args.traces_path,
        'raw_traces': args.raw_traces,
        # Attack traces start after the train + validation slice.
        'start': args.train_size + args.validation_size,
        'size': args.attack_size,
        'domain_knowledge': True,
        'use_noise_data': args.use_noise_data,
        'data_set': args.data_set,
        'noise_level': args.noise_level})
    if plain is not None:
        _dk_plain = torch.from_numpy(plain).cuda()

    print('Loading key guesses')
    ####################################
    # Load the key guesses and the key #
    ####################################
    data_set_name = str(args.data_set)
    # np.int was removed in NumPy 1.24 -- the builtin int is equivalent.
    _key_guesses = util.load_csv(
        '{}/{}/Value/key_guesses_ALL_transposed.csv'.format(
            args.traces_path, data_set_name),
        delimiter=' ',
        dtype=int,
        start=args.train_size + args.validation_size,
        size=args.attack_size)
    _real_key = util.load_csv('{}/{}/secret_key.csv'.format(args.traces_path,
                                                            data_set_name),
                              dtype=int)
    _x_attack = total_x_attack
    _y_attack = total_y_attack
    return _x_attack, _y_attack, _key_guesses, _real_key, _dk_plain
import os

"""
preparing data
create training data 70%, testing data 30%
"""
config.N_FEATURES = len(preprocessing.get_fetures_nm_list())
PICKLE_FILE_NAME = 'DENSE'
MODEL_FILE_NAME = 'DENSE'

if os.path.isfile('{}.pickle'.format(PICKLE_FILE_NAME)):
    # A cached split exists: load it instead of rebuilding the data set.
    # 'with' closes the file deterministically (original leaked the handle).
    print('loading pickle file from disk')
    with open('{}.pickle'.format(PICKLE_FILE_NAME), 'rb') as f:
        cached = pickle.load(f)
    X_train, X_test, y_train, y_test, m = cached[:5]
else:
    X_train, X_test, y_train, y_test = util.load_data_set(30, 33)
    m = preprocessing.get_sido_onehot_map()
    with open('{}.pickle'.format(PICKLE_FILE_NAME), 'wb') as f:
        pickle.dump([X_train, X_test, y_train, y_test, m], f)
    print('finished dumping pickle file to disk')

"""
merge two other neural networks
https://statcompute.wordpress.com/2017/01/08/an-example-of-merge-layer-in-keras/
https://nhanitvn.wordpress.com/2016/09/27/a-keras-layer-for-one-hot-encoding/
"""
# d_features = Input(shape=(1, config.N_FEATURES, config.N_TIME_WINDOW), name="features")
# d_sido = Input(shape=(len(preprocessing.get_sido_nm_list()), ), name="sido_onehot")
d_features = Input(shape=(config.N_TIME_WINDOW, config.N_FEATURES), name="features")
# d_sido = Input(shape=(len(preprocessing.get_sido_nm_list()),), name="sido_onehot")
# Group trace indices by their single filter result: traces_map maps a
# result index -> list of positions i in traces_indices that produced it.
for i in range((len(traces_indices))):
    if len(res[i]) == 1:
        index = res[i][0]
        value = traces_map.get(index)
        if value is None:
            value = []
        value.append(i)
        traces_map.update({index: value})

print("Traces map single filter")
for k, v in traces_map.items():
    print(f"{k}: {v}")

import util

# Load 5000 normalized random-delay attack traces, skipping the first
# 40000 + 1000 traces.
loader_function = util.load_data_set(util.DataSet.RANDOM_DELAY_NORMALIZED)
traces_path = "/media/rico/Data/TU/thesis/data/"
x_attack, _, _ = loader_function({
    'use_hw': False,
    'traces_path': traces_path,
    'raw_traces': False,
    'start': 40000 + 1000,
    'size': 5000,
    'domain_knowledge': True,
    # NOTE(review): snippet appears truncated here -- the value for
    # 'use_noise_data' and the closing braces are missing from this view.
    'use_noise_data':
import random

import numpy as np

import k_means_clustering
import util

# Parameter
data_set_location = "datasets/Compound.csv"
total_class = 6
K = total_class  # one cluster per class

if __name__ == "__main__":
    # Load dataset
    data, labels = util.load_data_set(data_set_location, label_separated=True)
    data_set = (np.array(data), np.array(labels))

    # Visualization with no color
    util.visualize(data_set)

    # initialization cluster
    cluster = k_means_clustering.KMeansCluster(K, data_set[0])

    # initialization cluster by random point on each class
    data_separated, total_class = util.separate_data_by_class(data_set)
    class_centroid = []
    for label, data in data_separated.items():
        max_index = len(data[0])
        # Pick a random sample of this class as its initial centroid.
        random_centroid = random.randint(0, max_index - 1)
        # NOTE(review): snippet appears truncated here -- the append
        # argument and the rest of the script are missing from this view.
        class_centroid.append(
def load_data(args, network_name):
    """Load attack traces, key guesses and the real key for a data set.

    Dispatches on args.data_set: ASCAD variants and SIM_MASK have dedicated
    loaders in util; RANDOM_DELAY_LARGE and the generic branch use the
    generic loader plus CSV key-guess files.

    :param args: namespace with data_set, traces_path, sizes, etc.
    :param network_name: unused here; kept for interface compatibility.
    :return: tuple (_x_attack, _y_attack, _key_guesses, _real_key, _dk_plain).
    """
    _x_attack, _y_attack, _real_key, _dk_plain, _key_guesses = None, None, None, None, None
    argz = {
        'use_hw': args.use_hw,
        'traces_path': args.traces_path,
        'raw_traces': args.raw_traces,
        'start': args.train_size + args.validation_size,
        'size': args.attack_size,
        'train_size': args.train_size,
        'validation_size': args.validation_size,
        'domain_knowledge': True,
        'use_noise_data': args.use_noise_data,
        'data_set': args.data_set,
        'sub_key_index': args.subkey_index,
        'desync': args.desync,
        'unmask': args.unmask
    }
    if args.data_set == util.DataSet.ASCAD:
        _x_attack, _y_attack, _plain, _real_key, _key_guesses = util.load_ascad_test_traces(
            argz)
    elif args.data_set == util.DataSet.ASCAD_NORMALIZED:
        _x_attack, _y_attack, _key_guesses, _real_key = util.load_ascad_normalized_test_traces(
            argz)
    elif args.data_set == util.DataSet.SIM_MASK:
        _x_attack, _y_attack, _key_guesses, _real_key = util.load_sim_mask_test_traces(
            argz)
    elif args.data_set == util.DataSet.ASCAD_KEYS or args.data_set == util.DataSet.ASCAD_KEYS_NORMALIZED:
        _x_attack, _y_attack, _key_guesses, _real_key, _dk_plain = util.load_ascad_keys_test(
            argz)
    elif args.data_set == util.DataSet.RANDOM_DELAY_LARGE:
        ###################
        # Load the traces #
        ###################
        loader = util.load_data_set(args.data_set)
        total_x_attack, total_y_attack, plain = loader({
            'use_hw': args.use_hw,
            'traces_path': args.traces_path,
            'raw_traces': args.raw_traces,
            'start': args.train_size + args.validation_size,
            'size': args.attack_size,
            'domain_knowledge': True,
            'use_noise_data': args.use_noise_data,
            'data_set': args.data_set
        })
        # NOTE(review): unlike the generic branch below, 'plain' is not
        # converted to _dk_plain here -- confirm this is intentional.
        print('Loading key guesses')
        ####################################
        # Load the key guesses and the key #
        ####################################
        data_set_name = str(args.data_set)
        _key_guesses = util.load_random_delay_large_key_guesses(
            args.traces_path, args.train_size + args.validation_size,
            args.attack_size)
        # np.int was removed in NumPy 1.24 -- the builtin int is equivalent.
        _real_key = util.load_csv('{}/{}/secret_key.csv'.format(
            args.traces_path, data_set_name), dtype=int)
        _x_attack = total_x_attack
        _y_attack = total_y_attack
    else:
        ###################
        # Load the traces #
        ###################
        loader = util.load_data_set(args.data_set)
        total_x_attack, total_y_attack, plain = loader({
            'use_hw': args.use_hw,
            'traces_path': args.traces_path,
            'raw_traces': args.raw_traces,
            'start': args.train_size + args.validation_size,
            'size': args.attack_size,
            'domain_knowledge': True,
            'use_noise_data': args.use_noise_data,
            'data_set': args.data_set,
            'noise_level': args.noise_level
        })
        if plain is not None:
            _dk_plain = torch.from_numpy(plain).cuda()

        print('Loading key guesses')
        ####################################
        # Load the key guesses and the key #
        ####################################
        data_set_name = str(args.data_set)
        _key_guesses = util.load_csv(
            '{}/{}/Value/key_guesses_ALL_transposed.csv'.format(
                args.traces_path, data_set_name),
            delimiter=' ',
            dtype=int,
            start=args.train_size + args.validation_size,
            size=args.attack_size)
        _real_key = util.load_csv('{}/{}/secret_key.csv'.format(
            args.traces_path, data_set_name), dtype=int)
        _x_attack = total_x_attack
        _y_attack = total_y_attack
    return _x_attack, _y_attack, _key_guesses, _real_key, _dk_plain
from sys import argv

import hierarchy_cluster
import util

data_set_location = "dataset/Hierarchical_2.csv"

if __name__ == "__main__":
    # Load dataset
    data = util.load_data_set(data_set_location)

    # Visualization with no color
    util.visualize(data)

    # Dissimilarity method for the clustering, taken from the CLI:
    # ex: python3 main 1
    #   1 for single link
    #   2 for complete link
    #   3 for group average
    #   4 for centroid based
    try:
        linkage_type = int(argv[1])
    except (IndexError, ValueError):
        # Missing or non-numeric argument: default to single link.
        # (Narrowed from a bare 'except:' which also swallowed
        # KeyboardInterrupt/SystemExit; also renamed the local so the
        # builtin 'type' is no longer shadowed.)
        linkage_type = 1

    hierarchy_cluster.agglomerative_clustering(data, type=linkage_type)
import util
import numpy as np
import subprocess

data_set = util.DataSet.RANDOM_DELAY
data_loader = util.load_data_set(data_set)

files = []
step = 2000
# Process the 50000 random-delay traces in chunks of 2000.
for i in range(0, 50000, step):
    print(i)
    args = {
        "raw_traces": True,
        "start": i,
        "size": step,
        "traces_path": "/media/rico/Data/TU/thesis/data/",
        "use_hw": False
    }
    path_rd = '{}/Random_Delay/traces/'.format(args['traces_path'])
    x_train = util.load_csv(
        '{}/Random_Delay/traces/traces_complete.csv'.format(
            args['traces_path']),
        delimiter=' ',
        start=args.get('start'),
        size=args.get('size'))
    mean = np.mean(x_train, axis=0)
    # Gaussian noise, sigma=7, one 3500-sample row per trace in the chunk.
    # NOTE(review): snippet appears truncated here -- 'files', 'path_rd',
    # 'mean' and 'noise' are unused in this view; more code likely follows.
    noise = np.random.normal(0, 7, 3500 * args['size']).reshape(
        (args['size'], 3500))
def inference():
    """Run the pickled model on the test set and write OUTPUT_DIR/output.csv.

    Loads the model from VOL_DIR/model.dat, preprocesses the measurement
    table, predicts labels + probabilities, and saves them as CSV.
    """
    # 'with' closes the model file deterministically; the original
    # pickle.load(open(...)) leaked the file handle.
    with open(os.path.join(VOL_DIR, 'model.dat'), 'rb') as model_file:
        test_model = pickle.load(model_file)
    person_table, condition_occurrence_table, outcome_cohort_table, measurement_table = util.load_data_set(
        TEST_DIR)
    measurement_table = util.preprocess_measurement(measurement_table)
    y_pred, y_proba = util.predict(test_model, person_table,
                                   condition_occurrence_table,
                                   measurement_table, outcome_cohort_table)
    predict_result = pd.DataFrame({
        'LABEL': y_pred,
        'LABEL_PROBABILITY': y_proba
    })
    predict_result.to_csv(os.path.join(OUTPUT_DIR, 'output.csv'), index=False)
import math

import util

# Network hyper-parameters derived from the D31 data set's labels.
data_set = "datasets/D31.csv"
label = util.load_data_set(data_set, label_separated=True)[1]

# Initialization data MLP Neural Network
feature_dim = 2
output_layer = len(set(label))  # one output unit per distinct class
# hidden_layer = round(math.sqrt(feature_dim * output_layer))  # Takes very long when training.
hidden_layer = len(set(label))  # faster in the training phase
learning_rate = 0.01
# learning_rate = 0.001  # 87%
def run(args):
    """Train args.runs networks on the configured data set and save them.

    Loads train/validation data, then for each run initializes a fresh
    network, trains it (with or without domain knowledge), and saves the
    loss/accuracy history and the final model under args.model_save_path.

    :param args: namespace carrying data, model and training configuration.
    """
    # Save the models to this folder
    dir_name = generate_folder_name(args)

    # Arguments for loading data
    load_args = {"unmask": args.unmask,
                 "use_hw": args.use_hw,
                 "traces_path": args.traces_path,
                 "sub_key_index": args.subkey_index,
                 "raw_traces": args.raw_traces,
                 "size": args.train_size + args.validation_size,
                 "train_size": args.train_size,
                 "validation_size": args.validation_size,
                 "domain_knowledge": True,
                 "desync": args.desync,
                 "use_noise_data": args.use_noise_data,
                 "start": 0,
                 "data_set": args.data_set}

    # Load data and chop into the desired sizes: the loader returns one
    # contiguous block; the validation slice follows the training slice.
    load_function = load_data_set(args.data_set)
    print(load_args)
    x_train, y_train, plain = load_function(load_args)
    x_validation = x_train[args.train_size:args.train_size + args.validation_size]
    y_validation = y_train[args.train_size:args.train_size + args.validation_size]
    x_train = x_train[0:args.train_size]
    y_train = y_train[0:args.train_size]
    p_train = None
    p_validation = None
    if plain is not None:
        # Plaintexts (domain knowledge) are split the same way.
        p_train = plain[0:args.train_size]
        p_validation = plain[args.train_size:args.train_size + args.validation_size]
    print('Shape x: {}'.format(np.shape(x_train)))

    # Arguments for initializing the model
    init_args = {"sf": args.spread_factor,
                 "input_shape": args.input_shape,
                 # 9 classes for the Hamming-weight model, 256 for values
                 "n_classes": 9 if args.use_hw else 256,
                 "kernel_size": args.kernel_size,
                 "channel_size": args.channel_size,
                 "num_layers": args.num_layers,
                 "max_pool": args.max_pool
                 }

    # Do the runs
    for i in range(args.runs):
        # Initialize the network and the weights
        network = args.init(init_args)
        init_weights(network, args.init_weights)

        # Filename of the model + the folder
        filename = 'model_r{}_{}'.format(i, network.name())
        model_save_file = '{}/{}/{}.pt'.format(args.model_save_path, dir_name, filename)
        print('Training with learning rate: {}, desync {}'.format(args.lr, args.desync))

        if args.domain_knowledge:
            network, res = train_dk2(x_train, y_train, p_train,
                                     train_size=args.train_size,
                                     x_validation=x_validation,
                                     y_validation=y_validation,
                                     p_validation=p_validation,
                                     validation_size=args.validation_size,
                                     network=network,
                                     epochs=args.epochs,
                                     batch_size=args.batch_size,
                                     lr=args.lr,
                                     checkpoints=args.checkpoints,
                                     save_path=model_save_file,
                                     loss_function=args.loss_function,
                                     l2_penalty=args.l2_penalty,
                                     )
        else:
            network, res = train(x_train, y_train,
                                 train_size=args.train_size,
                                 x_validation=x_validation,
                                 y_validation=y_validation,
                                 validation_size=args.validation_size,
                                 network=network,
                                 epochs=args.epochs,
                                 batch_size=args.batch_size,
                                 lr=args.lr,
                                 checkpoints=args.checkpoints,
                                 save_path=model_save_file,
                                 loss_function=args.loss_function,
                                 l2_penalty=args.l2_penalty,
                                 optimizer=args.optimizer
                                 )

        # Save the results of the accuracy and loss during training
        save_loss_acc(model_save_file, filename, res)

        # Make sure don't mess with our min/max of the spread network
        if isinstance(network, SpreadNet):
            network.training = False

        # Save the final model
        save_model(network, model_save_file)
import math

import naive_bayes
import util

data_set = util.load_data_set('datasets/Compound.csv')

if __name__ == "__main__":
    # Visualize the raw data set
    util.visualize(data_set)

    # Make Naive Bayes Classifier (Gaussian)
    classifier = naive_bayes.NaiveBayes(data_set)

    # Evaluate data set & make confusion matrix
    evaluated_data, confusion_matrix = classifier.evaluate(data_set)

    # compare & visualize between dataset and evaluated data
    util.compare_data(data_set, evaluated_data)

    # performance calculation (accuracy)
    # print "Accuracy : {}".format(util.performance_calculation(confusion_matrix))

    # performance calculation (f1_score)
    util.performance_calculation(confusion_matrix, mode="f1_micro_average")

    # util.decision_boundary(data_set, classifier)
def decision_tree_pruning_car():
    """Compare an unpruned vs. pruned decision tree on the car data set.

    Trains two trees (no pruning, then min_samples_leaf=5 / max_depth=9),
    printing metrics and saving learning-curve plots and tree exports for
    each under out/decision_tree_pruning/.
    """
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] +
                car_data['test']['inputs'])  # encode features as one-hot

    # --- Unpruned tree ---
    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random",
    )
    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']
    t_out = car_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("car.dataset (no pruning)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Decision Trees (car.dataset, no pruning)",
        cv=5)
    plt.savefig('out/decision_tree_pruning/car-noprune-learning.png')
    export_decision_tree(clf, 'car-noprune')

    # --- Pruned tree: same split settings plus pre-pruning constraints ---
    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random",
        min_samples_leaf=5,  # minimum of 5 samples at leaf nodes
        max_depth=9
    )
    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
    time_to_fit = t.interval * 1000
    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']
    t_out = car_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("car.dataset (pruned)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Decision Trees (car.dataset, pruned)",
        cv=5)
    plt.savefig('out/decision_tree_pruning/car-prune-learning.png')
    export_decision_tree(clf, 'car-prune')
import numpy as np
import os.path

import params
import util
from neural_network import MLPNeuralNetwork
from matplotlib import pyplot as plt

# Load data set & Normalization
data, labels = util.load_data_set(params.data_set, label_separated=True)
data, labels = util.normalization(np.array(data)), np.array(labels)
data_set = (data, labels)

# Make classifier
classifier = MLPNeuralNetwork(params.hidden_layer, params.output_layer,
                              params.feature_dim, params.learning_rate)

# Load weight data if exist (resume from previously saved weights/biases)
if os.path.exists("training_data/W1.npy") and os.path.exists("training_data/W2.npy") \
        and os.path.exists("training_data/B1.npy") and os.path.exists("training_data/B2.npy"):
    classifier.W1 = np.load("training_data/W1.npy")
    classifier.W2 = np.load("training_data/W2.npy")
    classifier.B1 = np.load("training_data/B1.npy")
    classifier.B2 = np.load("training_data/B2.npy")

# Training classifier
minimum_error = 0.2
error = 100.0
acc = 0.0
# NOTE(review): snippet appears truncated here -- the body of this 'if'
# (resuming accuracy/MSE/epoch history) is missing from this view.
if os.path.exists("training_data/accuracy_visual.npy") and os.path.exists("training_data/mse_visual.npy") \
        and os.path.exists("training_data/epoch.npy"):
def decision_tree_pruning_cancer():
    """Compare an unpruned vs. pruned decision tree on the breast-cancer set.

    Trains two trees (no pruning, then min_samples_leaf=10 / max_depth=5),
    printing metrics and saving learning-curve plots and tree exports for
    each under out/decision_tree_pruning/.
    """
    cancer_data = load_data_set('breastcancer')
    # Impute missing values (NaN) with the per-column mean, fitted over
    # train + test inputs combined.
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(np.array(cancer_data['train']['inputs'] +
                            cancer_data['test']['inputs'],
                            dtype=np.float32))

    # --- Unpruned tree ---
    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random"
    )
    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']
    t_out = cancer_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("breastcancer.dataset (no pruning)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Decision Trees (breastcancer.dataset, no pruning)",
        cv=5)
    plt.savefig('out/decision_tree_pruning/breastcancer-noprune-learning.png')
    export_decision_tree(clf, 'breastcancer-noprune')

    # --- Pruned tree: same split settings plus pre-pruning constraints ---
    clf = tree.DecisionTreeClassifier(
        criterion="gini",
        splitter="random",
        min_samples_leaf=10,  # minimum of 10 samples at leaf nodes
        max_depth=5
    )
    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
    time_to_fit = t.interval * 1000
    predicted = clf.predict(cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']
    t_out = cancer_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("breastcancer.dataset (pruned)")
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: Decision Trees (breastcancer.dataset, pruned)",
        cv=5)
    plt.savefig('out/decision_tree_pruning/breastcancer-prune-learning.png')
    export_decision_tree(clf, 'breastcancer-prune')
def knn_cancer(k_value=1):
    """Evaluate kNN on the breast-cancer data set.

    First sweeps k = 1..30 and plots train/test F1 and cross-validation
    scores, then trains a detailed model with the given k_value, printing
    metrics and saving a learning-curve plot under out/knn/.

    :param k_value: n_neighbors for the detailed run.
    """
    cancer_data = load_data_set('breastcancer')
    # Impute missing values (NaN) with the per-column mean, fitted over
    # train + test inputs combined.
    cancer_imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    cancer_imp.fit(
        np.array(cancer_data['train']['inputs'] +
                 cancer_data['test']['inputs'],
                 dtype=np.float32))
    x = list()
    y_train = list()
    y_test = list()
    y_cross = list()
    # chart different k-values vs. f1 score first
    for i in range(30):
        _k = i + 1
        clf = KNeighborsClassifier(n_neighbors=_k)
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['train']['inputs']))
        train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                          predicted,
                                          average='micro')
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['test']['inputs']))
        test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                         predicted,
                                         average='micro')
        # Cross-validation score is computed over the full data set.
        data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                       cancer_data['test']['inputs'])
        data_out = cancer_data['train']['outputs'] + cancer_data['test'][
            'outputs']
        cross_val = cross_val_score(clf, data_in, data_out, cv=5)
        x.append(_k)
        y_train.append(train_f1_score)
        y_test.append(test_f1_score)
        y_cross.append(np.mean(cross_val))
    plt.figure()
    plt.title('Scores for various k (breastcancer.dataset)')
    plt.xlabel('k value')
    plt.ylabel('Score')
    plt.plot(x, y_train, label='Training F1 score')
    plt.plot(x, y_test, label='Testing F1 score')
    plt.plot(x, y_cross, label='Cross-validation score')
    plt.legend()
    plt.savefig('out/knn/breastcancer-k-testing.png')

    # chart with given k-value for detail
    clf = KNeighborsClassifier(n_neighbors=k_value)
    with Timer() as t:
        clf.fit(cancer_imp.transform(cancer_data['train']['inputs']),
                cancer_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(
        cancer_imp.transform(cancer_data['train']['inputs']))
    train_f1_score = metrics.f1_score(cancer_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(
            cancer_imp.transform(cancer_data['test']['inputs']))
    test_f1_score = metrics.f1_score(cancer_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    data_in = cancer_imp.transform(cancer_data['train']['inputs'] +
                                   cancer_data['test']['inputs'])
    data_out = cancer_data['train']['outputs'] + cancer_data['test']['outputs']
    t_out = cancer_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("breastcancer.dataset (k={})".format(k_value))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: kNN (breastcancer.dataset, k={})".format(
            k_value),
        cv=5)
    plt.savefig('out/knn/breastcancer-k-{}.png'.format(k_value))
def knn_car(k_value=1):
    """Evaluate kNN on the car data set.

    First sweeps k = 1..30 and plots train/test F1 and cross-validation
    scores, then trains a detailed model with the given k_value, printing
    metrics and saving a learning-curve plot under out/knn/.

    :param k_value: n_neighbors for the detailed run.
    """
    car_data = load_data_set('car')
    car_ohe = preprocessing.OneHotEncoder()
    car_ohe.fit(car_data['train']['inputs'] +
                car_data['test']['inputs'])  # encode features as one-hot
    x = list()
    y_train = list()
    y_test = list()
    y_cross = list()
    # chart different k-values vs. f1 score first
    for i in range(30):
        _k = i + 1
        clf = KNeighborsClassifier(n_neighbors=_k)
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
        predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
        train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                          predicted,
                                          average='micro')
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
        test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                         predicted,
                                         average='micro')
        # Cross-validation score is computed over the full data set.
        data_in = car_ohe.transform(car_data['train']['inputs'] +
                                    car_data['test']['inputs'])
        data_out = car_data['train']['outputs'] + car_data['test']['outputs']
        cross_val = cross_val_score(clf, data_in, data_out, cv=5)
        x.append(_k)
        y_train.append(train_f1_score)
        y_test.append(test_f1_score)
        y_cross.append(np.mean(cross_val))
    plt.figure()
    plt.title('Scores for various k (car.dataset)')
    plt.xlabel('k value')
    plt.ylabel('Score')
    plt.plot(x, y_train, label='Training F1 score')
    plt.plot(x, y_test, label='Testing F1 score')
    plt.plot(x, y_cross, label='Cross-validation score')
    plt.legend()
    plt.savefig('out/knn/car-k-testing.png')

    # Detailed run with the requested k.
    clf = KNeighborsClassifier(n_neighbors=k_value)
    with Timer() as t:
        clf.fit(car_ohe.transform(car_data['train']['inputs']),
                car_data['train']['outputs'])
    # * 1000 -- presumably Timer.interval is in seconds; TODO confirm
    time_to_fit = t.interval * 1000
    predicted = clf.predict(car_ohe.transform(car_data['train']['inputs']))
    train_f1_score = metrics.f1_score(car_data['train']['outputs'],
                                      predicted,
                                      average='micro')
    with Timer() as t:
        predicted = clf.predict(car_ohe.transform(car_data['test']['inputs']))
    test_f1_score = metrics.f1_score(car_data['test']['outputs'],
                                     predicted,
                                     average='micro')
    test_prediction_runtime = t.interval * 1000
    data_in = car_ohe.transform(car_data['train']['inputs'] +
                                car_data['test']['inputs'])
    data_out = car_data['train']['outputs'] + car_data['test']['outputs']
    t_out = car_data['test']['outputs']
    accuracy = accuracy_score(t_out, predicted) * 100
    precision = precision_score(t_out, predicted, average="weighted") * 100
    print("car.dataset (k={})".format(k_value))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()
    skplt.estimators.plot_learning_curve(
        clf,
        data_in,
        data_out,
        title="Learning Curve: kNN (car.dataset, k={})".format(k_value),
        cv=5)
    plt.savefig('out/knn/car-k-{}.png'.format(k_value))
import tensorflow as tf

from cnn import Cnn
import config
import util

# Load and preprocess the data set; labels are one-hot encoded via
# class_num (one column per class).
x_train_orig, y_train_orig, x_test_orig, y_test_orig, classes = util.load_data_set(
)
x_train = util.pre_treat(x_train_orig)
x_test = util.pre_treat(x_test_orig)
y_train = util.pre_treat(y_train_orig, is_x=False, class_num=len(classes))
y_test = util.pre_treat(y_test_orig, is_x=False, class_num=len(classes))

cnn = Cnn(config.conv_layers, config.fc_layers, config.filters,
          config.learning_rate, config.beta1, config.beta2)
(m, n_H0, n_W0, n_C0) = x_train.shape
n_y = y_train.shape[1]

# construction calculation graph (TF1-style static graph)
cnn.initialize(n_H0, n_W0, n_C0, n_y)
cnn.forward()
cost = cnn.cost()
optimizer = cnn.get_optimizer(cost)
predict, accuracy = cnn.predict()
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    # NOTE(review): snippet appears truncated here -- the epoch-loop body
    # is missing from this view.
    for i in range(1, config.num_epochs + 1):
def train():
    """Train a gradient-boosting model on TRAIN_DIR data and pickle it.

    Loads the tables from TRAIN_DIR, preprocesses the measurement table,
    fits the model via util.train_model and writes it to VOL_DIR/model.dat.
    """
    test_model = ensemble.GradientBoostingClassifier()
    person_table, condition_occurrence_table, outcome_cohort_table, measurement_table = util.load_data_set(
        TRAIN_DIR)
    measurement_table = util.preprocess_measurement(measurement_table)
    test_model = util.train_model(test_model, person_table,
                                  condition_occurrence_table,
                                  measurement_table, outcome_cohort_table)
    # 'with' closes the file deterministically; the original
    # pickle.dump(..., open(...)) leaked the file handle.
    with open(os.path.join(VOL_DIR, 'model.dat'), 'wb') as model_file:
        pickle.dump(test_model, model_file)
    # data input  (translated from the original Korean comment)