import numpy as np
# `clss` is the project's classifier module (e.g. `import classifier as clss`),
# assumed to expose svm(X, Y, lambd) -> (beta, v) and svm_test(X, Y, beta, v) -> error.


def cros_validate(number_lambd, X_train, Y_train, X_test, Y_test,
                  X_shuffle_train, Y_shuffle_train):
    # Sweep the regularization strength lambda over a log grid and record the
    # train/test error for both the original and the label-shuffled training sets.
    lambda_vals = np.logspace(-4, -1, number_lambd)
    test_error = np.zeros(number_lambd)
    train_error = np.zeros(number_lambd)
    for j, lambd in enumerate(lambda_vals):
        beta, v = clss.svm(X_train, Y_train, lambd)
        test_error[j] = clss.svm_test(X_test, Y_test, beta, v)
        train_error[j] = clss.svm_test(X_train, Y_train, beta, v)

    shuffle_test_error = np.zeros(number_lambd)
    shuffle_train_error = np.zeros(number_lambd)
    for j, lambd in enumerate(lambda_vals):
        beta2, v2 = clss.svm(X_shuffle_train, Y_shuffle_train, lambd)
        shuffle_test_error[j] = clss.svm_test(X_test, Y_test, beta2, v2)
        shuffle_train_error[j] = clss.svm_test(X_shuffle_train, Y_shuffle_train, beta2, v2)

    # Only the two test-error curves are returned; the train errors are
    # computed but unused, as in the original code.
    return test_error, shuffle_test_error
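# A minimal, hedged usage sketch for cros_validate(). Assumptions: `classifier`
# is the project's module aliased as `clss` with the interface noted above, and
# the random arrays below are placeholders for real features/labels. Shuffled
# training labels act as a sanity check: the shuffled test error should stay
# near chance while the real test error drops.
import numpy as np
import classifier as clss  # assumed module name / alias

rng = np.random.default_rng(0)
X_train, X_test = rng.random((80, 5)), rng.random((20, 5))
Y_train = np.where(rng.random(80) > 0.5, 1, -1)
Y_test = np.where(rng.random(20) > 0.5, 1, -1)
Y_shuffle_train = rng.permutation(Y_train)

test_err, shuffle_test_err = cros_validate(
    10, X_train, Y_train, X_test, Y_test, X_train, Y_shuffle_train)
print(test_err)
print(shuffle_test_err)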
import numpy as np
from sklearn.model_selection import train_test_split
# `svm(xtrain, ytrain, xtest, ytest)` is the project's own wrapper; it returns
# an object exposing accurracy() (spelling kept as in the original code).


def test_svm(df):
    # Table with the experiment results
    table = {}
    x = df[['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']]
    y = df['class'].transform(lambda k: 1 if bool(k) else -1)
    for t in range(60, 91, 3):
        table[t] = []
        for i in range(1, 41):
            print(f"Executing: train={t}% iteration={i} ...")
            xtrain, xtest, ytrain, ytest = train_test_split(
                x, y, test_size=((100 - t) / 100), random_state=None, stratify=y)
            results = svm(xtrain, ytrain, xtest, ytest)
            print(f"Accuracy: {results.accurracy()}")
            table[t].append(results.accurracy())
        table[t] = {
            'min': np.min(table[t]),
            'mean': np.mean(table[t]),
            'max': np.max(table[t])
        }
    return table
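# Hedged usage sketch for test_svm(): it assumes a Pima-diabetes-style CSV whose
# columns match the names referenced above ('preg', 'plas', ..., 'class');
# "diabetes.csv" is a placeholder path, not a file known to ship with this code.
import pandas as pd

df = pd.read_csv("diabetes.csv")
summary = test_svm(df)
for t, stats in summary.items():
    print(f"train={t}%: min={stats['min']:.3f} mean={stats['mean']:.3f} max={stats['max']:.3f}")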
import numpy as np
from classifier import svm, svm_2, svm_test

lambd = .01
X = np.random.rand(4, 3)
Y = np.array([0, 1, 2, 3])  # np.array([[1,1],[0,0],[2,3]])
# Y = np.array([1,0,2])
beta, v = svm(X, Y, lambd)
# Y2 = np.array([1,-1])
# beta2, v2 = svm_2(X,Y2,lambd)
test_error = svm_test(X, Y, beta, v)
print(test_error)
svm = config["svm_linear"] elif svm_kernel == "rbf": svm = config["svm_rbf"] else: print("Invalid kernel for svm") exit() print(svm) model = CNNModel(signet, svm["model_path"]) images_dictionary = {} list_of_signatures_use_on_train = [] list_of_signatures_use_on_test = [] weights = {1: config["c-plus"], 0: svm["c-minus"]} svc_linear = classifier.svm(gamma='auto', weights=weights, kernel="linear") print(svc_linear) svc_rbf = classifier.svm(gamma=2**(-11), weights=weights, kernel="rbf") print(svc_rbf) mlp = classifier.mlp(0.0001, (100, 500)) print(mlp) knn = classifier.knn(3, "uniform") print(knn) tree = classifier.tree(weights, "log2", "gini", 0.0000001) print(tree) random_users = get_signature_folders(config["dataset_for_random_path"]) print("Loading list for random users to train") train_config = config["train_config"]
def DAE_trainer(learning_rate=1e-3, batch_size=100, num_epoch=10,
                hidden_layers=[7, 4, 2], input_dim=0, step=20,
                X_train=[], X_test=[], Y_train=[], Y_test=[],
                dt=[], noise_factor=0.25):
    model1 = DAE(learning_rate=learning_rate, batch_size=batch_size,
                 hidden_layers=hidden_layers, input_dim=input_dim,
                 noise_factor=noise_factor)
    for epoch in range(num_epoch):
        num_sample = len(X_train)
        for iter in range(num_sample // batch_size):
            X_mb, _ = dt.train.next_batch(batch_size)
            # Execute the forward and the backward pass and report computed losses
            recon_loss = model1.run_single_step(X_mb)
        if epoch % step == 0:
            chartcolumn.take_display_traning('Epoch ' + str(epoch) +
                                             ' Recon loss: ' + str(recon_loss))
            # model.writer.add_summary(summary, epoch)
            z_train = model1.transformer(X_train)
            s = time.time()
            z_test = model1.transformer(X_test)
            e = time.time()
            t_tr = (e - s) / float(len(X_test))
            auc_svm, t1 = classifier.svm(z_train, Y_train, z_test, Y_test)
            auc_dt, t2 = classifier.decisiontree(z_train, Y_train, z_test, Y_test)
            auc_rf, t3 = classifier.rf(z_train, Y_train, z_test, Y_test)
            auc_nb, t4 = classifier.naive_baves(z_train, Y_train, z_test, Y_test)
            auc_kn, t5 = classifier.KNeighbors(z_train, Y_train, z_test, Y_test)
            auc_logistic, t6 = classifier.Logistic(z_train, Y_train, z_test, Y_test)
            DAE_recon_loss_.append(recon_loss)
            DAE_auc_svm_.append(auc_svm)
            DAE_auc_dt_.append(auc_dt)
            DAE_auc_rf_.append(auc_rf)
            DAE_auc_nb_.append(auc_nb)
            DAE_auc_kn_.append(auc_kn)
            DAE_auc_logistic_.append(auc_logistic)
            DAE_t1_.append((t1 + t_tr))
            DAE_t2_.append((t2 + t_tr))
            DAE_t3_.append((t3 + t_tr))
            DAE_t4_.append((t4 + t_tr))
            DAE_t5_.append((t5 + t_tr))
            DAE_t6_.append((t6 + t_tr))
    print('Done DAE!')
    return model1
def VAE_trainer(learning_rate=1e-3, batch_size=100, num_epoch=10,
                hidden_layers=[7, 4, 2], input_dim=0, step=20,
                X_train=[], X_test=[], Y_train=[], Y_test=[], dt=[]):
    model = VAE(learning_rate=learning_rate, batch_size=batch_size,
                hidden_layers=hidden_layers, input_dim=input_dim)
    for epoch in range(num_epoch):
        num_sample = len(X_train)
        for iter in range(num_sample // batch_size):
            X_mb, _ = dt.train.next_batch(batch_size)
            # Execute the forward and the backward pass and report computed losses
            loss, recon_loss, latent_loss = model.run_single_step(X_mb)
        if epoch % step == 0:
            print('[Epoch {}] Loss: {}, Recon loss: {}, Latent loss: {}'.format(
                epoch, loss, recon_loss, latent_loss))
            chartcolumn.take_display_traning('Epoch ' + str(epoch) +
                                             ' Loss: ' + str(loss) +
                                             ' Recon loss: ' + str(recon_loss) +
                                             ' Latent loss: ' + str(latent_loss))
            z_train = model.transformer(X_train)
            s = time.time()
            z_test = model.transformer(X_test)
            e = time.time()
            t_tr = (e - s) / len(X_test)
            # np.savetxt(path + "z_train_" + str(epoch) + ".csv", z_train, delimiter=",", fmt='%f')
            # np.savetxt(path + "z_test_" + str(epoch) + ".csv", z_train, delimiter=",", fmt='%f')
            auc_svm, t1 = classifier.svm(z_train, Y_train, z_test, Y_test)
            auc_dt, t2 = classifier.decisiontree(z_train, Y_train, z_test, Y_test)
            auc_rf, t3 = classifier.rf(z_train, Y_train, z_test, Y_test)
            auc_nb, t4 = classifier.naive_baves(z_train, Y_train, z_test, Y_test)
            VAE_loss_.append(loss)
            VAE_recon_loss_.append(recon_loss)
            VAE_latent_loss_.append(latent_loss)
            VAE_auc_svm_.append(auc_svm)
            VAE_auc_dt_.append(auc_dt)
            VAE_auc_rf_.append(auc_rf)
            VAE_auc_nb_.append(auc_nb)
            VAE_t1_.append((t1 + t_tr))
            VAE_t2_.append((t2 + t_tr))
            VAE_t3_.append((t3 + t_tr))
            VAE_t4_.append((t4 + t_tr))
    print('Done VAE!')
    return model
def make_init_deep_learning(data_index, pathTrain, pathTest, pathColumn, AUC_and_Structure):
    datasets = np.asarray([
        "unsw", "ctu13_8", "Ads", "Phishing", "IoT", "Spam", "Antivirus",
        "VirusShare", 'nslkdd'
    ])  # 30 ? 57 513 482
    dataname = datasets[data_index]
    dt = shuttle.read_data_sets(dataname, pathTrain, pathTest, pathColumn)
    num_sample = dt.train.num_examples
    chartcolumn.take_display_traning("size of dataset: " + str(num_sample))
    input_dim = dt.train.features.shape[1]
    balance_rate = len(dt.train.features1) / float(len(dt.train.features0))
    label_dim = dt.train.labels.shape[1]
    chartcolumn.take_display_traning("dimension: " + str(input_dim))
    chartcolumn.take_display_traning("number of class: " + str(label_dim))

    data_save = np.asarray([data_index, input_dim, balance_rate, label_dim])
    data_save = np.reshape(data_save, (-1, 4))
    if os.path.isfile("Results/Infomation/" + dataname + "/datainformation.csv"):
        auc = np.genfromtxt("Results/Infomation/" + dataname + "/datainformation.csv", delimiter=',')
        auc = np.reshape(auc, (-1, 4))
        data_save = np.concatenate((auc, data_save), axis=0)
        np.savetxt("Results/Infomation/" + dataname + "/datainformation.csv",
                   data_save, delimiter=",", fmt="%f")
    else:
        np.savetxt("Results/Infomation/" + dataname + "/datainformation.csv",
                   data_save, delimiter=",", fmt="%f")

    num_epoch = 2000
    step = 20
    # filter_sizes = np.asarray([[1, 2, 3], [3, 5, 7], [1, 2, 3], [7, 11, 15], [1, 2, 3], [3, 5, 7], [1, 2, 3]])
    # data_shapes = np.asarray([[12, 12], [14, 14], [8, 8], [40, 40], [9, 9], [25, 25], [8, 8]])
    hidden_layer = hidden_layers[data_index]
    block_size = batch_sizes[data_index]
    lr = 1e-4
    noise_factor = 0.0025  # 0, 0.0001, 0.001, 0.01, 0.1, 1.0
    # filter_size = filter_sizes[data_index]  # [1,2,3]
    # data_shape = data_shapes[data_index]  # [12,12]
    conf = str(num_epoch) + "_" + str(block_size) + "_" + str(lr) + "_" + str(
        hidden_layer[0]) + "_" + str(hidden_layer[1]) + "_" + str(
            hidden_layer[2]) + "_noise: " + str(noise_factor)

    X_train = dt.train.features
    Y_train = dt.train.labels
    X_test = dt.test.features
    Y_test = dt.test.labels

    # Baseline AUC of the classical classifiers on the raw (untransformed) features.
    svm, t1 = classifier.svm(X_train, Y_train, X_test, Y_test)
    auc_dt, t2 = classifier.decisiontree(X_train, Y_train, X_test, Y_test)
    rf, t3 = classifier.rf(X_train, Y_train, X_test, Y_test)
    nb, t4 = classifier.naive_baves(X_train, Y_train, X_test, Y_test)
    kn, t5 = classifier.KNeighbors(X_train, Y_train, X_test, Y_test)
    logistic, t6 = classifier.Logistic(X_train, Y_train, X_test, Y_test)

    data_save = np.asarray([
        data_index, input_dim, balance_rate, svm, auc_dt, rf, nb,
        1000 * t1, 1000 * t2, 1000 * t3, 1000 * t4
    ])
    data_save = np.reshape(data_save, (-1, 11))
    AUC_and_Structure.append(svm)
    AUC_and_Structure.append(auc_dt)
    AUC_and_Structure.append(rf)
    AUC_and_Structure.append(nb)
    AUC_and_Structure.append(kn)
    AUC_and_Structure.append(logistic)

    if os.path.isfile("Results/RF_AUC_DIF/" + dataname + "/AUC_Input.csv"):
        auc = np.genfromtxt("Results/RF_AUC_DIF/" + dataname + "/AUC_Input.csv", delimiter=',')
        auc = np.reshape(auc, (-1, 11))
        data_save = np.concatenate((auc, data_save), axis=0)
        np.savetxt("Results/RF_AUC_DIF/" + dataname + "/AUC_Input.csv",
                   data_save, delimiter=",", fmt="%f")
    else:
        np.savetxt("Results/RF_AUC_DIF/" + dataname + "/AUC_Input.csv",
                   data_save, delimiter=",", fmt="%f")

    return data_index, input_dim, balance_rate, lr, block_size, num_epoch, hidden_layer, \
        step, X_train, X_test, Y_train, Y_test, dt, label_dim, noise_factor, conf, \
        dataname
count_s += 1
correct_class.append(0)

if not is_mcyt:
    dataset_folders = os.listdir(dataset_path)
    # list() keeps random.sample() working when filter() returns an iterator (Python 3)
    dataset_folders_filtered = list(filter(filter_dataset_folders, dataset_folders))
    dataset_folders_sample = random.sample(dataset_folders_filtered, 10)
    print("Adding Random to test set (Only for GPDS's dataset)")
    for p in dataset_folders_sample:
        f = os.listdir(dataset_path + p)
        # Load and pre-process the signature
        f_filtered = list(filter(filter_genuine, f))
        f_sample = random.sample(f_filtered, 1)[0]
        filename = os.path.join(dataset_path + p, f_sample)
        original = imread(filename, flatten=1)
        processed = preprocess_signature(original, canvas_size)
        # Use the CNN to extract features
        feature_vector = model.get_feature_vector(processed)
        data.append(feature_vector[0])
        correct_class.append(0)

data_test = np.array(data)
print("Correct test-set classes: ")
print(correct_class)

classifier.knn(data_train, data_test, expected, correct_class)
classifier.svm(data_train, data_test, expected, correct_class)
classifier.mlp(data_train, data_test, expected, correct_class)
classifier.tree(data_train, data_test, expected, correct_class)
def main(): p = Path("./result") if not p.exists(): os.makedirs(p) parser = argparse.ArgumentParser( description='Bioinf project. The arguments can be passed in any order.' ) classes = parser.add_mutually_exclusive_group() classes.add_argument('-cl2', help='in order to classify two cancer types.', action='store_true') classes.add_argument( '-cl3', help='in order to classify two cancer types AND sane.', action='store_true') classifier = parser.add_mutually_exclusive_group() classifier.add_argument('-svm', help='train a Support Vector Machine classifier', action='store_true') classifier.add_argument('-knn', help='train a K Nearest Neighbors classifier', action='store_true') classifier.add_argument('-rforest', help='train a Random Forest classifier', action='store_true') classifier.add_argument('-kmeans', help='train a Kmeans clustering', action='store_true') classifier.add_argument( '-hierarc', help='train an Agglomerative Hierarchical clustering', action='store_true') inbalance = parser.add_mutually_exclusive_group() inbalance.add_argument('-over', help='imbalance: Random Oversampling ', action='store_true') inbalance.add_argument('-smote', help='imbalance: SMOTE', action='store_true') preprocess = parser.add_mutually_exclusive_group() preprocess.add_argument( '-ttest', help= 'feature selection: ttest per chromosoma and per cpg site - 2 classes', action='store_true') preprocess.add_argument( '-fisher', help='feature selection: fisher criterion - 3 classes', action='store_true') preprocess.add_argument('-anova', help='feature selection: anova - 3 classes', action='store_true') preprocess.add_argument( '-pca', help='dimensionality reduction: Principal Component Analisys', action='store_true') preprocess.add_argument( '-lda', help='dimensionality reduction: Linear Discriminant Analysis', action='store_true') preprocess.add_argument( '-sfs', help= 'feature selection - wrapper: Step Forward Selection (nearly unfeasible)', action='store_true') preprocess.add_argument( '-ga', help='feature selection - wrapper: Genetic Algorithm', action='store_true') parser.add_argument( '-d', '--download', nargs=2, help='download Adenoma and Adenocarcinoma and Squamous Cell Neoplasm ' + 'data from Genomic Data Common. 
It needs 2 parameters: ' + 'first parameter is the destination folder; ' + 'second parameters is the number of files to be downloaded for each class ', action='store') parser.add_argument( '-ds', '--downloadsane', nargs=2, help='download Sane data from Genomic Data Common' + 'It needs 2 parameters: ' + 'first parameter is the destination folder; ' + 'second parameters is the number of files to be downloaded ', action='store') parser.add_argument( '-s', '--store', help= 'concatenate files belonging to same cancer type and store them in a binary file', action='store') parser.add_argument( '--alpha', type=float, default=0.001, help='to set a different ALPHA: ttest parameter - default is 0.001', action='store') parser.add_argument( '--perc', type=float, default=0.95, help='to set PERC of varaince explained by the features kept by PCA', action='store') parser.add_argument( '-rs', '--r_state', type=int, default=8, help='to set a user defined Random State - default is 8', action='store') parser.add_argument('--only_chrms_t', default=False, help='select only chrms for ttest', action='store_true') parser.add_argument( '--crossval', help= 'to do crossvalidation OR in case of unsupervised to plot the Inertia curve', action='store_true') parser.add_argument('--plot_lc', help='plot the learning curve', action='store_true') parser.add_argument( '--remove_nan_cpgs', type=str2bool, default=True, help='IF True: removes features containing at least one NaN value. ' + 'IF False: NaN are substituted by the mean over the feature. ' + 'The old file resulted by feature reduction must be eliminated when changing option. ' + 'By Default is True.', action='store') args = parser.parse_args() if args.download: print("download ") dgdc.getDataEx(path=args.download[0], file_n=args.download[1]) if args.downloadsane: print("download sane ") dgdc.getSaneDataEx(path=args.downloadsane[0], file_n=args.downloadsane[1]) if args.store: print("store") dgdc.storeDataIntoBinary(path=args.store) print("Data stored.") # validity checks if not args.cl2 and not args.cl3: print( "insert arg -cl2 for classifying 2 classes OR -cl3 for 3 classes") return # parameters and variables alpha = args.alpha # alpha parameter for t-test perc = args.perc # percentage of variance explained classes = 2 if args.cl2 else 3 random_state = args.r_state no_nan = args.remove_nan_cpgs n_components = 100 cl.setPlot_lc(args.plot_lc) cl.addToName("cl{}".format(classes)) cl.addToName("rs{}".format(random_state)) # load data print("Loading....") x, y, chrms_pos = pr.loadData(classes=classes) if no_nan: cl.addToName("no_nan") length = x.shape[1] x = pr.removeNanFeature(x) print("{} NaN features removed!".format(length - x.shape[1])) print("Loaded!") x_train, x_test, y_train, y_test = sk.model_selection.train_test_split( x, y, test_size=0.2, random_state=random_state) del x, y # preprocess if args.ttest: if classes != 2: print("wrong number of classes") return #print("Start ttest axis={}....".format(args.ttest)) r, cpg_r = pr.compute_t_test(x_train, y_train, chrms_pos, alpha, random_state, axis=0, remove_nan=no_nan) print(r) cl.addToName("ttest{}".format(args.ttest)) length = x_train.shape[1] x_train, x_test = pr.removeFeatures(x_train, x_test, cpg_r, chrms_pos, args.only_chrms_t, remove_nan=no_nan, y_train=y_train) print("Features removed: {}".format(length - x_train.shape[1])) print("End ttest!") if args.ga: print("genetic algorithm") cl.addToName("ga") # per lavorare con meno componenti # x_train = x_train[:, 1:100] result = g.GA_function(x_train, y_train, 
random_state, classes, 0.1) path = Path('./data/GA_{}_{}.npy'.format(random_state, classes)) np.save(path, result) x_train = x_train[:, result] x_test = x_test[:, result] if args.pca: print("pca") cl.addToName("pca") x_train, x_test = pr.pca_function(x_train, x_test, y_train, y_test, classes, perc, random_state, name=cl.name, remove_nan=no_nan) if args.lda: #print("lda - {} components".format(args.lda)) cl.addToName("lda") x_train, x_test = pr.lda_function(x_train, x_test, y_train, y_test, classes, args.lda, random_state, cl.name) if args.fisher: if classes != 2: print("wrong number of classes") return #cl.addToName("fisher{}".format(args.fisher)) cl.addToName("fisher") print("fisher") x_train, x_test = pr.fisher_function(x_train, x_test, y_train, y_test, random_state, best=True, n=n_components, remove_nan=no_nan) # if best=True selects the n best features, if False the worst n features (for debugging) if args.sfs: if classes != 2: print("wrong number of classes") return print("Start sfs....") feat_col = pr.sfs(x_train, x_test, y_train, y_test, chrms_pos, alpha, random_state) x_train = x_train[:, feat_col] x_test = x_test[:, feat_col] if args.anova: if classes != 3: print("wrong number of classes") return print("anova") cl.addToName("anova") x_train, x_test = pr.anova_function(x_train, x_test, y_train, y_test, alpha, random_state, remove_nan=no_nan) # imbalance if args.over: print("over ") x_train, y_train = pr.imbalance(x_train, y_train, "over", random_state) cl.addToName("over") if args.smote: print("smote ") x_train, y_train = pr.imbalance(x_train, y_train, "smote", random_state) cl.addToName("smote") cl.random_state(random_state) # classify if args.svm: print("svm ") cl.svm(x_train, x_test, y_train, y_test, classes=classes, crossval=args.crossval) if args.knn: print("knn ") cl.knn(x_train, x_test, y_train, y_test, classes=classes, crossval=args.crossval) if args.rforest: print("rforest") cl.random_forest(x_train, x_test, y_train, y_test, classes=classes, crossval=args.crossval) if args.kmeans: print("kmeans") uc.kmeans(x_train, x_test, y_train, y_test, classes=classes, random_state=random_state, crossval=args.crossval) if args.hierarc: print("hierarchical clustering") uc.hierarchical(x_train, x_test, y_train, y_test, classes=classes, random_state=random_state, crossval=args.crossval) print("Log name: {}.log".format(cl.name)) handlers = log.getLogger().handlers[:] for handler in handlers: handler.close() log.getLogger().removeHandler(handler) nf = p / cl.name if not nf.exists(): os.makedirs(nf) npath = Path(nf / '{}.log'.format(cl.name)) i = 1 while npath.exists(): npath = Path(nf / '{}_{}.log'.format(cl.name, i)) i += 1 os.rename('log.log', npath)
test_classification.append(1)
for k in range(option[0] + option[1]):
    test_classification.append(0)

# metrics = classifier.knn(np.array(train_sets_processed[i]), test, classifications[i], test_classification, genuine_quantity, option[0], option[1])
# frr_metrics[0].append(metrics[0])
# far_skilled_metrics[0].append(metrics[1])
# far_random_metrics[0].append(metrics[2])
# eer_metrics[0].append(metrics[3])

# metrics = classifier.tree(np.array(train_sets_processed[i]), test, classifications[i], test_classification, genuine_quantity, option[0], option[1])
# frr_metrics[1].append(metrics[0])
# far_skilled_metrics[1].append(metrics[1])
# far_random_metrics[1].append(metrics[2])
# eer_metrics[1].append(metrics[3])

metrics = classifier.svm(np.array(train_sets_processed[i]), test,
                         classifications[i], test_classification,
                         genuine_quantity, option[0], option[1],
                         weights=svm_weights[i])
frr_metrics[2].append(metrics[0])
far_skilled_metrics[2].append(metrics[1])
far_random_metrics[2].append(metrics[2])
eer_metrics[2].append(metrics[3])
frr_metrics_global.append(metrics[4])
far_skilled_metrics_global.append(metrics[5])
far_random_metrics_global.append(metrics[6])
eer_metrics_global.append(metrics[7])
frr_metrics_global_sd.append(metrics[4])
far_skilled_metrics_global_sd.append(metrics[5])
far_random_metrics_global_sd.append(metrics[6])
eer_metrics_global_sd.append(metrics[7])