def validation_curve():
    # Test decision tree using cross-validation
    # Preprocess data
    data = pd.read_csv('./arrhythmia.data', header=None, na_values='?')
    data = fill_na(data=data)
    features = data.columns.tolist()[:-1]
    target = data.columns.tolist()[-1]
    feature_types = implicit_feature_type_inferrence(data=data[features], num_unique_values=3)
    train_set, test_set = train_test_split(data=data, train_fraction=0.8, reindex=False, random_seed=0)

    max_depth_cv = list()
    training_error_cv = list()
    test_error_cv = list()

    # Start cross-validation
    for i in range(2, 21, 2):
        tree_max_depth = i
        print("Tree Max Depth: %d" % tree_max_depth)
        max_depth_cv.append(tree_max_depth)
        tree = DecisionTree(tree_max_depth)
        training_error, test_error = cross_validation(data=data, features=features, target=target,
                                                      feature_types=feature_types, model=tree,
                                                      fold=3, random_seed=0)
        training_error_cv.append(training_error)
        test_error_cv.append(test_error)
        print("Training Error: %f" % training_error)
        print("Test Error: %f" % test_error)

    plot_curve(max_depth=max_depth_cv, training_error=training_error_cv, test_error=test_error_cv)
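# The k-fold `cross_validation` helper called above is defined elsewhere. As a
# rough illustration only, a minimal version returning mean training and test
# error could look like the sketch below; the `model.train` / `model.evaluate`
# methods and the simplified signature (no `feature_types`) are assumptions,
# not the project's actual API.
import numpy as np

def cross_validation_sketch(data, features, target, model, fold=3, random_seed=0):
    """Return (mean training error, mean validation error) over `fold` splits."""
    rng = np.random.RandomState(random_seed)
    indices = rng.permutation(len(data))
    folds = np.array_split(indices, fold)
    train_errors, test_errors = [], []
    for i in range(fold):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(fold) if j != i])
        train_split, test_split = data.iloc[train_idx], data.iloc[test_idx]
        model.train(train_split[features], train_split[target])  # hypothetical API
        train_errors.append(model.evaluate(train_split[features], train_split[target]))
        test_errors.append(model.evaluate(test_split[features], test_split[target]))
    return float(np.mean(train_errors)), float(np.mean(test_errors))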
def cross_validation(k, methods, Cs_NLK, Cs_SVM, degrees, lambdas):
    """
    Apply cross-validation on the NLCK algorithm. A first cross-validation is done on the values of
    C, d and lambda in NLCK, in order to find the optimal non-linear combination of kernels along
    with C, d, lambda. Then, for each triplet (and hence the corresponding weights vector),
    cross-validation is done on the regularization constant of C_SVM, C.
    :param k: int, which dataset to use (k=1, 2 or 3)
    :param methods: list of string, kernel methods
    :param Cs_NLK: np.array, regularization constants in NLCK algorithm
    :param Cs_SVM: np.array, regularization constants in C_SVM algorithm
    :param degrees: np.array, degrees to explore (usually np.arange(1, 5))
    :param lambdas: np.array, lambdas (corresponding to parameter 'fnorm' in NLCK) to explore
    :return: pd.DataFrame with the following columns:
        - 'methods': kernel methods used
        - 'C NLCK': regularization constants in NLCK algorithm
        - 'd': degree in NLCK algorithm
        - 'lambda': normalization parameter in NLCK algorithm
        - 'Best C CSVM': best regularization constant in C_SVM after cross-validation
        - 'val acc': accuracy obtained on the validation set
    """
    # Load data
    data, data1, data2, data3, kernels, ID = utils.get_all_data(methods)
    data_k = [data1, data2, data3]
    # Initialize results DataFrame
    p = len(kernels)
    n_param = len(Cs_NLK) * len(degrees) * len(lambdas)
    init = np.zeros(n_param)
    results = pd.DataFrame({
        'methods': [methods] * len(init),
        'C NLCK': init,
        'd': init,
        'lambda': init,
        'Best C CSVM': init,
        'val acc': init
    })
    # Reformat
    X_train, y_train, X_val, y_val, X_test, kernels, ID = utils.reformat_data(data_k[k - 1], kernels, ID)
    # Start cross-validation on triplet (C, d, lambda)
    for i, param in tqdm(enumerate(product(Cs_NLK, degrees, lambdas)), total=n_param):
        C, d, lbda = param
        print('NLCK C={}, degree={}, lambda={}'.format(C, d, lbda))
        # Compute kernel
        Km = NLCK(X_train, y_train, ID, kernels, C=C, eps=1e-9, degree=d).get_K(fnorm=lbda)
        # Cross-validation on constant C of C-SVM
        C_opt, scores_tr, scores_te, mean_scores_tr, mean_scores_te = \
            utils.cross_validation(Ps=Cs_SVM,
                                   data=[X_train, y_train, X_val, y_val, X_test],
                                   algo='CSVM',
                                   kfolds=3,
                                   K=Km,
                                   ID=ID,
                                   pickleName='cv_C_SVM_NLCK_C{}_d{}_l{}_p{}_k{}.pkl'.format(C, d, lbda, p, k))
        # Save results
        results.iloc[i, 1:6] = C, d, lbda, C_opt, np.max(mean_scores_te)
    return results
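# A hypothetical call to the function above; the kernel names and grids are
# illustrative (borrowed from the style used elsewhere in this project), not
# a prescribed configuration:
res = cross_validation(k=1,
                       methods=['SP_k6', 'SP_k5', 'SP_k4'],
                       Cs_NLK=np.array([1e-2, 1e-1, 1]),
                       Cs_SVM=np.array([1e-2, 1e-1, 1, 10]),
                       degrees=np.arange(1, 4),
                       lambdas=np.array([0.0, 1.0, 10.0]))
print(res.sort_values('val acc', ascending=False).head())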
def cross_validate(self):
    r_correlation, r2_cv, rmse_cv, bias, predicted_values = cross_validation(self.model, self._xCal,
                                                                             self._yCal, self._cv,
                                                                             correlation_based=False)
    method = 'Leave One Out'
    if isinstance(self._cv, KFold):
        method = "{}-fold".format(self._cv.n_splits)

    cross_validation_metrics = {'R': r_correlation,
                                'R2': r2_cv,
                                'RMSE': rmse_cv,
                                'bias': bias,
                                'method': method,
                                'predicted_values': predicted_values}

    self.metrics['cross_validation'] = cross_validation_metrics
def test(dataset, trials=1):
    print("Testing " + dataset.name + " dataset:")
    print("")
    for p in (None, 0.1, 0.2, 0.5):
        if p is None:
            print("Testing with no missing values")
            dataset_ = dataset
        else:
            print("Testing with {:.0%} missing values".format(p))
            dataset_ = remove_values(dataset, p=p)
            handle_missing_values(dataset_)
        fold_errT, fold_errV = cross_validation(dataset_, k=10, trials=trials)
        fold_errT = '%.4f' % round(fold_errT * 100, 6)
        fold_errV = '%.4f' % round(fold_errV * 100, 6)
        print("Training errors: " + str(fold_errT) + "%")
        print("Validation errors: " + str(fold_errV) + "%")
        print("")
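# `remove_values` and `handle_missing_values` are not shown in this file. A
# minimal sketch of what such a pair could look like, assuming the dataset
# object wraps a float numpy feature matrix in an `X` attribute (a guess, not
# the original implementation): random entries are blanked with probability p,
# then mean-imputed column by column.
import copy
import numpy as np

def remove_values(dataset, p=0.1, seed=0):
    """Blank out a random fraction p of the entries in dataset.X."""
    dataset_ = copy.deepcopy(dataset)
    rng = np.random.RandomState(seed)
    mask = rng.rand(*dataset_.X.shape) < p
    dataset_.X[mask] = np.nan
    return dataset_

def handle_missing_values(dataset_):
    """Replace NaNs with the column mean, in place."""
    col_means = np.nanmean(dataset_.X, axis=0)
    rows, cols = np.where(np.isnan(dataset_.X))
    dataset_.X[rows, cols] = col_means[cols]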
def run(f_in, f_out, f_in_d):
    if args.dataset == 0:
        pose_feats, d_list, labels = parse_feats(f_in, f_out, f_in_d, args.depth, args.oversampling)
        if not args.oversampling:
            pose_feats, d_list, labels = sample(pose_feats, d_list, labels)
            test, train, gt_test, gt_train, depth_train, depth_test = cross_validation(
                pose_feats, d_list, labels)
            np.save(f_out + 'train', train)
            np.save(f_out + 'labels_train', gt_train)
            np.save(f_out + 'depth_train', depth_train)
            np.save(f_out + 'test', test)
            np.save(f_out + 'labels_test', gt_test)
            np.save(f_out + 'depth_test', depth_test)
            print('Training and validation splits were saved in: ', f_out, '.')
        else:
            test, train, gt_test, gt_train, depth_train, depth_test = oversample(
                args.method, pose_feats, d_list, labels)
            if args.method == 1:
                np.save(f_out + 'train_oversampled_SMOTE', train)
                np.save(f_out + 'labels_train_oversampled_SMOTE', gt_train)
                np.save(f_out + 'depth_train_oversampled_SMOTE', depth_train)
                np.save(f_out + 'test_oversampled_SMOTE', test)
                np.save(f_out + 'labels_test_oversampled_SMOTE', gt_test)
                np.save(f_out + 'depth_test_oversampled_SMOTE', depth_test)
            elif args.method == 2:
                np.save(f_out + 'train_oversampled_ADASYN', train)
                np.save(f_out + 'labels_train_oversampled_ADASYN', gt_train)
                np.save(f_out + 'depth_train_oversampled_ADASYN', depth_train)
                np.save(f_out + 'test_for_ADASYN', test)
                np.save(f_out + 'labels_test_for_ADASYN', gt_test)
                np.save(f_out + 'depth_test_for_ADASYN', depth_test)
            print('Oversampled training and validation splits were saved in: ', f_out, '.')
    else:
        f_in_p = join(os.path.dirname(__file__), 'GRANADE_keypoints\\')
        # f_in_p = "E:\\GRANADE\\keypoints\\full\\"
        f_in_d_p = join(os.path.dirname(__file__), 'GRANADE_depth\\')
        # f_in_d_p = "E:\\GRANADE\\frames\\full_d\\"
        f_out_p = join(os.path.dirname(__file__), 'GRANADE_features\\')
        pose_feats, d_list = parse_feats_granade(f_in_p, f_out_p, f_in_d_p, args.depth, args.oversampling)
        pose_feats, d_list = norm_feats(pose_feats, d_list)
        np.save(f_out_p + 'test', pose_feats)
        np.save(f_out_p + 'depth_test', d_list)
        print('Granade test samples were saved in: ', f_out_p, '.')
def cross_validate(self):
    r_correlation, r2_cv, rmse_cv, bias, predicted_values = cross_validation(
        self._linear_regression, self._Xreduced, self._yCal, cv=self._cv)

    method = 'LOO'
    if isinstance(self._cv, int):
        method = "{}-fold".format(self._cv)

    cross_validation_metrics = {
        'R2': r2_cv,
        'RMSE': rmse_cv,
        'method': method,
        'predicted_values': predicted_values
    }

    self.metrics['cross_validation'] = cross_validation_metrics
    clf = SVM(_lambda=_lambda, kernel=kernel)
elif model_name == "SPR":
    clf = SPR(kernel=kernel)

# Loop over pre-computed embeddings
# for filename in os.listdir(EMBEDDING_DIR)[:1]:  # small test
for filename in os.listdir(EMBEDDING_DIR):
    # Full path
    file_path = os.path.join(EMBEDDING_DIR, filename)
    # Parsing
    dataset_idx, sigma, window_size = filename_parser(filename)
    # Cross validation
    results = cross_validation(dataset_idx=dataset_idx,
                               clf=clf,
                               data_dir=DATA_DIR,
                               files_dict=FILES,
                               k=5,
                               embeddings_path=file_path,
                               mat=True)
    # Process scores
    score_train = results["train_avg"]
    score_val = results["val_avg"]
    logger.info(f"Accuracy on train set / val set {dataset_idx}: {round(score_train, 3)} / {round(score_val, 3)} "
                f"(λ: {_lambda}, γ: {gamma}, sigma: {sigma}, window_size: {window_size})")
    # Update best
    if score_val > best_score[dataset_idx]:
        best_score[dataset_idx] = score_val
        best_lambda[dataset_idx] = _lambda
        best_gamma[dataset_idx] = gamma
        best_sigma[dataset_idx] = sigma
        best_window_size[dataset_idx] = window_size
        logger.info("New best on {0}".format(dataset_idx))
args['notsherpa'] = 1
args['sherpa_trial'] = trial.id
pp.pprint(args)

if FLAGS.cam:
    print('Creating CAM heatmaps')
    cam(args)
elif FLAGS.cm:
    from sklearn.metrics import confusion_matrix
    from utils import plot_confusion_matrix

    train_cm = np.zeros((2, 2))
    test_cm = np.zeros((2, 2))
    for fold, data, path_info in cross_validation(vars(FLAGS)):
        model = load_model('Models/%d/%05d.h5' % (FLAGS.sherpa_trial, fold + 1),
                           custom_objects={'auc': auc})
        x_train, x_test, y_train, y_test = data
        train, test = path_info
        train['split'] = 'train'
        test['split'] = 'test'
        x_train_samples, x_test_samples = gen_samples(args, x_train, x_test, y_train, y_test)
        test_probabilities = model.predict(x_test_samples)
        train_probabilities = model.predict(x_train_samples)
print('Entropy of the data set: ', data_set_entropy)

# Split the data set into two subsets
print()
print('Splitting the data set into two subsets')
splitted_data = utils.split_20_80(data_set)

# Check that the sizes are correct
print('Size of the original data set: ', str(len(data_set)))
print('Size of the validation subset: ', str(len(splitted_data[0])))
print('Size of the training subset: ', str(len(splitted_data[1])))
print()

# Part 1
print('Part 1')
# Run 10-fold cross-validation on 80% of the original set.
print('Running 10-fold cross-validation')
v_cs = utils.cross_validation(splitted_data[1], attributes, 'Class/ASD', 10)
print('Average error: ', v_cs)

# Part 2
print('Part 2')
print('Running hold-out validation')
# Train on 80% of the data
tree_6 = utils.ID3_algorithm(splitted_data[1], attributes, 'Class/ASD', False, False)
# Validate on the remaining 20%
v_ho = utils.validation(tree_6, splitted_data[0], 'Class/ASD')
print('Validation result: ', v_ho)
elif args.maxent:
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression
    model = "ME"

# Load dataset
if args.bbc:
    full_set, labels = np.array(utils.load_bbc_dataset())
    dataset = "BBC"
else:
    full_set, labels = np.array(utils.load_20news_dataset())
    dataset = "20news"

# Partition the dataset into 10 folds
num_folds = 10
thresholds = utils.cross_validation(full_set, num_folds)

if args.t:
    fscore_top100 = Queue.Queue()
    fscore_feat = Queue.Queue()
    fscore_nofeat = Queue.Queue()
else:
    fscore_top100 = np.zeros(num_folds)
    fscore_feat = np.zeros(num_folds)
    fscore_nofeat = np.zeros(num_folds)

for fold in range(num_folds):
    print "Training and testing fold " + str(fold + 1) + "..."
    # Split dataset into train and test sets based on the current fold
    train_set, train_labels, test_set, test_labels = utils.split_set(
        full_set, labels, thresholds[fold], thresholds[fold + 1])
import numpy as np
import sklearn.metrics

import utils

TRAIN_IMAGES_FOLDER = "data/train"
TEST_IMAGES_FOLDER = "data/test"
IMG_EXTENSION = ".png"

print("Loading training images")
train_images, train_classes = utils.load_data(TRAIN_IMAGES_FOLDER, IMG_EXTENSION)
print("Loading test images")
test_images, test_classes = utils.load_data(TEST_IMAGES_FOLDER, IMG_EXTENSION)

print("Computing HOG descriptors")
train_descriptors = utils.compute_hog(train_images)
test_descriptors = utils.compute_hog(test_images)

descriptors = np.vstack((train_descriptors, test_descriptors))
labels = np.concatenate((train_classes, test_classes))

results = utils.cross_validation(descriptors, labels)
for k, v in results.items():
    print("{} mean: {}".format(k, v[-1]))
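# For comparison, the same evaluation can be done with scikit-learn's built-in
# cross-validation; the LinearSVC classifier and accuracy scoring below are
# assumptions for illustration, not necessarily what utils.cross_validation uses.
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

cv_results = cross_validate(LinearSVC(), descriptors, labels, cv=5,
                            scoring="accuracy", return_train_score=True)
print("accuracy mean: {}".format(cv_results["test_score"].mean()))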
best_corr_conf = None
count = 1
for g in ParameterGrid(hparams):
    # alg = NNSolver(897, 64, 1)  # 1023
    alg = lgbmSolver()
    # alg = Lasso()  # Ridge()
    print("************" * 5)
    print(count)
    count += 1
    print("************" * 5)
    print(g)
    alg.set_params(**g)
    r2, mse, corr = cross_validation(num_fold, alg, price_data, **g)
    if best_r2_res[0] <= r2:
        best_r2_res = (r2, mse, corr)
        best_r2_conf = g
    if best_mse_res[1] >= mse:
        best_mse_res = (r2, mse, corr)
        best_mse_conf = g
    if best_corr_res[2] <= corr:
        best_corr_res = (r2, mse, corr)
        best_corr_conf = g
    print(r2, mse, corr)

print("===r2====" * 5)
print(best_r2_res)
print(best_r2_conf)
import numpy
from keras.preprocessing.image import ImageDataGenerator

import utils, models_preprocessing, metrics

model_function = models_preprocessing.compiledConvnet
# model_function = models_preprocessing.compiledRegularizedConvnet
auroc = metrics.auroc
accuracy = metrics.accuracy
text = metrics.basicTextMetrics

data = numpy.load('data/source.npy')
labels = numpy.load('labels/classification.npy')

## Run 1: Compiled ConvNet
# utils.epoch_curve(model_function, data, labels, 0.3, range(1, 41), [auroc, accuracy])

## Run 2: Compiled Regularized ConvNet
# generator = ImageDataGenerator(horizontal_flip=True, vertical_flip=True)
# utils.epoch_curve_generator(model_function, data, labels, generator, 32, 0.3, range(1, 41), [auroc, accuracy])

## Run 3: Compiled ConvNet and Compiled Regularized ConvNet with 5-fold cross-validation
utils.cross_validation(model_function, data, labels, 5, 20, metrics_array=text)
others = list(set(utils.all_tumor_names) - {cancer_name})
X_others = np.empty((0, X_c.shape[1]), dtype=int)
y_others = np.empty(0, dtype=int)
for o in others:
    # print(o)
    X_o, y_o = utils.get_cancer_data(o)
    X_others = np.append(X_others, X_o, axis=0)
    y_others = np.append(y_others, y_o)

# Test on cancer ALONE
print("\t {} ALONE".format(cancer_name))
cvscores_c, histories_c = utils.cross_validation(
    X=X_c, y=y_c,
    preprocess=preprocess,
    seed=seed,
    create_model=tumor_alone_model,
    get_measures=utils.get_measures)
utils.report(cvscores_c, writer=writer, sheet_name="{}_alone".format(cancer_name))

# Test on others ALONE
print("\t {} ALONE".format("ALL"))
cvscores_others, histories_others = utils.cross_validation(
    X=X_others, y=y_others,
    preprocess=preprocess,
    seed=seed,
    create_model=others_alone_model,
                  class_weight=None if args.n_classes == 2 else class_weights(train_labels),
                  sample_weight=sample_weights(train_sample, train_labels, args.n_classes,
                                               args.weight_type, args.output_dir),
                  batch_size=max(1, n_gpus) * int(args.batch_size))
        model.load_weights(weight_file)
else:
    train_labels = []
    training = None

# RESULTS AND PLOTTING SECTION
if args.cross_valid == 'ON':
    valid_probs = cross_validation(valid_sample, valid_labels, scalars, model,
                                   args.output_dir, args.n_folds)
    print('MERGING ALL FOLDS AND PREDICTING CLASSES ...')
if args.cross_valid == 'OFF':
    print('\nValidation sample', args.n_valid, 'class predictions:')
    valid_probs = model.predict(valid_sample, batch_size=20000, verbose=args.verbose)
    print()

valid_results(valid_sample, valid_labels, valid_probs, train_labels, training,
              args.output_dir, args.plotting)
if args.n_folds <= 1:
    print('Saving validation results to', args.output_dir + '/' + 'valid_results.pkl', '\n')
    valid_sample = {key: valid_sample[key] for key in others}
    pickle.dump((valid_sample, valid_labels, valid_probs),
                open(args.output_dir + '/' + 'valid_results.pkl', 'wb'))
def net(input_size):
    """A super-simple NN for single-tumor classification."""
    model = Sequential()
    model.add(Dense(100, input_shape=(input_size,), activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, activation="sigmoid"))
    return model


cvscoress_all = None
out_file = "./results/random_selection.tsv"
for i in range(times):
    rg = get_random_genes(k, n)
    print("Random {} genes".format(len(rg)))
    X_c_i = X_c[:, rg]
    cvscores_c_i, histories_c_i = utils.cross_validation(
        X=X_c_i, y=y_c,
        preprocess=preprocess,
        seed=seed,
        create_model=net,
        get_measures=utils.get_measures)
    cvscores_c_i['experiment'] = i
    cvscoress_all = pd.concat([cvscoress_all, cvscores_c_i], axis=0)

print("Saving file {}".format(out_file))
cvscoress_all.to_csv(out_file, sep="\t", index=False, header=True)
from gensim.models import word2vec
import gensim.models as models

import blstm
import utils

# model = word2vec.Word2Vec.load("worddict.dic")
# r = model.similarity("左", "右")  # similarity between "left" and "right"
# print(r)

utils.cross_validation(blstm.lstm_cross, 5)
if __name__ == '__main__':
    if build_kernel:
        methods = ['GP_k3_g1', 'MM_k5_m1', 'WD_d10']
        for method in methods:
            # Set replace=False to keep the previous saves
            X_train, y_train, X_val, y_val, X_test, K, ID = utils.get_training_datas(method=method, replace=True)
    elif check_method:
        method = 'MM_k6_m1'
        algo = 'CSVM'
        solver = 'CVX'
        data, data1, data2, data3, K, ID = utils.get_all_data([method])
        Cs = np.sort([i * 10 ** j for (i, j) in product(range(1, 10), range(-2, 1))])
        # Perform cross-validation on data set 1 (TF = 1)
        utils.cross_validation(Ps=Cs, data=data1, algo=algo, solver=solver, kfolds=3, K=K, ID=ID)
    elif check_alignf:
        methods = ['MM_k3_m1', 'WD_d5', 'SS_l1_k3']
        data, data1, data2, data3, kernels, ID = ALIGNF.aligned_kernels(methods)
        K = kernels[0]  # index 0 for the first data set
        X_train_1, y_train_1, X_val_1, y_val_1, X_test_1, K_1, ID_1 = utils.reformat_data(data1, [K], ID)
        Cs = np.sort([i * 10 ** j for (i, j) in product(range(1, 10), range(-3, 2))])
        utils.cross_validation(Ps=Cs, data=data1, algo='CSVM', kfolds=5, K=K_1[0], ID=ID_1)
    elif check_NLCK:
        methods = ['SP_k6', 'SP_k5', 'SP_k4']
        data, data1, data2, data3, kernels, ID = utils.get_all_data(methods)
        X_train_1, y_train_1, X_val_1, y_val_1, X_test_1, kernels_1, ID_1 = utils.reformat_data(data1, kernels, ID)
        Km1 = NLCKernels.NLCK(X_train_1, y_train_1, ID_1, kernels_1, C=1e-2, eps=1e-9, degree=2).get_K()
        Cs = np.sort([i * 10 ** j for (i, j) in product(range(1, 10), range(-3, 5))])
# # bovwfile_l2_sqrt = join(output_path, splitext(fname)[0] + EXT_BOVW.format(CURRENT_DAISY, BOVW_L2_sqrt))
# # bovw_l2_sqrt = normalize_L2(normalize_sqrt(bovw))
# # save_data(bovw_l2_sqrt, bovwfile_l2_sqrt)
# print('{}'.format(bovwfile))

# stop_time = datetime.now()
# time_lapse = stop_time - start_time
# print("time lapse on bovw for {} clusters:".format(n_clusters), time_lapse.total_seconds())

# -----------------
# TRAIN CLASSIFIERS
# -----------------

# setup training data
X_train, y_train = split_into_X_y(train_set)

# svm = LinearSVC(C=10.0)
# # svm = SVC(C=10.0, gamma=10)
# svm.fit(X_train, y_train)

# # setup testing data
# X_test, y_test = split_into_X_y(test_set)
# y_pred = svm.predict(X_test)
# tp = np.sum(y_test == y_pred)
# print('accuracy = {:.3f}'.format(float(tp) / len(y_test)))

from utils import cross_validation
print(cross_validation(X_train, y_train))
                             files_dict=FILES,
                             mat=args.use_mat)

    # Compute the Gram matrix of the whole dataset
    K = kernel.compute_gram_matrix(X_train)

    # The same Gram matrix is reused across different lambdas
    for _lambda in lambda_list:
        assert model_name == "SVM_precomputed_gram"
        clf = SVM_precomputed_gram(_lambda=_lambda, kernel=kernel)

        # Cross validation (default: k=5)
        results = cross_validation(i,
                                   clf,
                                   k=args.k_fold,
                                   data_dir=DATA_DIR,
                                   files_dict=FILES,
                                   mat=args.use_mat,
                                   K=K)
        score_train = results["train_avg"]
        score_val = results["val_avg"]
        logger.info(
            f"Accuracy on train set / val set {i}: {round(score_train, 3)} / {round(score_val, 3)} "
            f"(lambda: {_lambda}, gamma: {gamma}, sigma: {sigma}, window_size: {window_size})"
        )
        if score_val > best_score[i]:
            best_score[i] = score_val
            best_lambda[i] = _lambda
            best_gamma[i] = gamma
import time

import cv2
import numpy as np
import sklearn.metrics

import utils

TRAIN_IMAGES_FOLDER = "data/train"
TEST_IMAGES_FOLDER = "data/test"
IMG_EXTENSION = ".png"
uniform = False

print("Loading training images")
train_images, train_classes = utils.load_data(TRAIN_IMAGES_FOLDER, IMG_EXTENSION)
print("Loading test images")
test_images, test_classes = utils.load_data(TEST_IMAGES_FOLDER, IMG_EXTENSION)

print("Computing LBP descriptors")
train_descriptors = utils.compute_lbp(train_images, uniform=uniform)
test_descriptors = utils.compute_lbp(test_images, uniform=uniform)

descriptors = np.vstack((train_descriptors, test_descriptors))
labels = np.concatenate((train_classes, test_classes))

results = utils.cross_validation(descriptors, labels, svm_kernel=cv2.ml.SVM_POLY, params={"degree": 2})
for k, v in results.items():
    print("{} mean: {}".format(k, v[-1]))
# In[ ]:

conv_idxs = np.append(np.arange(neighbors.shape[0]).reshape(-1, 1),
                      neighbors[:, :n_neighbors], axis=1).flatten()

# In[ ]:

X_c_conv = X_c[:, conv_idxs]

# ## Cross validation

# In[ ]:

cvscores_c, histories_c = utils.cross_validation(X=X_c_conv, y=y_c,
                                                 preprocess=preprocess,
                                                 seed=seed,
                                                 data_preparation=split_training_default_1,
                                                 create_model=create_conv_model,
                                                 get_measures=utils.get_measures)

# In[ ]:

cvscores_c.to_excel(out_path, index=False)