def main(argv):
    __TOTAL_RUNNINGS = 100
    __CLASSIFIER = 'knn'  # 'nbayes'

    # KNN parameters
    K_VALUE = 3
    KNN_DEBUG = False
    pool = False
    cv_type = 'kcv'
    #cv_type = ''

    # Seeds test
    USE_KNOWN_GOOD_SLICE_GROUPING = True

    # Use these arguments to set the input directory of attribute files
    __USE_SAMPLE_DATA_DIR = True
    __SAMPLE_DATA_DIR = "../../attributes_amostra"
    __FULL_DATA_DIR = "../../attributes2"
    attributes_dir = __FULL_DATA_DIR
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'
    if __USE_SAMPLE_DATA_DIR:
        attributes_dir = __SAMPLE_DATA_DIR

    # Loading all data
    start_time = time.time()
    print('Loading all attributes data... ', end='')
    attribs, body_planes, slice_num, slice_amounts, output_classes = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('done (total time to load: {0})'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)
    print('Slice Limits:', min_slices_values)
    print('valid_bplanes=', valid_bplanes)

    if USE_KNOWN_GOOD_SLICE_GROUPING:
        print('\n* Using a specific known good slice grouping... ', end='')
        bplane, start_slice, total_slices = [2, 120, 20]
    else:
        print('\n* Building a random valid slice grouping... ', end='')
        bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length=int(random.random() * 20),
            max_indexes=min_slices_values,
            dbug=False)
    print('\nDone! Slice grouping: [{0}, {1}, {2}]'.format(bplane, start_slice, total_slices))

    # Getting the data partition defined by this slice grouping
    start_time = time.time()
    print('\n* Getting the specific data partition using this slice grouping {0}... '.format(
        [bplane, start_slice, total_slices]), end='')
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    end_time = time.time()
    total_time = end_time - start_time
    print('done.\n\tTotal time to get the data partition= {0}'.format(total_time))
    print('\n* Data Partition\'s shape= ', data_partition.shape)

    # Running the classifier several times over the same partition
    start_time = time.time()
    print('\n* Starting to run the knn classifier {0} times to evaluate this data partition...'.format(
        __TOTAL_RUNNINGS))
    all_acc = []
    all_cmat = []
    for r in range(__TOTAL_RUNNINGS):
        accuracy, conf_matrix = runKNN(data_partition,
                                       output_classes,
                                       __CLASSIFIER,
                                       K_VALUE,
                                       knn_debug=KNN_DEBUG,
                                       use_smote=True,
                                       use_rescaling=True,
                                       cv_type=cv_type,
                                       use_Pool=pool)
        all_acc.append(accuracy)
        all_cmat.append(conf_matrix)
    end_time = time.time()
    total_time = end_time - start_time
    print('done.')

    # Summarizing the results of all runs
    all_acc = np.array(all_acc)
    best_acc = all_acc.max()
    best_acc_pos = all_acc.argmax()
    best_acc_cmat = all_cmat[best_acc_pos]
    worst_acc_cmat = all_cmat[all_acc.argmin()]
    print('\n* Results after {1} runs:\n{0}'.format(all_acc, __TOTAL_RUNNINGS))
    print('\ttime to run classifier={0}'.format(total_time))
    print('\tclassifier={0}'.format(__CLASSIFIER))
    print('\tmean={0}'.format(np.mean(all_acc)))
    print('\tvariance={0}'.format(all_acc.var()))
    print('\tstd={0}'.format(all_acc.std()))
    print('\tmax={0}'.format(best_acc))
    print('\tmin={0}'.format(all_acc.min()))
    print('\tConfusion matrix of the best result:\n', best_acc_cmat)
    print('\tConfusion matrix of the worst result:\n', worst_acc_cmat)
    return 0
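
# --- Illustrative helper sketch (not part of the original sources) ----------
# The variants of main() in this file call runKNN(...) without defining it
# here.  The sketch below is a minimal, hypothetical stand-in, assuming the
# helper follows the inline SMOTE + rescaling + KNN pipeline shown in the last
# main() variant of this section; the project's real runKNN may differ (e.g.
# it also accepts a classifier name, cv_type and use_Pool arguments above).
def runKNN_sketch(data_partition, output_classes, k_value=3,
                  knn_debug=False, use_smote=True, use_rescaling=True):
    import numpy as np
    from sklearn import preprocessing, metrics
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from imblearn.over_sampling import SMOTE

    # Flatten the 3D partition (samples, slices, attribs) into 2D (samples, features)
    X = np.reshape(data_partition,
                   (data_partition.shape[0],
                    data_partition.shape[1] * data_partition.shape[2]))
    y = np.ravel(output_classes)

    if use_rescaling:
        X = preprocessing.StandardScaler().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=12)

    if use_smote:
        # Oversample only the training split so synthetic samples never leak into the test set
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    knn = KNeighborsClassifier(n_neighbors=k_value)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    if knn_debug:
        print('y_pred.shape:', y_pred.shape)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    return accuracy, conf_matrix
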
def main(argv):
    # Runtime parameters
    __TOTAL_RUNNINGS = 1
    __MULTIPROCESS = False
    __USE_SAMPLE_DATA_DIR = True     # Use this argument to choose the input directory of attribute files
    USE_FIXED_SLICE_GROUPING = True  # Seeds test
    __VERBOSE = True

    # Model parameters
    __USE_PCA = True
    __USE_STRATIFIED_KFOLD = True
    knn_k_value = 3
    lr_solver = 'sag'
    lr_multiclass = 'ovr'
    kcv_folds = 11

    import warnings
    from sklearn.exceptions import ConvergenceWarning
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    FIXED_SLICE_GROUPING = [2, 105, 8]

    # Use these arguments to set the input directory of attribute files
    __SAMPLE_DATA_DIR = "../../attributes_amostra"
    __FULL_DATA_DIR = "../../attributes2"
    attributes_dir = __FULL_DATA_DIR
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'
    if __USE_SAMPLE_DATA_DIR:
        attributes_dir = __SAMPLE_DATA_DIR

    # Loading all data
    start_time = time.time()
    print('Loading all attributes data... ', end='')
    attribs, body_planes, slice_num, slice_amounts, output_classes, all_genders, all_ages, demographics_dic = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('done (total time to load: {0:.2f}s)'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)
    #print('Slice Limits:', min_slices_values)
    #print('valid_bplanes=', valid_bplanes)

    if USE_FIXED_SLICE_GROUPING:
        #print('* Using a specific known good slice grouping... ', end='')
        bplane, start_slice, total_slices = FIXED_SLICE_GROUPING
    else:
        #print('* Building a random valid slice grouping... ', end='')
        bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length=int(random.random() * 20),
            max_indexes=min_slices_values,
            dbug=False)
    #print('Done!\n* Slice grouping created: [{0}, {1}, {2}]'.format(bplane, start_slice, total_slices))

    # Getting the data partition defined by this slice grouping
    #start_time = time.time()
    #print('* Getting the specific data partition using this slice grouping {0}... '.format([bplane, start_slice, total_slices]), end='')
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    #end_time = time.time()
    #total_time = end_time - start_time
    #print('done.\n* Total time to get the data partition= {0}'.format(total_time))

    # Data preparation for pandas
    #print('* Current Data Partition\'s shape= ', data_partition.shape)
    try:
        new_dimensions = (data_partition.shape[0],
                          data_partition.shape[1] * data_partition.shape[2])
    except IndexError:
        print('** IndexError exception: data_partition.shape={0} output_classes.shape={1}'.format(
            data_partition.shape, output_classes.shape))
        sys.exit(-1)

    # Reshaping X_data from 3D to 2D
    X_reshaped = np.reshape(data_partition, new_dimensions)
    #print('* New Data Partition\'s shape= ', X_reshaped.shape)
    #print('* New dimensions (must be equal to X_reshaped.shape): ', new_dimensions)
    X_pandas = pd.DataFrame(data=X_reshaped)
    y_pandas = pd.DataFrame(data=output_classes)

    # Building the models list
    models_names = []
    models_names.append('KNN')
    #models_names.append('LDA')
    #models_names.append('CART')
    #models_names.append('NB')
    #models_names.append('SVM')
    #models_names.append('RF')
    #models_names.append('LR')
    #build_models_list(knn_k_value, lr_solver, lr_multiclass)

    # Pool of all results
    all_models_results = []

    ## Initializing pool of results
    #for model_name in models_names:
    #    model_result = []
    #    metrics_values = all_metrics_values()
    #    for metric in metrics_values:
    #        model_result.append([])
    #    all_models_results.append(model_result)

    #all_mean_acc = []
    #all_median_acc = []
    #all_median_cmat = []
    #all_time = []

    for n in range(__TOTAL_RUNNINGS):
        # all_experiments_results is a LIST of LISTS of dictionaries
        #######################################################################
        #n_experiment_results = evaluate_all(X_pandas, y_pandas,
        #                                    knn_k_value, lr_solver, lr_multiclass,
        #                                    kcv_folds=kcv_folds,
        #                                    use_multiprocess=__MULTIPROCESS)
        #######################################################################
        cv_seed = 7
        cv_shuffle = True
        use_multiprocess = __MULTIPROCESS
        use_smote = True
        use_rescaling = True

        # Getting how many CPUs are available (used by n_jobs)
        cores_num = multiprocessing.cpu_count()

        # Validation setup
        cv = model_selection.KFold(n_splits=kcv_folds,
                                   random_state=cv_seed,
                                   shuffle=cv_shuffle)
        both_indexes = cv.split(X_pandas)

        # Current experiment: results from each model
        experiment_results = []

        if use_multiprocess and __name__ == "__main__":
            with Pool(cores_num) as p:
                from functools import partial
                experiment_results = p.map(
                    partial(evaluate_model_using_smote_and_rescaling,
                            all_train_and_test_indexes=both_indexes,
                            X_data=X_pandas,
                            y_data=y_pandas,
                            folds=kcv_folds,
                            smote=use_smote,
                            rescaling=use_rescaling,
                            cores_num=cores_num,
                            maximization=True,
                            pca=__USE_PCA,
                            stratified_kfold=__USE_STRATIFIED_KFOLD),
                    models_names)
        else:
            for model in models_names:
                #print('* Evaluating {0} model...'.format(model[0]))
                model_results = evaluate_model_using_smote_and_rescaling(
                    both_indexes, X_pandas, y_pandas, model, kcv_folds,
                    smote=use_smote,
                    rescaling=use_rescaling,
                    cores_num=cores_num,
                    stratified_kfold=__USE_STRATIFIED_KFOLD,
                    pca=__USE_PCA)
                experiment_results.append(model_results)

        all_models_results.append(experiment_results)
        #print(all_results['KNN'])

    ########################################################
    # Compiling result data from this experiment
    #model_mean_acc = []
    #model_median_acc = []
    #model_median_cmat = []
    #model_time = []
    #
    #for model_results in n_experiment_results:
    #    # 'model_results' is a dictionary
    #    model_mean_acc.append(model_results['mean_acc'])
    #    model_median_acc.append(model_results['median_acc'])
    #    model_median_cmat.append(model_results['median_cmat'])
    #    model_time.append(model_results['total_time'])
    #
    #np_mean_acc = np.array(model_mean_acc)
    #np_time = np.array(model_time)
    #
    #all_mean_acc.append(np_mean_acc.mean())
    #all_median_acc.append(model_median_acc[len(model_median_acc)//2])
    #all_median_cmat.append(model_median_cmat[len(model_median_cmat)//2])
    #all_time.append(np_time.mean)
    #
    #models = build_models_list()
    #
    ## Printing all compiled data
    #print('* All {0:03d} experiments results:'.format(__TOTAL_RUNNINGS))
    #
    #for model in range(len(models)):
    #    name = models[model][0]
    #    print('model {0}:'.format(name), end='')
    #    #print('all_mean_acc[0].__class__.__name__=', all_mean_acc[0].__class__.__name__)
    #    #np_all_acc_mean = np.array(all_mean_acc[model])
    #    #print(' acc_mean=', np_all_acc_mean, end='')
    #    print(' acc_median={0}'.format(all_median_acc[model]), end='')
    #
    #print('\n')
    #mean_acc = np_all_mean.mean()
    #std_acc = np_all_mean.std()
    #max_acc = np.argmax(np_all_mean)
    #min_acc = np.argmin(np_all_mean)
    #
    #median_pos = kcv_folds // 2 if kcv_folds % 2 == 1 else None
    #median_acc = all_median_acc[exp][median_pos] if kcv_folds % 2 == 1 else None
    #median_cmat = all_median_cmat[exp][median_pos] if kcv_folds % 2 == 1 else None
    #
    #np_all_time = np.array(all_time[exp])
    #total_time = np_all_time.mean()
    #
    #print('{0}:\tmean_acc={1:.4f} mean_std={2:.4f} median_acc={3:.4f} median_cmat={4} '
    #      'max_acc={5:.4f} min_acc={6:.4f} total_time={7:.4f}s'.format(
    #          name, mean_acc, std_acc, median_acc, str(median_cmat),
    #          max_acc, min_acc, total_time))
    return 0
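
# --- Illustrative fold-evaluation sketch (not part of the original sources) --
# The main() variant above relies on evaluate_model_using_smote_and_rescaling(),
# which is defined elsewhere in the project.  The sketch below shows, under
# stated assumptions, how a per-fold SMOTE + rescaling + optional PCA + KNN
# evaluation driven by the precomputed KFold indexes could look.  The function
# name, its signature and the returned dictionary keys are hypothetical.
def evaluate_knn_per_fold_sketch(both_indexes, X_pandas, y_pandas,
                                 k_value=3, smote=True, rescaling=True,
                                 pca=False, pca_components=0.95):
    import numpy as np
    from sklearn import preprocessing, metrics
    from sklearn.decomposition import PCA
    from sklearn.neighbors import KNeighborsClassifier
    from imblearn.over_sampling import SMOTE

    accuracies, matrices = [], []
    X = X_pandas.values
    y = np.ravel(y_pandas)

    for train_idx, test_idx in both_indexes:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if rescaling:
            # Fit the scaler on the training fold only, then transform both folds
            scaler = preprocessing.StandardScaler().fit(X_train)
            X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
        if pca:
            # Keep enough components to explain pca_components of the variance
            reducer = PCA(n_components=pca_components).fit(X_train)
            X_train, X_test = reducer.transform(X_train), reducer.transform(X_test)
        if smote:
            # Oversample only the training fold
            X_train, y_train = SMOTE().fit_resample(X_train, y_train)

        knn = KNeighborsClassifier(n_neighbors=k_value).fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, y_pred))
        matrices.append(metrics.confusion_matrix(y_test, y_pred))

    accuracies = np.array(accuracies)
    return {'mean_acc': accuracies.mean(),
            'median_acc': np.median(accuracies),
            # Middle fold's confusion matrix, used here as a stand-in for a "median" matrix
            'median_cmat': matrices[len(matrices) // 2]}
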
def main(argv):
    # KNN parameters
    K_VALUE = 5

    # Seeds test
    USE_KNOWN_GOOD_SLICE_GROUPING = True

    # Use these arguments to set the input directory of attribute files
    __USE_SAMPLE_DATA_DIR = False
    __SAMPLE_DATA_DIR = "../../attributes_amostra"
    __FULL_DATA_DIR = "../../attributes2"
    attributes_dir = __FULL_DATA_DIR
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'
    if __USE_SAMPLE_DATA_DIR:
        attributes_dir = __SAMPLE_DATA_DIR

    # Loading all data
    start_time = time.time()
    print('Loading all attributes data... ', end='')
    attribs, body_planes, slice_num, slice_amounts, output_classes = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('done (total time to load: {0})'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)
    print('Slice Limits:', min_slices_values)
    print('valid_bplanes=', valid_bplanes)

    # Signature of the grouping builder, for reference:
    # def getRandomSliceGrouping(all_slice_amounts,
    #                            planes=__DEFAULT_BPLANES,
    #                            max_length=__DEFAULT_MAX_CONSEC_SLICES,
    #                            max_indexes=__DEFAULT_MAX_SLICES_VALUES,  # maximum value for the first slice index
    #                            dbug=__DEFAULT_DEBUG):

    if USE_KNOWN_GOOD_SLICE_GROUPING:
        print('* Using a specific known good slice grouping... ', end='')
        bplane, start_slice, total_slices = [2, 114, 15]
    else:
        print('* Building a random valid slice grouping... ', end='')
        bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length=30,
            max_indexes=min_slices_values,
            dbug=False)
    print('done. Slice grouping: [{0}, {1}, {2}]'.format(bplane, start_slice, total_slices))

    # Getting the data partition defined by this slice grouping
    start_time = time.time()
    print('* Getting a random data partition using this slice grouping {0}... '.format(
        [bplane, start_slice, total_slices]), end='')
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    end_time = time.time()
    total_time = end_time - start_time
    print('done.\n\tTotal time to get the data partition= {0}'.format(total_time))
    print('* Data Partition\'s shape= ', data_partition.shape)

    # Running the classifier once over this partition
    start_time = time.time()
    print('* Starting to run the knn classifier to evaluate this data partition...')
    accuracy, conf_matrix = runKNN(data_partition,
                                   output_classes,
                                   K_VALUE,
                                   knn_debug=True,
                                   use_smote=True,
                                   use_rescaling=True)
    end_time = time.time()
    total_time = end_time - start_time
    print('done. Total time to run classifier= {0}'.format(total_time))
    print('\n* Confusion matrix was:\n', conf_matrix)
    print('* KNN accuracy with K={0} was: {1}'.format(K_VALUE, accuracy))
    return 0

def main(argv):
    __USE_DATA_SAMPLE = True

    # KNN parameters
    K_VALUE = 5

    # Use these arguments to set the input directory of attribute files
    attributes_dir = "../../attributes_amostra"
    if not __USE_DATA_SAMPLE:
        attributes_dir = "../../attributes2"
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'

    # Loading all data
    start_time = time.time()
    print('Loading all attributes data...')
    attribs, body_planes, slice_num, slice_amounts, output_classes = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('...done (total time to load: {0})'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)
    print('Slice Limits:', min_slices_values)
    print('valid_bplanes=', valid_bplanes)

    # Signature of the grouping builder, for reference:
    # def getRandomSliceGrouping(all_slice_amounts,
    #                            planes=__DEFAULT_BPLANES,
    #                            max_length=__DEFAULT_MAX_CONSEC_SLICES,
    #                            max_indexes=__DEFAULT_MAX_SLICES_VALUES,  # maximum value for the first slice index
    #                            dbug=__DEFAULT_DEBUG):

    print('Getting a random valid slice grouping...')
    bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
        planes=valid_bplanes,
        length=30,
        max_indexes=min_slices_values,
        dbug=False)
    print('...done')
    print('slice grouping found:\n\tbplane={0}, first_slice={1}, total_slices={2}'.format(
        bplane, start_slice, total_slices))
    print('Individual analysed: [{0}, {1}, {2}]'.format(bplane, start_slice, total_slices))

    # Getting the data partition defined by this slice grouping
    start_time = time.time()
    print('Getting some data partition using this last slice grouping ({0})...'.format(
        (bplane, start_slice, total_slices)))
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    end_time = time.time()
    total_time = end_time - start_time
    print('...done\nTotal time to get the data partition (bplane={1}, first_slice={2}, total_slices={3}): {0}'.format(
        total_time, bplane, start_slice, total_slices))

    # Baseline run: no SMOTE, no rescaling
    start_time = time.time()
    print('Starting to run the knn classifier to evaluate this partition of data...')
    accuracy, conf_matrix = knn_alzheimer.runKNN(data_partition,
                                                 output_classes,
                                                 K_VALUE,
                                                 use_smote=False,
                                                 use_rescaling=False)
    end_time = time.time()
    total_time = end_time - start_time
    print('...done (total time to run classifier: {0})'.format(total_time))
    print('Individual analysed: [{0}, {1}, {2}]'.format(bplane, start_slice, total_slices))
    print('\nConfusion matrix was:\n', conf_matrix)
    print('KNN accuracy with K={0} was: {1}'.format(K_VALUE, accuracy))

    ###########################################################################
    # SMOTE BEGINS HERE
    K_VALUE = 5
    smote_debug = True
    print('* a partition data shape=', data_partition.shape)

    # Data preparation: flatten the 3D partition into a 2D array
    try:
        new_dimensions = (data_partition.shape[0],
                          data_partition.shape[1] * data_partition.shape[2])
    except IndexError:
        print('** IndexError exception')
        print('\tdata_partition.shape=', data_partition.shape)
        print('\toutput_classes.shape=', output_classes.shape)
        sys.exit(-1)

    new_partition = np.reshape(data_partition, new_dimensions)
    #scaled_new_partition = preprocessing.scale(new_partition)
    used_partition = new_partition

    # First pass (i == 0): raw data; second pass (i == 1): scaled and SMOTE-balanced data
    for i in range(2):
        if i == 1:
            print('\n*** NOW we will do the same, however using scaled and balanced data:')
            #used_partition = scaled_new_partition

        if smote_debug and False:
            print('* DIMENSION for the new partition array=', new_dimensions)
            print('* the new partition data shape=', used_partition.shape)
            print('* the output array shape=', output_classes.shape)
            print('* shape of an input instance retrieved from the new partition=', used_partition[0].shape)

        ## KNN preparation
        X_pandas = pd.DataFrame(data=used_partition)
        #print('X_pandas=\n', X_pandas)
        #y_pandas = pd.DataFrame(data=np.ravel(output_classes, order='C'))
        y_pandas = pd.DataFrame(data=output_classes)
        y_pandas.columns = ['Class']
        #print('y_pandas=\n', y_pandas)
        #print('y_pandas values (without balancing)=\n', pd.value_counts(y_pandas['Class']))

        if i == 1:
            # Standardizing: fit the scaler object and transform the features
            scaler = preprocessing.StandardScaler()
            X_pandas = scaler.fit_transform(X_pandas)

        # STEP 1: split data between train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_pandas, np.ravel(y_pandas), test_size=0.3, random_state=12)

        if i == 0:
            print('classes count (before SMOTE)=',
                  (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
        elif i == 1:
            from imblearn.over_sampling import SMOTE
            smt = SMOTE()
            # fit_resample is the current name of the older fit_sample API
            X_train, y_train = smt.fit_resample(X_train, y_train)
            #print('classes count (after SMOTE)=\n', np.bincount(y_train))
            print('classes count (after SMOTE)=',
                  (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))

        # STEP 2: train the model on the training set
        knn = KNeighborsClassifier(n_neighbors=K_VALUE)
        knn.fit(X_train, y_train)

        # STEP 3: make predictions on the testing set
        y_pred = knn.predict(X_test)
        #if smote_debug:
        #    print('y_pred=\n', y_pred)
        #    print('y_pred.shape:', y_pred.shape)

        # Compare actual response values (y_test) with predicted response values (y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)
        print('Individual analysed: [{0}, {1}, {2}]'.format(bplane, start_slice, total_slices))
        print('KNN accuracy with K={0} was: {1}'.format(K_VALUE, accuracy))
        print('confusion matrix:\n', confusion_matrix)

    '''
    # STEP 1: split data between train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pandas, y_pandas, test_size=0.3, random_state=12)

    # Print the shapes of the new X objects
    if smote_debug:
        print('X_train.shape:', X_train.shape)
        print('X_test.shape:', X_test.shape)

    # Adjust the shape of the y vectors
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    # Print the shapes of the new y objects
    if smote_debug:
        print('y_train.shape:', y_train.shape)
        print('y_test.shape:', y_test.shape)

    # STEP 2: train the model on the training set
    knn = KNeighborsClassifier(n_neighbors=k_value)
    knn.fit(X_train, y_train)

    # STEP 3: make predictions on the testing set
    y_pred = knn.predict(X_test)
    if smote_debug:
        print('y_pred=\n', y_pred)
        print('y_pred.shape:', y_pred.shape)

    # Compare actual response values (y_test) with predicted response values (y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)
    '''
    return 0
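
# --- Entry point (assumed; the excerpts above do not show how main() is
# invoked).  A typical guard for scripts like these would be:
if __name__ == "__main__":
    import sys
    sys.exit(main(sys.argv))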