def run_test(): """ set up a series of CV grid searches """ train_size = 1000 test_size = 1000 #test_types=['USPS'] #test_types=['MNIST'] test_types = ['MNIST', 'USPS'] clf_type = 'kNN' rerun_CV = True #rerun_CV=False # setup grid search parameters # intentionally incomplete and restricted, change as desired num_cv_folds = 10 param_names = ['n_neighbors', 'weights'] param_values = [range(1, 6, 1), ['uniform', 'distance']] param_string_types = [False, True] print('Running', clf_type, 'CV grid search tests...') for test_type in test_types: print('Running CV on dataset', test_type, '...') if test_type == 'MNIST': train, train_label, _, _ = util.MNIST_loader(1, train_size, 1, test_size, echo=False) else: train, train_label, _, _ = util.USPS_loader(1, train_size, 1, test_size, echo=False) for param_name, param_value, param_str_type in zip( param_names, param_values, param_string_types): print('... on parameter', param_name) if rerun_CV: params = {param_name: param_value} np.random.seed(0) # need this, no random_state on CV and kNN # check unlisted default settings vs intended analysis # default n_neighbors=3 for the weights cv clf_cv = GridSearchCV(KNeighborsClassifier( algorithm='ball_tree', n_neighbors=3), param_grid=params, cv=num_cv_folds, verbose=1) util.run_CV(clf_cv, clf_type, test_type, train, train_label, param_name, param_value) # plot from files util.plotterB(str(clf_type + '_grid_search_' + param_name + '_mean_' + test_type + '.csv'), str(clf_type + '_grid_search_' + param_name + '_mean_std_' + test_type + '.csv'), str(param_name + ' (' + test_type + ')'), str('Accuracy (' + test_type + ')'), string=param_str_type)
def run_test(): """ set up a series of CV grid searches """ train_size=1000 test_size=1000 #test_types=['USPS'] #test_types=['MNIST'] test_types=['MNIST','USPS'] clf_type='MLP' rerun_CV=True #rerun_CV=False # setup grid search parameters # intentionally incomplete and restricted, change as desired num_cv_folds=10 param_names=['activation','hidden_layer_sizes'] param_values=[['logistic','tanh','relu'],range(10,60,10),] param_string_types=[True,False] print('Running',clf_type,'CV grid search tests...') for test_type in test_types: print('Running CV on dataset',test_type,'...') if test_type=='MNIST': train,train_label,_,_=util.MNIST_loader(1,train_size,1,test_size,echo=False) else: train,train_label,_,_=util.USPS_loader(1,train_size,1,test_size,echo=False) for param_name,param_value,param_str_type in zip(param_names,param_values,param_string_types): print('... on parameter',param_name) if rerun_CV: params={param_name:param_value} # check unlisted default settings vs intended analysis clf_cv=GridSearchCV(MLPClassifier(hidden_layer_sizes=100,solver='adam',learning_rate='adaptive',random_state=0), param_grid=params,cv=num_cv_folds,verbose=1) util.run_CV(clf_cv,clf_type,test_type,train,train_label,param_name,param_value) # plot from files util.plotterB(str(clf_type+'_grid_search_'+param_name+'_mean_'+test_type+'.csv'), str(clf_type+'_grid_search_'+param_name+'_mean_std_'+test_type+'.csv'), str(param_name+' ('+test_type+')'),str('Accuracy ('+test_type+')'), string=param_str_type)
def run_test(): """ set up a series of CV grid searches """ train_size=1000 test_size=1000 #test_types=['USPS'] #test_types=['MNIST'] test_types=['MNIST','USPS'] clf_type='Decision_Tree' rerun_CV=True #rerun_CV=False # setup grid search parameters # intentionally incomplete and restricted, change as desired num_cv_folds=10 param_names=['max_depth','min_samples_leaf'] param_values=[range(1,6,1),range(1,6,1)] print('Running',clf_type,'CV grid search tests...') for test_type in test_types: print('Running CV on dataset',test_type,'...') if test_type=='MNIST': train,train_label,_,_=util.MNIST_loader(1,train_size,1,test_size,echo=False) else: train,train_label,_,_=util.USPS_loader(1,train_size,1,test_size,echo=False) for param_name,param_value in zip(param_names,param_values): print('... on parameter',param_name) if rerun_CV: params={param_name:param_value} # check unlisted default settings vs intended analysis clf_cv=GridSearchCV(DecisionTreeClassifier(random_state=0,criterion='gini'), param_grid=params,cv=num_cv_folds,verbose=1) #clf_cv=GridSearchCV(DecisionTreeClassifier(random_state=0,criterion='entropy'),param_grid=params,cv=num_cv_folds) util.run_CV(clf_cv,clf_type,test_type,train,train_label,param_name,param_value) # plot from files util.plotterB(str(clf_type+'_grid_search_'+param_name+'_mean_'+test_type+'.csv'), str(clf_type+'_grid_search_'+param_name+'_mean_std_'+test_type+'.csv'), str(param_name+' ('+test_type+')'),str('Accuracy ('+test_type+')'))
def run_test(): """ set up a series of CV grid searches """ # reduced size to shorten execution time, change as desired train_size = 500 test_size = 500 #test_types=['USPS'] #test_types=['MNIST'] test_types = ['MNIST', 'USPS'] clf_type = 'SVM' rerun_CV = True #rerun_CV=False # setup grid search parameters # intentionally incomplete and restricted, change as desired # PCA is to shorten run time, but might not be advisable for analysis # reduced cv to shorten execution time, change as desired num_cv_folds = 5 num_pca = None #num_pca=30 param_names = ['estimator__kernel', 'estimator__C'] param_values = [['linear', 'rbf', 'poly'], np.logspace(0, 5, 10)] param_string_types = [True, False] print('Running', clf_type, 'CV grid search tests...') print('... some settings might take a very long time!') for test_type in test_types: print('Running CV on dataset', test_type, '...') if test_type == 'MNIST': train, train_label, _, _ = util.MNIST_loader(1, train_size, 1, test_size, echo=False) else: train, train_label, _, _ = util.USPS_loader(1, train_size, 1, test_size, echo=False) # some datasets/settings might need PCA to shorten execution print('... running PCA pre-processing') if num_pca is not None: pca = PCA(n_components=num_pca) train = pca.fit_transform(train) for param_name, param_value, param_str_type in zip( param_names, param_values, param_string_types): print('... on parameter', param_name) if rerun_CV: params = {param_name: param_value} # check unlisted default settings vs intended analysis clf_cv = GridSearchCV(OneVsRestClassifier( estimator=SVC(C=1.0, degree=3, random_state=0)), param_grid=params, cv=num_cv_folds, verbose=2) util.run_CV(clf_cv, clf_type, test_type, train, train_label, param_name, param_value) # plot from files util.plotterB(str(clf_type + '_grid_search_' + param_name + '_mean_' + test_type + '.csv'), str(clf_type + '_grid_search_' + param_name + '_mean_std_' + test_type + '.csv'), str(param_name + ' (' + test_type + ')'), str('Accuracy (' + test_type + ')'), string=param_str_type, log_scale=True)
def run_test(test_type='MNIST'):
    """ run test over different training sizes """
    print('Running tests, all classifiers over different training sizes...')
    # set default (check scikit-learn docs for defaults) or simple/reasonable classifier settings
    # replace with tuned settings if desired
    # note that different tuned settings might be needed for best performance on each dataset
    clf1 = DecisionTreeClassifier(random_state=0)
    clf2 = KNeighborsClassifier(n_neighbors=1)
    clf3 = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0,
                                      max_depth=10, random_state=0)
    clf4 = svm.SVC(kernel='linear', C=1.0, random_state=0)
    clf5 = MLPClassifier(hidden_layer_sizes=100, solver='adam', activation='relu',
                         learning_rate='adaptive', random_state=0)
    clf_list = [clf1, clf2, clf3, clf4, clf5]
    clf_names = ['Decision Tree', 'kNN', 'Boosted Trees', 'SVM', 'MLP']

    # CSV output names: <dataset>_<classifier>_by_training_size_vs_<split>_data.csv
    prefix = 'MNIST_' if test_type == 'MNIST' else 'USPS_'
    clf_file_tags = ['Decision_Trees', 'kNN', 'Boosted_Trees', 'SVM', 'MLP']
    f_names_train = [prefix + tag + '_by_training_size_vs_train_data.csv'
                     for tag in clf_file_tags]
    f_names_val = [prefix + tag + '_by_training_size_vs_validation_data.csv'
                   for tag in clf_file_tags]
    f_names_test = [prefix + tag + '_by_training_size_vs_test_data.csv'
                    for tag in clf_file_tags]

    # define train and test size as desired
    train_size = 1000
    test_size = 1000
    for clf, clf_name, f_name_train, f_name_val, f_name_test in zip(
            clf_list, clf_names, f_names_train, f_names_val, f_names_test):
        print('Running', clf_name, '...')
        err_train_list = []
        err_val_list = []
        err_test_list = []
        # test different values for the training size
        train_size_list = np.arange(100, 1000, 100)
        if test_type == 'MNIST':
            train, train_label, test, test_label = util.MNIST_loader(1, train_size, 1, test_size, echo=False)
        else:
            train, train_label, test, test_label = util.USPS_loader(1, train_size, 1, test_size, echo=False)
        X_train, X_val, y_train, y_val = train_test_split(train, train_label,
                                                          test_size=0.2, random_state=0)
        print('... train and val set size', X_train.shape, X_val.shape)
        # note: with an 80/20 split of 1000 samples, X_train has 800 rows,
        # so the i=900 slice below is silently clipped to the full 800
        for i in train_size_list:
            clf.fit(X_train[:i], y_train[:i])
            acc_train = clf.score(X_train[:i], y_train[:i])
            acc_val = clf.score(X_val, y_val)
            acc_test = clf.score(test, test_label)
            err_train_list.append(1.0 - acc_train)
            err_val_list.append(1.0 - acc_val)
            err_test_list.append(1.0 - acc_test)
            #print('... train, val, and test accuracy (error rate) at train size:',
            #      acc_train, acc_val, acc_test, i)
        print('... done, min_train, max_train', np.min(err_train_list), np.max(err_train_list))
        print('... done, min_val, max_val', np.min(err_val_list), np.max(err_val_list))
        print('... done, min_test, max_test', np.min(err_test_list), np.max(err_test_list))
        df_train = pd.DataFrame({'Classifier': [clf_name] * len(err_train_list),
                                 'Size': train_size_list,
                                 'Error': err_train_list})
        df_val = pd.DataFrame({'Classifier': [clf_name] * len(err_val_list),
                               'Size': train_size_list,
                               'Error': err_val_list})
        df_test = pd.DataFrame({'Classifier': [clf_name] * len(err_test_list),
                                'Size': train_size_list,
                                'Error': err_test_list})
        df_train.to_csv(f_name_train, index=False, header=True)
        df_val.to_csv(f_name_val, index=False, header=True)
        df_test.to_csv(f_name_test, index=False, header=True)
    return f_names_train, f_names_val, f_names_test
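# The returned file lists can then be turned into the usual learning-curve
# plots. A minimal usage sketch with pandas/matplotlib, assuming the
# 'Classifier', 'Size', and 'Error' column names written above (the
# overlay-all-classifiers layout is a choice here, not prescribed by run_test):

import matplotlib.pyplot as plt

if __name__ == '__main__':
    f_names_train, f_names_val, f_names_test = run_test(test_type='MNIST')
    # overlay validation error vs training size for all five classifiers
    for f_name in f_names_val:
        df = pd.read_csv(f_name)
        plt.plot(df['Size'], df['Error'], marker='o', label=df['Classifier'][0])
    plt.xlabel('Training size')
    plt.ylabel('Validation error rate (MNIST)')
    plt.legend()
    plt.show()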