def main():
    description = ("An integrated sklearn API to run training and prediction. "
                   "Simple example: ./train_pred.py -i TRAIN_INPUT -t TEST_INPUT -m svm -o PREDICT")
    parser = load_parser(description)
    parser.add_argument('-t' , '--test'        , dest='test_filename'  , required=True,
                        help='Specify the test file path')
    parser.add_argument('-o' , '--output'      , dest='output_filename',
                        help='Specify the output predict file path [optional]')
    parser.add_argument('-om', '--output-model', dest='model_filename' ,
                        help='Specify the output model file path [optional]')
    parser.add_argument('-op', '--output-prob' , dest='prob_filename'  ,
                        help='Specify the output probability file path [optional]')
    opts = parser.parse_args(sys.argv[1:])

    # pre-check options before loading data (guard: these options may be None)
    opts.model = opts.model.upper()
    if opts.kernel is not None:
        opts.kernel = opts.kernel.lower()
    if opts.base_estimator is not None:
        opts.base_estimator = opts.base_estimator.upper()
    check_options(opts)

    # Loading training data
    print "Loading %s ..." % opts.train_filename
    x_train, y_train = load_svmlight_file(opts.train_filename)
    x_train = x_train.todense()
    (N, D) = x_train.shape
    print "training data dimension = (%d, %d)" % (N, D)

    # Loading testing data
    print "Loading %s ..." % opts.test_filename
    x_test, y_test = load_svmlight_file(opts.test_filename)
    x_test = x_test.todense()
    (N, D) = x_test.shape
    print "testing data dimension = (%d, %d)" % (N, D)

    # feature normalization
    if opts.normalized:
        if opts.normalized == 1:
            scaler_filename = opts.train_filename + '.scaler-11.pkl'
        elif opts.normalized == 2:
            scaler_filename = opts.train_filename + '.scaler-01.pkl'
        elif opts.normalized == 3:
            scaler_filename = opts.train_filename + '.scaler-std.pkl'
        else:
            print "Error! Unknown normalization method (%d)!" % opts.normalized
            print "Choice: 1 for [-1, 1], 2 for [0, 1], 3 for standard normalization"
            traceback.print_stack()
            sys.exit(1)
        scaler = load_scaler(scaler_filename, x_train, opts.normalized)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

    # dimension selection
    if opts.dim is not None:
        opts.dim = int(opts.dim)
        if opts.dim >= D:
            print "Warning! Selected dimension (%d) >= max data dimension (%d), using original dimension." % (opts.dim, D)
            opts.dim = D
        else:
            x_train = x_train[:, :opts.dim]
            x_test = x_test[:, :opts.dim]
            (N, D) = x_train.shape
            print "Using first %d features ..." % opts.dim

    outputProb = (opts.prob_filename is not None)

    # Train and predict
    print "Training and Predicting ..."
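    # Model dispatch: each branch below fills `arg` with keyword arguments for
    # one sklearn estimator, then delegates to the shared train()/predict()
    # helpers (sketched after this function).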
    if opts.model == 'SVM':
        arg = {'kernel': opts.kernel, 'probability': outputProb}
        if opts.C is None:
            arg['C'] = 1.0
        else:
            arg['C'] = float(opts.C)
        if opts.gamma is None:
            arg['gamma'] = 1.0 / D
        else:
            arg['gamma'] = float(opts.gamma)

        ############################################################
        ##                        RBF-SVM                         ##
        ############################################################
        if opts.kernel == 'rbf':
            print 'Run %s-SVM with C = %f, gamma = %f' % (opts.kernel, arg['C'], arg['gamma'])
            clf = train(SVC, arg, x_train, y_train, opts.model_filename)
            acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
            print 'acc = %f' % acc
        ############################################################
        ##                     polynomial-SVM                     ##
        ############################################################
        elif opts.kernel == 'poly':
            if opts.coef0 is None:
                arg['coef0'] = 0
            else:
                arg['coef0'] = float(opts.coef0)
            if opts.degree is None:
                arg['degree'] = 3
            else:
                arg['degree'] = int(opts.degree)
            print 'Run %s-SVM with C = %f, coef0 = %f, gamma = %f, degree = %d' \
                  % (opts.kernel, arg['C'], arg['coef0'], arg['gamma'], arg['degree'])
            clf = train(SVC, arg, x_train, y_train, opts.model_filename)
            acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
            print 'acc = %f' % acc
        ############################################################
        ##                       sigmoid-SVM                      ##
        ############################################################
        elif opts.kernel == 'sigmoid':
            if opts.coef0 is None:
                arg['coef0'] = 0
            else:
                arg['coef0'] = float(opts.coef0)
            print 'Run %s-SVM with C = %f, coef0 = %f, gamma = %f' \
                  % (opts.kernel, arg['C'], arg['coef0'], arg['gamma'])
            clf = train(SVC, arg, x_train, y_train, opts.model_filename)
            acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
            print 'acc = %f' % acc
        else:
            print "Error! Unknown kernel %s!" % opts.kernel
            traceback.print_stack()
            sys.exit(1)

    ############################################################
    ##                       linear-SVM                        ##
    ############################################################
    elif opts.model == 'LINEARSVM':
        if outputProb:
            print "Warning! Probability output is not supported in LinearSVM!"
            outputProb = False
        arg = {}
        if opts.penalty is None:
            arg['penalty'] = 'l2'
        else:
            arg['penalty'] = opts.penalty
        if opts.penalty == 'l1':
            arg['dual'] = False
        if opts.loss is None:
            arg['loss'] = 'l2'
        else:
            arg['loss'] = opts.loss
        if opts.C is None:
            arg['C'] = 1.0 / D
        else:
            arg['C'] = float(opts.C)
        print 'Run Linear-SVM with C = %f, penalty = %s, loss = %s' % (arg['C'], arg['penalty'], arg['loss'])
        clf = train(LinearSVC, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename)
        print "acc = %f" % acc
    ############################################################
    ##                  Linear model with SGD                  ##
    ############################################################
    elif opts.model == 'SGD':
        if outputProb:
            print "Warning! Probability output is not supported in SGD!"
            outputProb = False
        arg = {}
        if opts.penalty is None:
            arg['penalty'] = 'l2'
        else:
            arg['penalty'] = opts.penalty
        if opts.loss is None:
            arg['loss'] = 'hinge'
        else:
            arg['loss'] = opts.loss
        if opts.alpha is None:
            arg['alpha'] = 0.0001
        else:
            arg['alpha'] = float(opts.alpha)
        print 'Run SGD with alpha = %f, penalty = %s, loss = %s' % (arg['alpha'], arg['penalty'], arg['loss'])
        clf = train(SGDClassifier, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename)
        print "acc = %f" % acc

    ############################################################
    ##                      Random Forest                      ##
    ############################################################
    elif opts.model == 'RF':
        arg = {}
        if opts.n_estimators is None:
            arg['n_estimators'] = 100
        else:
            arg['n_estimators'] = int(opts.n_estimators)
        print 'Run RandomForest with n_estimators = %d' % arg['n_estimators']
        clf = train(RandomForestClassifier, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
        print "acc = %f" % acc

    ############################################################
    ##                        AdaBoost                         ##
    ############################################################
    elif opts.model == 'ADABOOST':
        arg = {}
        be_DT        = DecisionTreeClassifier()
        be_SVC       = SVC(probability=True)
        be_SGD_huber = SGDClassifier(loss='modified_huber')
        be_SGD_log   = SGDClassifier(loss='log')
        if opts.base_estimator is None or opts.base_estimator == 'DT':
            be = [be_DT]
        elif opts.base_estimator == 'SVM':
            be = [be_SVC]
        elif opts.base_estimator == 'SGD' or opts.base_estimator == 'SGD-HUBER':
            be = [be_SGD_huber]
        elif opts.base_estimator == 'SGD-LOG':
            be = [be_SGD_log]
        else:
            print "Unknown base estimator %s!" % opts.base_estimator
            traceback.print_stack()
            sys.exit(1)
        arg['base_estimator'] = be[0]   # pass the selected base estimator to AdaBoostClassifier
        if opts.n_estimators is None:
            arg['n_estimators'] = 100
        else:
            arg['n_estimators'] = int(opts.n_estimators)
        if opts.learning_rate is None:
            arg['learning_rate'] = 1.0
        else:
            arg['learning_rate'] = float(opts.learning_rate)
        print 'Run AdaBoost with n_estimators = %d, learning_rate = %f' % (arg['n_estimators'], arg['learning_rate'])
        clf = train(AdaBoostClassifier, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
        print "acc = %f" % acc

    ############################################################
    ##                      GradientBoost                      ##
    ############################################################
    elif opts.model == 'GB':
        arg = {}
        if opts.n_estimators is None:
            arg['n_estimators'] = 100
        else:
            arg['n_estimators'] = int(opts.n_estimators)
        if opts.learning_rate is None:
            arg['learning_rate'] = 0.1
        else:
            arg['learning_rate'] = float(opts.learning_rate)
        print 'Run GradientBoosting with n_estimators = %d, learning_rate = %f' \
              % (arg['n_estimators'], arg['learning_rate'])
        clf = train(GradientBoostingClassifier, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
        print "acc = %f" % acc

    ############################################################
    ##                           KNN                           ##
    ############################################################
    elif opts.model == 'KNN':
        arg = {}
        if opts.n_neighbors is None:
            arg['n_neighbors'] = 5
        else:
            arg['n_neighbors'] = int(opts.n_neighbors)
        if opts.degree is None:
            arg['p'] = 2
        else:
            arg['p'] = int(opts.degree)   # --degree is reused as the Minkowski power p
        if opts.weights is None:
            arg['weights'] = 'distance'
        else:
            arg['weights'] = opts.weights
        print 'Run KNN with n_neighbors = %d, weights = %s, power of distance metric = %d' \
              % (arg['n_neighbors'], arg['weights'], arg['p'])
        clf = train(KNeighborsClassifier, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
        print "acc = %f" % acc

    ############################################################
    ##                   Logistic Regression                   ##
    ############################################################
    elif opts.model == 'LR':
        arg = {}
        if opts.penalty is None:
            arg['penalty'] = 'l2'
        else:
            arg['penalty'] = opts.penalty
        if opts.C is None:
            arg['C'] = 1.0
        else:
            arg['C'] = float(opts.C)
        if arg['penalty'] == 'l2':
            arg['dual'] = True
        print 'Run Logistic Regression with C = %f, penalty = %s' % (arg['C'], arg['penalty'])
        clf = train(LogisticRegression, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename, opts.prob_filename)
        print "acc = %f" % acc

    ############################################################
    ##                     Ridge Regression                    ##
    ############################################################
    elif opts.model == 'RIDGE':
        if outputProb:
            # RidgeClassifier has no predict_proba
            print "Warning! Probability output is not supported in Ridge!"
            outputProb = False
        arg = {}
        if opts.alpha is None:
            arg['alpha'] = 1.0
        else:
            arg['alpha'] = float(opts.alpha)
        print 'Run Ridge Regression with alpha = %f' % arg['alpha']
        clf = train(RidgeClassifier, arg, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename)
        print "acc = %f" % acc

    ############################################################
    ##                  Gaussian Naive Bayes                   ##
    ############################################################
    elif opts.model == 'GNB':
        print 'Run Gaussian Naive Bayes'
        clf = train(GaussianNB, {}, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename)
        print 'acc = %f' % acc

    ############################################################
    ##               Linear Discriminant Analysis              ##
    ############################################################
    elif opts.model == 'LDA':
        print 'Run Linear Discriminant Analysis'
        clf = train(LDA, {}, x_train, y_train, opts.model_filename)
        acc = predict(clf, x_test, y_test, opts.output_filename)
        print 'acc = %f' % acc

    else:
        sys.stderr.write('Error: invalid model %s\n' % opts.model)
        traceback.print_stack()
        sys.exit(1)
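# --------------------------------------------------------------------------
# train() and predict() are shared helpers used by both train_pred.py and
# train_valid.py; their definitions are not part of this listing. The
# following is a minimal sketch inferred purely from the call sites above
# (estimator class, kwarg dict, data, optional output paths). It is an
# assumption, not the project's actual implementation.
# --------------------------------------------------------------------------
from sklearn.externals import joblib          # old sklearn layout
from sklearn.metrics import accuracy_score

def train(estimator_cls, arg, x_train, y_train, model_filename=None):
    """Fit estimator_cls(**arg) on the training data; optionally pickle the model."""
    clf = estimator_cls(**arg)
    clf.fit(x_train, y_train)
    if model_filename is not None:
        joblib.dump(clf, model_filename)
    return clf

def predict(clf, x_test, y_test, output_filename=None, prob_filename=None):
    """Predict labels, optionally dump predictions/probabilities, return accuracy."""
    y_pred = clf.predict(x_test)
    if output_filename is not None:
        with open(output_filename, 'w') as f:
            for y in y_pred:
                f.write('%g\n' % y)
    if prob_filename is not None and hasattr(clf, 'predict_proba'):
        prob = clf.predict_proba(x_test)
        with open(prob_filename, 'w') as f:
            for row in prob:
                f.write(' '.join('%g' % p for p in row) + '\n')
    return accuracy_score(y_test, y_pred)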
def main():
    description = ("An integrated sklearn API to run N-fold training and cross validation "
                   "with multi-threading. Simple example: ./train_valid.py -i INPUT -m svm")
    parser = load_parser(description)
    parser.add_argument('-f' , '--fold'  , dest='fold'  , type=int, default=3,
                        help='Number of fold in cross_validation [default = 3]')
    parser.add_argument('-th', '--thread', dest='thread', type=int, default=8,
                        help='Number of thread to run in parallel [default = 8]')
    parser.add_argument('-log2c' , dest='log2_C'    , help='Grid search {begin:end:step} for log2(C)')
    parser.add_argument('-log2g' , dest='log2_gamma', help='Grid search {begin:end:step} for log2(gamma)')
    parser.add_argument('-log2r' , dest='log2_coef0', help='Grid search {begin:end:step} for log2(coef0)')
    parser.add_argument('-log2lr', dest='log2_lr'   , help='Grid search {begin:end:step} for log2(learning_rate)')
    parser.add_argument('-log2a' , dest='log2_alpha', help='Grid search {begin:end:step} for log2(alpha)')
    opts = parser.parse_args(sys.argv[1:])

    # pre-check options before loading data (guard: these options may be None)
    opts.model = opts.model.upper()
    if opts.kernel is not None:
        opts.kernel = opts.kernel.lower()
    if opts.base_estimator is not None:
        opts.base_estimator = opts.base_estimator.upper()
    check_options(opts)

    # Loading training data
    print "Loading %s ..." % opts.train_filename
    x_train, y_train = load_svmlight_file(opts.train_filename)
    x_train = x_train.todense()
    (N, D) = x_train.shape
    print "training data dimension = (%d, %d)" % (N, D)

    # feature normalization
    if opts.normalized:
        if opts.normalized == 1:
            scaler_filename = opts.train_filename + '.scaler-11.pkl'
        elif opts.normalized == 2:
            scaler_filename = opts.train_filename + '.scaler-01.pkl'
        elif opts.normalized == 3:
            scaler_filename = opts.train_filename + '.scaler-std.pkl'
        else:
            print "Error! Unknown normalization method (%d)!" % opts.normalized
            print "Choice: 1 for [-1, 1], 2 for [0, 1], 3 for standard normalization"
            traceback.print_stack()
            sys.exit(1)
        scaler = load_scaler(scaler_filename, x_train, opts.normalized)
        x_train = scaler.transform(x_train)

    # dimension grid search
    if opts.dim is None:
        dim_list = [D]
    else:
        dim_list = parse_grid(opts.dim, 0)
    x_train_all = x_train
    for dim in dim_list:
        if dim > D:
            print "Warning! Selected dimension (%d) > max data dimension (%d), using original dimension." % (dim, D)
            dim = D
        x_train = x_train_all[:, :dim]
        print "Using first %d features ..." % dim
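        # Every model branch below is evaluated inside this dim loop, so a
        # dimension grid multiplies the number of CV runs by len(dim_list).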
        # Training and Validation
        if opts.model == 'SVM':
            # parameter C
            if opts.C is not None:
                c_list = parse_grid(opts.C, 0, float)
            elif opts.log2_C is not None:
                c_list = parse_grid(opts.log2_C, 2)          # base = 2
            else:
                # default = {1, 2, 4, 8, 16, 32, 64, 128}
                c_list = [2 ** i for i in range(0, 8)]
            # parameter gamma
            if opts.gamma is not None:
                gamma_list = parse_grid(opts.gamma, 0, float)
            elif opts.log2_gamma is not None:
                gamma_list = parse_grid(opts.log2_gamma, 2)  # base = 2
            else:
                # default = {0.0625, 0.25, 1, 4, 16}
                gamma_list = [2 ** i for i in range(-4, 5, 2)]

            ############################################################
            ##                        RBF-SVM                         ##
            ############################################################
            if opts.kernel == 'rbf':
                arg_list = list(ParameterGrid({'kernel': [opts.kernel], 'gamma': gamma_list, 'C': c_list}))
                (acc_max, arg_best) = parallel_cross_validation(SVC, 'SVM', arg_list,
                                                                x_train, y_train, opts.fold, opts.thread)
                print "#####################################################################################"
                print "max_acc = %f --- C = %f, gamma = %f" % (acc_max, arg_best['C'], arg_best['gamma'])
                print "#####################################################################################"
            ############################################################
            ##                     polynomial-SVM                     ##
            ############################################################
            elif opts.kernel == 'poly':
                if opts.coef0 is not None:
                    coef0_list = parse_grid(opts.coef0, 0, float)
                elif opts.log2_coef0 is not None:
                    coef0_list = parse_grid(opts.log2_coef0, 2)  # base = 2
                else:
                    # default = {0.0625, 0.25, 1, 4, 16}
                    coef0_list = [2 ** i for i in range(-4, 5, 2)]
                if opts.degree is not None:
                    degree_list = parse_grid(opts.degree, 0)
                else:
                    # default = {1, 2, 3, 4}
                    degree_list = range(1, 5)
                arg_list = list(ParameterGrid({'kernel': [opts.kernel], 'degree': degree_list,
                                               'coef0': coef0_list, 'gamma': gamma_list, 'C': c_list}))
                (acc_max, arg_best) = parallel_cross_validation(SVC, 'SVM', arg_list,
                                                                x_train, y_train, opts.fold, opts.thread)
                print "#####################################################################################"
                print "max_acc = %f --- C = %f, coef0 = %f, gamma = %f, degree = %d" \
                      % (acc_max, arg_best['C'], arg_best['coef0'], arg_best['gamma'], arg_best['degree'])
                print "#####################################################################################"
            ############################################################
            ##                       sigmoid-SVM                      ##
            ############################################################
            elif opts.kernel == 'sigmoid':
                if opts.coef0 is not None:
                    coef0_list = parse_grid(opts.coef0, 0, float)
                elif opts.log2_coef0 is not None:
                    coef0_list = parse_grid(opts.log2_coef0, 2)  # base = 2
                else:
                    # default = {0.0625, 0.25, 1, 4, 16}
                    coef0_list = [2 ** i for i in range(-4, 5, 2)]
                arg_list = list(ParameterGrid({'kernel': [opts.kernel], 'coef0': coef0_list,
                                               'gamma': gamma_list, 'C': c_list}))
                (acc_max, arg_best) = parallel_cross_validation(SVC, 'SVM', arg_list,
                                                                x_train, y_train, opts.fold, opts.thread)
                print "#####################################################################################"
                print "max_acc = %f --- C = %f, coef0 = %f, gamma = %f" \
                      % (acc_max, arg_best['C'], arg_best['coef0'], arg_best['gamma'])
                print "#####################################################################################"
            else:
                print "Error! Unknown kernel %s!" % opts.kernel
                traceback.print_stack()
                sys.exit(1)
        ############################################################
        ##                       linear-SVM                        ##
        ############################################################
        elif opts.model == 'LINEARSVM':
            if opts.penalty is None:
                penalty_list = ['l2', 'l1']
            else:
                penalty_list = [opts.penalty]
            if opts.loss is None:
                loss_list = ['l2', 'l1']
            else:
                loss_list = [opts.loss]
            # parameter C
            if opts.C is not None:
                c_list = parse_grid(opts.C, 0, float)
            elif opts.log2_C is not None:
                c_list = parse_grid(opts.log2_C, 2)  # base = 2
            else:
                # default = {1, 2, 4, 8, 16, 32, 64, 128}
                c_list = [2 ** i for i in range(0, 8)]
            arg_list_pre = list(ParameterGrid({'penalty': penalty_list, 'loss': loss_list, 'C': c_list}))
            arg_list = []
            for arg in arg_list_pre:
                if arg['penalty'] == 'l1' and arg['loss'] == 'l1':
                    # combination not supported
                    continue
                if arg['penalty'] == 'l1':
                    arg['dual'] = False
                arg_list.append(arg)
            (acc_max, arg_best) = parallel_cross_validation(LinearSVC, 'Linear-SVM', arg_list,
                                                            x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- C = %f, penalty = %s, loss = %s" \
                  % (acc_max, arg_best['C'], arg_best['penalty'], arg_best['loss'])
            print "#####################################################################################"

        ############################################################
        ##                  Linear model with SGD                  ##
        ############################################################
        elif opts.model == 'SGD':
            if opts.alpha is not None:
                alpha_list = parse_grid(opts.alpha, 0, float)
            elif opts.log2_alpha is not None:
                alpha_list = parse_grid(opts.log2_alpha, 2)  # base = 2
            else:
                # default = {0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4}
                alpha_list = [2 ** i for i in range(-5, 3)]
            if opts.loss is None:
                loss_list = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
                             'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
            else:
                loss_list = [opts.loss]
            if opts.penalty is None:
                penalty_list = ['l2', 'l1', 'elasticnet']
            else:
                penalty_list = [opts.penalty]
            arg_list = list(ParameterGrid({'alpha': alpha_list, 'loss': loss_list, 'penalty': penalty_list}))
            (acc_max, arg_best) = parallel_cross_validation(SGDClassifier, 'Linear-SGD', arg_list,
                                                            x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- alpha = %f, loss = %s, penalty = %s" \
                  % (acc_max, arg_best['alpha'], arg_best['loss'], arg_best['penalty'])
            print "#####################################################################################"

        ############################################################
        ##                      Random Forest                      ##
        ############################################################
        elif opts.model == 'RF':
            if opts.n_estimators is not None:
                ne_list = parse_grid(opts.n_estimators, 0)
            else:
                # default = {50, 100, 150, 200, 250, 300}
                ne_list = [10 * i for i in range(5, 31, 5)]
            arg_list = list(ParameterGrid({'n_estimators': ne_list}))
            (acc_max, arg_best) = parallel_cross_validation(RandomForestClassifier, 'Random Forest',
                                                            arg_list, x_train, y_train,
                                                            opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- n_estimators = %d" % (acc_max, arg_best['n_estimators'])
            print "#####################################################################################"

        ############################################################
        ##                        AdaBoost                         ##
        ############################################################
        elif opts.model == 'ADABOOST':
            be_DT        = DecisionTreeClassifier()
            be_SVC       = SVC(probability=True)
            be_SGD_huber = SGDClassifier(loss='modified_huber')
            be_SGD_log   = SGDClassifier(loss='log')
            if opts.base_estimator is None:
                be = [be_DT, be_SVC, be_SGD_huber, be_SGD_log]
            elif opts.base_estimator == 'DT':
                be = [be_DT]
            elif opts.base_estimator == 'SVM':
                be = [be_SVC]
            elif opts.base_estimator == 'SGD':
                be = [be_SGD_huber, be_SGD_log]
            elif opts.base_estimator == 'SGD-HUBER':
                be = [be_SGD_huber]
            elif opts.base_estimator == 'SGD-LOG':
                be = [be_SGD_log]
            else:
                print "Unknown base estimator %s!" % opts.base_estimator
                traceback.print_stack()
                sys.exit(1)
            if opts.n_estimators is not None:
                ne_list = parse_grid(opts.n_estimators, 0)
            else:
                # default = {50, 100, 150, 200, 250, 300}
                ne_list = [10 * i for i in range(5, 31, 5)]
            if opts.learning_rate is not None:
                lr_list = parse_grid(opts.learning_rate, 0, float)
            elif opts.log2_lr is not None:
                lr_list = parse_grid(opts.log2_lr, 2)  # base = 2
            else:
                # default = {0.25, 0.5, 1, 2, 4}
                lr_list = [2 ** i for i in range(-2, 3)]
            arg_list = list(ParameterGrid({'base_estimator': be, 'n_estimators': ne_list,
                                           'learning_rate': lr_list}))
            (acc_max, arg_best) = parallel_cross_validation(AdaBoostClassifier, 'AdaBoost', arg_list,
                                                            x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- base_estimator = %s, n_estimators = %d, learning_rate = %f" \
                  % (acc_max, arg_best['base_estimator'], arg_best['n_estimators'], arg_best['learning_rate'])
            print "#####################################################################################"

        ############################################################
        ##                      GradientBoost                      ##
        ############################################################
        elif opts.model == 'GB':
            if opts.n_estimators is not None:
                ne_list = parse_grid(opts.n_estimators, 0)
            else:
                # default = {50, 100, 150, 200, 250, 300}
                ne_list = [10 * i for i in range(5, 31, 5)]
            if opts.learning_rate is not None:
                lr_list = parse_grid(opts.learning_rate, 0, float)
            elif opts.log2_lr is not None:
                lr_list = parse_grid(opts.log2_lr, 2)  # base = 2
            else:
                # default = {0.25, 0.5, 1, 2, 4}
                lr_list = [2 ** i for i in range(-2, 3)]
            arg_list = list(ParameterGrid({'n_estimators': ne_list, 'learning_rate': lr_list}))
            (acc_max, arg_best) = parallel_cross_validation(GradientBoostingClassifier, 'GradientBoosting',
                                                            arg_list, x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- n_estimators = %d, learning_rate = %f" \
                  % (acc_max, arg_best['n_estimators'], arg_best['learning_rate'])
            print "#####################################################################################"

        ############################################################
        ##                           KNN                           ##
        ############################################################
        elif opts.model == 'KNN':
            if opts.n_neighbors is not None:
                nn_list = parse_grid(opts.n_neighbors, 0)
            else:
                # default = {5, 15, 25, 35, 45}
                nn_list = [5 + 10 * i for i in range(5)]
            if opts.degree is None:
                p_list = [1, 2]
            else:
                p_list = [int(opts.degree)]
            if opts.weights is None:
                weight_list = ['distance', 'uniform']
            else:
                weight_list = [opts.weights]
            arg_list = list(ParameterGrid({'n_neighbors': nn_list, 'p': p_list, 'weights': weight_list}))
            (acc_max, arg_best) = parallel_cross_validation(KNeighborsClassifier, 'KNN', arg_list,
                                                            x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- n_neighbors = %d, weights = %s, p = %d" \
                  % (acc_max, arg_best['n_neighbors'], arg_best['weights'], arg_best['p'])
            print "#####################################################################################"

        ############################################################
        ##                   Logistic Regression                   ##
        ############################################################
        elif opts.model == 'LR':
            if opts.penalty is None:
                penalty_list = ['l2', 'l1']
            else:
                penalty_list = [opts.penalty]
            if opts.C is not None:
                c_list = parse_grid(opts.C, 0, float)
            elif opts.log2_C is not None:
                c_list = parse_grid(opts.log2_C, 2)  # base = 2
            else:
                # default = {1, 2, 4, 8, 16, 32, 64, 128}
                c_list = [2 ** i for i in range(0, 8)]
            arg_list_pre = list(ParameterGrid({'penalty': penalty_list, 'C': c_list}))
            arg_list = []
            for arg in arg_list_pre:
                if arg['penalty'] == 'l2':
                    arg['dual'] = True
                arg_list.append(arg)
            (acc_max, arg_best) = parallel_cross_validation(LogisticRegression, 'Logistic Regression',
                                                            arg_list, x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- C = %f, penalty = %s" % (acc_max, arg_best['C'], arg_best['penalty'])
            print "#####################################################################################"

        ############################################################
        ##                     Ridge Regression                    ##
        ############################################################
        elif opts.model == 'RIDGE':
            if opts.alpha is not None:
                alpha_list = parse_grid(opts.alpha, 0, float)
            elif opts.log2_alpha is not None:
                alpha_list = parse_grid(opts.log2_alpha, 2)  # base = 2
            else:
                # default = {0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4}
                alpha_list = [2 ** i for i in range(-5, 3)]
            arg_list = list(ParameterGrid({'alpha': alpha_list}))
            (acc_max, arg_best) = parallel_cross_validation(RidgeClassifier, 'Ridge', arg_list,
                                                            x_train, y_train, opts.fold, opts.thread)
            print "#####################################################################################"
            print "max_acc = %f --- alpha = %f" % (acc_max, arg_best['alpha'])
            print "#####################################################################################"

        ############################################################
        ##                  Gaussian Naive Bayes                   ##
        ############################################################
        elif opts.model == 'GNB':
            print 'Run Gaussian Naive Bayes (%d-fold CV)' % opts.fold
            (acc, arg) = cross_validation((GaussianNB, 'GNB', {}, x_train, y_train, opts.fold))
            print "#####################################################################################"
            print 'max_acc = %f' % acc
            print "#####################################################################################"
        ############################################################
        ##               Linear Discriminant Analysis              ##
        ############################################################
        elif opts.model == 'LDA':
            print 'Run Linear Discriminant Analysis (%d-fold CV)' % opts.fold
            (acc, arg) = cross_validation((LDA, 'LDA', {}, x_train, y_train, opts.fold))
            print "#####################################################################################"
            print "max_acc = %f" % acc
            print "#####################################################################################"

        else:
            sys.stderr.write('Error: invalid model %s\n' % opts.model)
            traceback.print_stack()
            sys.exit(1)
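# --------------------------------------------------------------------------
# parse_grid(), cross_validation() and parallel_cross_validation() are shared
# helpers whose definitions are not part of this listing. The sketches below
# are assumptions inferred from the call sites above, not the project's
# actual code: parse_grid() expands a "{begin:end:step}" spec (optionally as
# exponents of a base), cross_validation() scores one parameter set with
# N-fold CV, and parallel_cross_validation() fans the candidate list out to a
# worker pool and keeps the best (accuracy, kwargs) pair.
# --------------------------------------------------------------------------
import numpy as np
from multiprocessing import Pool
from sklearn.cross_validation import cross_val_score  # old sklearn layout

def parse_grid(spec, base, cast=int):
    """Expand '{begin:end:step}' into a list; base > 0 maps each value v to base**v."""
    begin, end, step = [float(v) for v in spec.strip('{}').split(':')]
    values = np.arange(begin, end + step, step)
    if base > 0:
        return [base ** v for v in values]
    return [cast(v) for v in values]

def cross_validation(task):
    """Score one (estimator_cls, name, arg, x, y, fold) task; return (mean_acc, arg)."""
    estimator_cls, name, arg, x, y, fold = task
    acc = cross_val_score(estimator_cls(**arg), x, y, cv=fold).mean()
    print '%s %s: acc = %f' % (name, arg, acc)
    return (acc, arg)

def parallel_cross_validation(estimator_cls, name, arg_list, x, y, fold, thread):
    """Evaluate every kwarg dict in arg_list with fold-fold CV on a pool of workers."""
    pool = Pool(processes=thread)
    tasks = [(estimator_cls, name, arg, x, y, fold) for arg in arg_list]
    results = pool.map(cross_validation, tasks)
    pool.close()
    pool.join()
    return max(results, key=lambda r: r[0])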