def rf():
    """
    Submission: rf_0708_01.csv

    3000 trees
    E_val: 0.871837
    E_in: 0.999998
    E_out: 0.882316801296279

    15000 trees
    E_val: 0.872011
    E_in: 0.999998
    E_out: 0.8824869811781106

    30000 trees
    E_val: 0.871928
    E_in:
    E_out:

    depth=4; 12000 trees
    E_val: 0.969158
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    X, y = dataset.load_train(depth=1)

    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    del X
    import gc
    gc.collect()

    rf = RandomForestClassifier(n_estimators=12000, oob_score=True, n_jobs=-1,
                                class_weight='auto')
    rf.fit(X_scaled, y)

    logger.debug('RandomForestClassifier fitted')
    logger.debug('E_val(oob): %f', rf.oob_score_)
    logger.debug('E_in(full): %f', Util.auc_score(rf, X_scaled, y))

    X, y = dataset.load_train()
    X_scaled = raw_scaler.transform(X)
    logger.debug('E_in (depth=0): %f', Util.auc_score(rf, X_scaled, y))

    del X
    gc.collect()

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('rf', rf)]),
                       'rf_0708_01')

    logger.debug('caching fitted RandomForestClassifier')
    IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.12000.pkl'))
    logger.debug('cached fitted RandomForestClassifier')
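# The snippets in this file repeatedly call a project helper `Util.auc_score`
# whose implementation is not shown. Below is a minimal sketch of the assumed
# behavior (an assumption, not the repo's actual code): score a fitted
# classifier by ROC AUC, preferring predict_proba over decision_function.
def _auc_score_sketch(clf, X, y):
    from sklearn.metrics import roc_auc_score
    if hasattr(clf, 'predict_proba'):
        scores = clf.predict_proba(X)[:, 1]   # probability of the positive class
    else:
        scores = clf.decision_function(X)     # margin-based fallback
    return roc_auc_score(y, scores)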
def run(args: Namespace):
    """
    Entry point that performs the actual task: train on the selected set.

    Args:
        args: program arguments
    """
    Trainer(args, dataset.load_train(args.train)).run()
def svc_test2():
    """
    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.calibration import CalibratedClassifierCV

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = SVC(kernel='linear', class_weight='auto', cache_size=10240)
    svc.fit(X_scaled, y)

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_scaled, y)

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', Util.auc_score(isotonic, X_scaled, y))
def q20():
    def cross_validate(X, y, lamda):
        Xs = X[:40], X[40:80], X[80:120], X[120:160], X[160:]
        ys = y[:40], y[40:80], y[80:120], y[120:160], y[160:]
        reg = ridge.RidgeRegression(lamda)
        e_cv = 0.0
        for i in range(5):
            X_val, y_val = Xs[i], ys[i]
            X_train = np.concatenate([Xs[j] for j in range(5) if i != j])
            y_train = np.concatenate([ys[j] for j in range(5) if i != j])
            reg.fit(X_train, y_train)
            e_cv += reg.evaluate(X_val, y_val, sign)
        return e_cv / 5

    X_train, y_train = load_train()
    X_test, y_test = load_test()

    lamdas = np.logspace(-10, 2, 13)
    best = (1.0, None)
    for lamda in lamdas:
        e_cv = cross_validate(X_train, y_train, lamda)
        if e_cv <= best[0]:
            best = (e_cv, lamda)

    best_reg = ridge.RidgeRegression(best[1])
    best_reg.fit(X_train, y_train)
    print "lamda: %e, E_in: %.3f, E_out: %.3f" % (
        best[1],
        best_reg.evaluate(X_train, y_train, sign),
        best_reg.evaluate(X_test, y_test, sign))
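# q15/q17/q20 and test() rely on a `ridge.RidgeRegression` class and a `sign`
# function defined elsewhere. The sketch below shows the assumed behavior
# (closed-form ridge regression with a bias column, evaluated by 0/1 error on
# the sign of the prediction); the class name and details are assumptions, not
# the actual implementation used by these exercises.
import numpy as np

def sign(v):
    return np.where(v >= 0, 1, -1)

class RidgeRegressionSketch(object):
    def __init__(self, lamda):
        self.lamda = lamda
        self.w = None

    def fit(self, X, y):
        Z = np.c_[np.ones(len(X)), X]                      # prepend bias column
        A = Z.T.dot(Z) + self.lamda * np.eye(Z.shape[1])   # Z'Z + lambda * I
        self.w = np.linalg.solve(A, Z.T.dot(y))

    def evaluate(self, X, y, transform):
        Z = np.c_[np.ones(len(X)), X]
        return np.mean(transform(Z.dot(self.w)) != y)      # 0/1 error rate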
def bagging_lr():
    """
    Submission: bagging_lr_0707_02.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import BaggingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    bag = BaggingClassifier(LogisticRegression(class_weight='auto'),
                            n_estimators=3000, oob_score=True, n_jobs=-1,
                            verbose=2)
    bag.fit(X_scaled, y)  # must be fitted before oob_score_ is available

    logger.debug('E_val (oob): %f', bag.oob_score_)
    logger.debug('E_in: %f', Util.auc_score(bag, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('bag', bag)]),
                       'bagging_lr_0707_02')
def lr_with_scale3():
    """
    Check the performance of normalizing TEST SET.

    Submission: lr_with_scale3_0707_04.csv
    E_val:
    E_in: 0.879233
    E_out: 0.8770121701777971

    Submission: lr_with_scale3_0712_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegression(C=0.03, class_weight='auto')
    clf.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                       'lr_with_scale3_0712_01')

    scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
    logger.debug('E_val: %f <- %s', np.average(scores), scores)
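# `IO.dump_submission` is another project helper not shown in these snippets.
# The sketch below is only a rough guess at its behavior (score the held-out
# test set with the fitted pipeline and write one probability per row under the
# given submission name); the CSV layout, id column and output directory are
# assumptions, not the repo's actual implementation.
def _dump_submission_sketch(clf, name, out_dir='submissions'):
    import os
    import numpy as np
    X_test = dataset.load_test()
    prob = clf.predict_proba(X_test)[:, 1]        # positive-class probability
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    path = os.path.join(out_dir, name + '.csv')
    # one "row_index,probability" line per test example
    np.savetxt(path, np.c_[np.arange(len(prob)), prob], fmt='%d,%.10f')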
def trainSVM(train_path, classes, image_size=32, model_file='model.pkl',
             download_flag=False):
    print('Training with HOG feature and SVM')
    if download_flag:
        print('Downloading images')
        for i in classes:
            iurl.downloadImage(i + '.txt', train_path + '/' + i)

    X, Y, ids, cls, testX, testY, ids, cls, num = dataset.load_train(
        train_path, image_size, classes, 5, True)
    print("Number of training data: {}".format(num))

    # Shuffle the training data with a fixed seed for reproducibility.
    rand = np.random.RandomState(321)
    shuffle = rand.permutation(len(X))
    X = X[shuffle]
    Y = Y[shuffle]

    clf = svm.SVC()
    clf.fit(X, Y)

    result = clf.predict(testX)
    mask = result == testY
    correct = np.count_nonzero(mask)
    print("Accuracy: {}".format(correct * 100.0 / len(result)))

    joblib.dump(clf, model_file)
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
def __init__(self, train_path, image_size, classes, batch_size):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.num_classes = len(classes)
        self.image_size = image_size

        images, labels, _, _ = load_train(train_path, image_size, classes)
        self.X, self.labels = get_data_batch(images, labels, batch_size,
                                             num_threads=8)
        self.Y = tf.one_hot(self.labels, depth=self.num_classes, axis=1,
                            dtype=tf.float32)
        self.batch_size = batch_size
        # self.X, self.labels = get_batch_data('mnist', batch_size, 4)
        # self.Y = tf.one_hot(self.labels, depth=10, axis=1, dtype=tf.float32)

        self.build_arch()
        self.loss()
        self.model_summary()

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.optimizer = tf.train.AdamOptimizer()
        self.train_op = self.optimizer.minimize(
            self.total_loss, global_step=self.global_step)
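# `get_data_batch` used above is a helper that is not included here. A minimal
# sketch of the assumed behavior using the TF1 queue-based input pipeline
# (shuffled mini-batches drawn from in-memory arrays); the interface is a
# guess, not the repo's actual implementation.
def _get_data_batch_sketch(images, labels, batch_size, num_threads=8):
    import tensorflow as tf
    # slice the arrays into per-example tensors and shuffle them
    img, lbl = tf.train.slice_input_producer([images, labels], shuffle=True)
    # assemble shuffled mini-batches with several reader threads
    return tf.train.shuffle_batch([img, lbl], batch_size=batch_size,
                                  capacity=batch_size * 64,
                                  min_after_dequeue=batch_size * 8,
                                  num_threads=num_threads)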
def ada_boost_dt():
    """
    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    # No `scoring` is given, so cross_val_score uses the estimator's default
    # scorer (accuracy), unlike the AUC reported as E_in below.
    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)
    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('ab', ab)]),
                       'ada_boost_dt_0707_03')
def trainCNN(train_path, classes, image_size=32, model_file='model.pkl',
             download_flag=False):
    if download_flag:
        for i in classes:
            iurl.downloadImage(i + '.txt', train_path + '/' + i)
        # iurl.downloadImage(catsUrlFile, train_path + '/cats')
        # iurl.downloadImage(dogsUrlFile, train_path + '/dogs')

    X, Y, ids, cls, testX, testY, ids, cls, num = dataset.load_train(
        train_path, image_size, classes, 5)
    # testX, testY, ids, cls = dataset.load_train(test_path, image_size, classes)
    print('Number of test data:{}'.format(num))

    X = X.reshape([-1, image_size, image_size, 1])
    testX = testX.reshape([-1, image_size, image_size, 1])

    # Building convolutional network
    network = input_data(shape=[None, image_size, image_size, 1], name='input')
    network = conv_2d(network, 16, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = fully_connected(network, 32, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 64, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, len(classes), activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    print("start")
    model.fit({'input': X}, {'target': Y}, n_epoch=20, shuffle=True,
              batch_size=100,
              validation_set=({'input': testX}, {'target': testY}),
              snapshot_step=100, show_metric=True, run_id='convnet_mnist')
    print(model.predict(testX))
    model.save(model_file)
def rf2():
    """
    Submission: rf2_0704_04.csv

    3000 trees
    E_val: 0.871431
    E_in: 0.999998
    E_out:

    30000 trees
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rf = RandomForestClassifier(n_estimators=30000, oob_score=True, n_jobs=-1,
                                class_weight='auto', max_features='log2')
    rf.fit(X_scaled, y)

    logger.debug('Eval(oob): %f', rf.oob_score_)
    logger.debug('Ein: %f', Util.auc_score(rf, X_scaled, y))

    IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.log2.pkl'))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('rf', rf)]),
                       'rf2_0704_04')
def lr_with_scale2():
    """
    Submission: lr_with_scale2_0704_03.csv
    E_val:
    E_in: 0.878996
    E_out: 0.8768131004917349
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(Cs=50, cv=5, scoring='roc_auc', n_jobs=-1,
                               class_weight='auto')
    clf.fit(X_scaled, y)

    logger.debug('Best C: %f', clf.C_[0])
    logger.debug('Cs: %s', clf.Cs_)
    logger.debug('Grid scores: %s', clf.scores_)  # scores_ is a dict, not a float
    logger.debug('Ein: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                       'lr_with_scale2_0704_03')
def gbdt():
    """
    Submission: gbdt_0708_02.csv

    n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.858235
    E_in: 0.908622
    E_out: 0.8873906795559863

    n_estimators: 500, learning_rate: 0.1, subsample: 0.5
    E_val: 0.870976
    E_in: 0.899593
    E_out: 0.88711101837711

    n_estimators: 3000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.836049
    E_in: 0.936056
    E_out: 0.8833930861722906

    depth=4; n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.947301
    E_in: 0.983812 (on depth=4) // 0.85089646325496504 (on depth=0)
    E_out: 0.8855316272153549
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import numpy as np

    gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,
                                    subsample=0.5)

    d = 0
    X, y = dataset.load_train(depth=d)

    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    gb.fit(X_scaled, y)

    IO.cache(gb, Path.of_cache('gbdt.GradientBoostingClassifier.d%d.pkl' % d))
    IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', gb)]),
                       'gbdt_0708_02.1000.d%d' % d)

    logger.debug('E_in(full): %f', Util.auc_score(gb, X_scaled, y))

    X, y = dataset.load_train()
    X_scaled = raw_scaler.transform(X)
    logger.debug('E_in(depth=0): %f', Util.auc_score(gb, X_scaled, y))
def test():
    X_train, y_train = load_train()
    X_test, y_test = load_test()

    lamda = 0
    reg = ridge.RidgeRegression(lamda)
    reg.fit(X_train, y_train)

    e_in = reg.evaluate(X_train, y_train, sign)
    e_out = reg.evaluate(X_test, y_test, sign)
    print "E_in: %.3f, E_out: %.3f" % (e_in, e_out)
def lr_with_fs():
    """
    Submission: lr_with_fs_0703_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    pkl_path = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(pkl_path)
    if rfe is None:
        rfe = RFECV(estimator=LogisticRegression(class_weight='auto'),
                    cv=StratifiedKFold(y, 5), scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, pkl_path)

    print("Optimal number of features : %d" % rfe.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (AUC)")
    pl.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
    pl.savefig('lr_with_fs.refcv')

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('rfe', rfe),
                                 ('scale_new', new_scaler), ('lr', clf)]),
                       'lr_with_fs_0703_01')
def gbdt_search():
    """
    Grid search for best n_estimators.

    Best params: {'loss': 'deviance', 'n_estimators': 100}

    Submission: gbdt_search_0707_01.csv
    E_val: 0.883786743214
    E_in: 0.887785
    E_out: 0.8848760405053878
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    param_grid = {
        'loss': ['deviance', 'exponential'],
        'n_estimators': np.arange(100, 1001, 100)
    }
    params = {'learning_rate': 0.1, 'subsample': 0.5}

    gb = GradientBoostingClassifier(**params)
    grid = GridSearchCV(gb, param_grid, scoring='roc_auc', n_jobs=-1,
                        cv=StratifiedKFold(y, 5), refit=True, verbose=1)
    grid.fit(X_scaled, y)

    logger.debug('Got best GBDT.')
    logger.debug('Grid scores: ')
    for i, grid_score in enumerate(grid.grid_scores_):
        print('\t%d00: %s' % (i + 1, grid_score))
    logger.debug('Best score (E_val): %s', grid.best_score_)
    logger.debug('Best params: %s', grid.best_params_)

    IO.cache(grid, Path.of_cache('gbdt_search.GridSearchCV.pkl'))

    X_test = dataset.load_test()
    raw_scaler.fit(np.r_[X, X_test])
    X_scaled = raw_scaler.transform(X)

    params.update(grid.best_params_)
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_scaled, y)

    # NOTE: the score and submission below use `grid` (whose best estimator was
    # refit inside GridSearchCV on the original scaling), not the freshly
    # fitted `clf`.
    logger.debug('E_in: %f', Util.auc_score(grid, X_scaled, y))
    IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', grid)]),
                       'gbdt_search_0707_01')
def q15():
    X_train, y_train = load_train()
    X_test, y_test = load_test()

    lamdas = np.logspace(-10, 2, 13)
    best = (1.0, None)
    for lamda in lamdas:
        reg = ridge.RidgeRegression(lamda)
        reg.fit(X_train, y_train)
        e_out = reg.evaluate(X_test, y_test, sign)
        if e_out <= best[0]:
            best = (e_out, reg)

    best_reg = best[1]
    print "lamda: %e, E_in: %.3f, E_out: %.3f" % (
        best_reg.lamda,
        best_reg.evaluate(X_train, y_train, sign),
        best_reg.evaluate(X_test, y_test, sign))
def gbdt_grid():
    """
    Grid search for best params.

    Best params: {'learning_rate': 0.05, 'subsample': 0.3}

    Submission: gbdt_grid_0706_03.csv
    E_val: 0.860118290628
    E_in: 0.882949
    E_out: 0.8809314555068068
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    param_grid = {
        'learning_rate': [0.05, 0.1],
        'subsample': [0.3, 0.5, 0.7]
    }
    grid = GridSearchCV(GradientBoostingClassifier(n_estimators=3000),
                        param_grid, scoring='roc_auc', n_jobs=-1,
                        cv=StratifiedKFold(y, 5), refit=False, verbose=1)
    grid.fit(X_scaled, y)

    logger.debug('Got best GBDT.')
    logger.debug('Grid scores: %s', grid.grid_scores_)
    logger.debug('Best score (E_val): %s', grid.best_score_)
    logger.debug('Best params: %s', grid.best_params_)

    X_test = dataset.load_test()
    raw_scaler.fit(np.r_[X, X_test])
    X_scaled = raw_scaler.transform(X)

    clf = GradientBoostingClassifier(**grid.best_params_)
    clf.fit(X_scaled, y)

    IO.cache(grid, Path.of_cache('gbdt_grid.GridSearchCV.pkl'))

    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', clf)]),
                       'gbdt_grid_0706_03')
def svc():
    """
    Submission: svc_0703_04.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = SVC(kernel='linear', class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5),
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)
    IO.cache(rs, Path.of_cache('svc.RandomizedSearchCV.SVC.pkl'))

    svc = rs.best_estimator_
    IO.cache(svc, Path.of_cache('svc.SVC.pkl'))  # cache the best estimator itself

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_scaled, y)
    IO.cache(isotonic, Path.of_cache('svc.CalibratedClassifierCV.isotonic.pkl'))

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', Util.auc_score(isotonic, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('svc', isotonic)]),
                       'svc_0703_04')
def q17():
    X, y = load_train()
    X_train, X_val = X[:120], X[120:]
    y_train, y_val = y[:120], y[120:]
    X_test, y_test = load_test()

    lamdas = np.logspace(-10, 2, 13)
    best = (1.0, None)
    for lamda in lamdas:
        reg = ridge.RidgeRegression(lamda)
        reg.fit(X_train, y_train)
        e_val = reg.evaluate(X_val, y_val, sign)
        if e_val <= best[0]:
            best = (e_val, reg)

    best_reg = best[1]
    print "lamda: %e, E_train: %.3f, E_val: %.3f, E_out: %.3f" % (
        best_reg.lamda,
        best_reg.evaluate(X_train, y_train, sign),
        best_reg.evaluate(X_val, y_val, sign),
        best_reg.evaluate(X_test, y_test, sign))
def erf():
    """
    Submission: erf_0705_01.csv

    3000 trees
    E_val: 0.870800
    E_in: 0.999998
    E_out:

    15000 trees
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import ExtraTreesClassifier

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    del X

    rf = ExtraTreesClassifier(n_estimators=3000, oob_score=True, n_jobs=-1,
                              class_weight='auto', bootstrap=True)
    rf.fit(X_scaled, y)
    logger.debug('ExtraTreesClassifier fitted')

    import gc
    gc.collect()

    logger.debug('Eval(oob): %f', rf.oob_score_)
    logger.debug('Ein: %f', Util.auc_score(rf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('erf', rf)]),
                       'erf_0705_01')

    logger.debug('caching fitted ExtraTreesClassifier')
    IO.cache(rf, Path.of_cache('erf.ExtraTreesClassifier.auto.pkl'))
    logger.debug('cached fitted ExtraTreesClassifier')
def lr_with_fs1():
    """
    Submission: lr_with_fs1_0703_03.csv
    E_val:
    E_in: 0.876954
    E_out:
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    pkl_path = Path.of_cache('lr_with_fs1.LR.FS.pkl')
    lr = IO.fetch_cache(pkl_path)
    if lr is None:
        lr = LogisticRegression(class_weight='auto')
        lr.fit(X_scaled, y)
        IO.cache(lr, pkl_path)

    X_pruned = lr.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('fs', lr),
                                 ('scale_new', new_scaler), ('lr', clf)]),
                       'lr_with_fs1_0703_03')
def erf2():
    """
    Submission: erf2_0705_02.csv

    3000 trees
    E_val: [0.83766072, 0.89704662, 0.85299486, 0.8639041, 0.82955865]
    E_in: 1.000000
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.cross_validation import cross_val_score

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    del X
    import gc
    gc.collect()

    erf = ExtraTreesClassifier(n_estimators=3000, n_jobs=-1,
                               class_weight='auto')

    scores = cross_val_score(erf, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('Eval: %f', sum(scores) / len(scores))

    erf.fit(X_scaled, y)
    logger.debug('ExtraTreesClassifier fitted')
    logger.debug('Ein: %f', Util.auc_score(erf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('erf', erf)]),
                       'erf2_0705_02')

    logger.debug('caching fitted ExtraTreesClassifier')
    IO.cache(erf, Path.of_cache('erf2.ExtraTreesClassifier.auto.pkl'))
    logger.debug('cached fitted ExtraTreesClassifier')
def knn():
    """
    Submission: knn_0704_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    knn = KNeighborsClassifier()
    params = {
        'n_neighbors': np.arange(5, 51, 5),
        'weights': ['uniform', 'distance'],
        'leaf_size': np.arange(30, 201, 10)
    }
    grid = GridSearchCV(knn, params, scoring='roc_auc', n_jobs=-1,
                        cv=StratifiedKFold(y, 5))
    grid.fit(X_scaled, y)

    logger.debug('Got best kNN.')
    logger.debug('Grid scores: %s', grid.grid_scores_)
    logger.debug('Best score (E_val): %s', grid.best_score_)
    logger.debug('Best params: %s', grid.best_params_)

    IO.cache(grid, Path.of_cache('knn.GridSearchCV.KNeighborsClassifier.pkl'))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('knn', grid)]),
                       'knn_0704_01')
def lr_with_scale():
    """
    Submission: lr_with_scale_0703_01.csv
    E_val:
    E_in: 0.878883
    E_out: 0.8766589627938616
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_scaled, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                       'lr_with_scale_0703_01')
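# For reference, `LogisticRegressionCV.scores_` is a dict mapping the positive
# class label to an array of shape (n_folds, n_Cs); averaging over folds at the
# chosen C gives the cross-validated AUC, i.e. the E_val left blank in several
# docstrings above. A short sketch, assuming a binary problem labelled 0/1:
def _cv_auc_sketch(clf):
    fold_scores = clf.scores_[1]            # shape (n_folds, n_Cs)
    mean_scores = fold_scores.mean(axis=0)  # mean AUC for each candidate C
    return mean_scores.max()                # AUC at the best C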
# Convolutional Layer 2.
filter_size2 = 3
num_filters2 = 32

# Convolutional Layer 3.
filter_size3 = 3
num_filters3 = 64

# Fully-connected layer.
fc_size = 128  # Number of neurons in fully-connected layer.

# Number of color channels for the images: 3 channels for RGB.
num_channels = 3

# image dimensions (only squares for now)
img_size = 256

# Size of image when flattened to a single dimension
img_size_flat = img_size * img_size * num_channels

# Tuple with height and width of images used to reshape arrays.
img_shape = (img_size, img_size)

# class info
classes = ['Pool', 'NonPool']
num_classes = len(classes)

test_data = dataset.DataSet(*dataset.load_train(input_dir, img_size, classes))
print(test_data.images)
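# A minimal sketch (an assumption, not code from this repo) of how these
# constants are typically wired into TF1 placeholders; `x`, `x_image`,
# `y_true` and `y_true_cls` are hypothetical names used only for illustration.
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='x')
x_image = tf.reshape(x, [-1, img_size, img_size, num_channels])
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, axis=1)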
print "Zero-one classification loss", zero_one_loss(y_test_mlb, y_pred) print "Hamming loss", hamming_loss(y_test_mlb, y_pred) im = y_test_mlb + y_pred * 2 scipy.misc.imsave("predictions.png", im) if __name__ == "__main__": # Load data print "Loading labels" label_list = dataset.load_labels() print "Loading train set" X_train, y_train, filenames_train = dataset.load_train() print "Size of train set", len(X_train) multilabel_classifier(X_train, y_train) # Unload train set from memory del X_train, y_train, filenames_train print "Loading test set" X_test, y_test, filenames_test = dataset.load_test() print "Size of test set", len(X_test) predict(X_test, y_test) improve_predictions(use_infer_topology=True) # evaluate_multilabel(y_test, label_list, '../models/pred_ml_improved.pkl') evaluate_multilabel(y_test, label_list, "../models/pred_ml.pkl")
def gbdt_oob():
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import KFold, train_test_split
    import pylab as pl
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    raw_scaler.fit(X_train)
    X_train = raw_scaler.transform(X_train)
    X_test = raw_scaler.transform(X_test)

    n_estimators = 1000
    params = {'n_estimators': n_estimators, 'loss': 'deviance',
              'learning_rate': 0.1, 'subsample': 0.5}

    gb = GradientBoostingClassifier(**params)
    gb.fit(X_train, y_train)
    # IO.cache(gb, Path.of_cache('gbdt.GradientBoostingClassifier.pkl'))

    logger.debug('Eval: %f', Util.auc_score(gb, X_test, y_test))
    logger.debug('Ein: %f', Util.auc_score(gb, X_train, y_train))

    x = np.arange(n_estimators) + 1

    def heldout_score(clf, X_test, y_test):
        """Compute deviance scores on ``X_test`` and ``y_test``."""
        score = np.zeros((n_estimators,), dtype=np.float64)
        for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
            score[i] = clf.loss_(y_test, y_pred)
        return score

    def cv_estimate(n_folds=3):
        cv = KFold(n=X_train.shape[0], n_folds=n_folds)
        cv_clf = GradientBoostingClassifier(**params)
        val_scores = np.zeros((n_estimators,), dtype=np.float64)
        for train, test in cv:
            cv_clf.fit(X_train[train], y_train[train])
            val_scores += heldout_score(cv_clf, X_train[test], y_train[test])
        val_scores /= n_folds
        return val_scores

    cv_score = cv_estimate(3)
    test_score = heldout_score(gb, X_test, y_test)

    cumsum = -np.cumsum(gb.oob_improvement_)
    oob_best_iter = x[np.argmin(cumsum)]

    test_score -= test_score[0]
    test_best_iter = x[np.argmin(test_score)]

    cv_score -= cv_score[0]
    cv_best_iter = x[np.argmin(cv_score)]

    oob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))
    test_color = list(map(lambda x: x / 256.0, (127, 201, 127)))
    cv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))

    # IO.cache(cumsum, Path.of_cache('gbdt.cumsum.pkl'))
    # IO.cache(test_score, Path.of_cache('gbdt.test_score.pkl'))
    # IO.cache(cv_score, Path.of_cache('gbdt.cv_score.pkl'))

    pl.plot(x, cumsum, label='OOB loss', color=oob_color)
    pl.plot(x, test_score, label='Test loss', color=test_color)
    pl.plot(x, cv_score, label='CV loss', color=cv_color)
    pl.axvline(x=oob_best_iter, color=oob_color)
    pl.axvline(x=test_best_iter, color=test_color)
    pl.axvline(x=cv_best_iter, color=cv_color)

    xticks = pl.xticks()
    xticks_pos = np.array(xticks[0].tolist() +
                          [oob_best_iter, cv_best_iter, test_best_iter])
    xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +
                            ['OOB', 'CV', 'Test'])
    ind = np.argsort(xticks_pos)
    xticks_pos = xticks_pos[ind]
    xticks_label = xticks_label[ind]
    pl.xticks(xticks_pos, xticks_label)

    pl.legend(loc='upper right')
    pl.ylabel('normalized loss')
    pl.xlabel('number of iterations')

    pl.savefig('gbdt.oob')
def gbdt2():
    """
    Submission: gbdt2_0708_03.csv

    n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.852035
    E_in: 0.910251
    E_out: 0.8874428893001793

    n_estimators: 3000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.827988
    E_in: 0.938593
    E_out: 0.8844206314551558

    depth=4; n_estimators: 1000, learning_rate: 0.1, subsample: 0.5
    E_val: 0.941602
    E_in: 0.983938 (on depth=4) // 0.87209181108731892 (on depth=0)
    E_out: 0.8872206627768779

    depth=0:
    E_val:
    E_in: 0.909368 // 0.909368
    E_out: 0.8864839071529611

    depth=1:
    E_val:
    E_in: 0.956676 // 0.903537
    E_out: 0.8851856544683128

    depth=2:
    E_val:
    E_in: 0.971240 // 0.899843
    E_out:

    depth=3:
    E_val:
    E_in: 0.978190 // 0.896956
    E_out:
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import numpy as np

    gb = GradientBoostingClassifier(loss='exponential', n_estimators=1000,
                                    learning_rate=0.1, subsample=0.5)

    d = 3
    X, y = dataset.load_train(depth=d)

    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    gb.fit(X_scaled, y)

    IO.cache(gb, Path.of_cache('gbdt2.GradientBoostingClassifier.d%d.pkl' % d))
    IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', gb)]),
                       'gbdt2_0708_03.1000.d%d' % d)

    logger.debug('E_in(full): %f', Util.auc_score(gb, X_scaled, y))

    X, y = dataset.load_train()
    X_scaled = raw_scaler.transform(X)
    logger.debug('E_in(depth=0): %f', Util.auc_score(gb, X_scaled, y))
print "Zero-one classification loss", zero_one_loss(y_test_mlb, y_pred) print "Hamming loss", hamming_loss(y_test_mlb, y_pred) im = y_test_mlb + y_pred * 2 scipy.misc.imsave('predictions.png', im) if __name__ == '__main__': #Load data print "Loading labels" label_list = dataset.load_labels() print "Loading train set" X_train, y_train, filenames_train = dataset.load_train() print "Size of train set", len(X_train) multilabel_classifier(X_train, y_train) #Unload train set from memory del X_train, y_train, filenames_train print "Loading test set" X_test, y_test, filenames_test = dataset.load_test() print "Size of test set", len(X_test) predict(X_test, y_test) improve_predictions(use_infer_topology=True) #evaluate_multilabel(y_test, label_list, '../models/pred_ml_improved.pkl') evaluate_multilabel(y_test, label_list, '../models/pred_ml.pkl')