def main():
    # widen pd output for debugging
    pd.set_option('display.width', 1000)

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="Input file to parse (CSV)")
    parser.add_argument("--kaggle-test-set", help="Kaggle test set, preprocessed (CSV)")
    parser.add_argument("--export-test", help="Export baseline predictions to CSV", action="store_true")
    parser.add_argument("--baseline", help="Train a baseline model and report its error", action="store_true")
    parser.add_argument("--baseline-learning-curves", help="Build baseline learning curves with F1 score", action="store_true")
    parser.add_argument("--pca-learning-curves", help="Build learning curves based on PCA", action="store_true")
    parser.add_argument("--feature-importance-learning-curves", help="Build learning curves based on feature importance", action="store_true")
    parser.add_argument("--estimators-learning-curves", help="Build learning curves based on the number of trees", action="store_true")
    parser.add_argument("--grid-search", help="Grid search for the best model", action="store_true")
    parser.add_argument("--best-model-to-kaggle", help="Predict the Kaggle test set using the best model; the Kaggle test set must be provided", action="store_true")
    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit()

    input_file = args.input

    # set a random seed (np.random.seed must be called; assigning to it would silently disable seeding)
    np.random.seed(123)

    # load data
    df = pd.read_csv(input_file, sep=',')

    # split X and y
    y = df['Survived']
    X = df.drop('Survived', axis=1)

    # we will use this model for our analysis
    model = RandomForestClassifier(oob_score=True, random_state=123)

    # 1. Establish a baseline
    if args.baseline:
        # train a simple random forest model and report its scores
        model.fit(X, y)
        print("Out of bag score : %f" % model.oob_score_)
        print("Train score : %f" % model.score(X, y))

        # run on the Kaggle test set if provided
        if args.kaggle_test_set:
            print("Generating Kaggle baseline")
            kaggle_set = pd.read_csv(args.kaggle_test_set, sep=',')
            # store the passenger ids
            passengers_ids = pd.Series(kaggle_set['PassengerId'])
            kaggle_set.drop('PassengerId', axis=1, inplace=True)
            kaggle_pred = pd.Series(model.predict(kaggle_set), name='Survived')
            result = pd.concat([passengers_ids, kaggle_pred], axis=1)
            # save to csv
            result.to_csv(os.path.join('.', 'kaggle', 'baseline.csv'), sep=',', encoding='utf-8', index=False)

        # check feature importance
        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
        # store the feature list
        features_list = X.columns.values
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]

        # print the feature ranking (top 20 features at most)
        print("Feature ranking:")
        for f in range(min(20, X.shape[1])):
            print("%d. feature %s (%f)" % (f + 1, features_list[indices[f]], importances[indices[f]]))

    # 2. Baseline learning curves
    if args.baseline_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_sample_size(model, X, y, n_iter=10)
        plot = helpers.plot_learning_curves(title="Baseline Errors",
                                            train_scores=train_scores,
                                            test_scores=test_scores,
                                            with_variance=True,
                                            x_label="Observations")
        plot.savefig('./figures/Accuracy_baseline.png')

    # 3. PCA learning curves
    if args.pca_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_pca(model, X, y,
                                                               to_scale=['cabin_count', 'age', 'fare', 'family_size'])
        plot = helpers.plot_learning_curves(title="PCA and Error",
                                            train_scores=train_scores,
                                            test_scores=test_scores,
                                            with_variance=False,
                                            x_label="Variance (in %)")
        plot.savefig('./figures/Accuracy_PCA_learning_curves.png')

    # 4. Feature importance learning curves
    if args.feature_importance_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_feature_importance(model, X, y)
        plot = helpers.plot_learning_curves(title="Feature importance and Error",
                                            train_scores=train_scores,
                                            test_scores=test_scores,
                                            with_variance=False,
                                            x_label="Feature importance (in %)")
        plot.savefig('./figures/Accuracy_feature_importance_learning_curves.png')

    # 5. Number of trees learning curves
    if args.estimators_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_n_estimator(model, X, y)
        plot = helpers.plot_learning_curves(title="Number of trees and Error",
                                            train_scores=train_scores,
                                            test_scores=test_scores,
                                            with_variance=False,
                                            x_label="Number of trees")
        plot.savefig('./figures/Accuracy_n_estimator_learning_curves.png')

    # 6. Grid search parameters
    if args.grid_search:
        test_model = RandomForestClassifier(n_estimators=80, random_state=123)
        parameters = {'criterion': ['gini', 'entropy'],
                      'max_features': [.2, .5, .8, 'auto', None],
                      'max_depth': [3, 5, 10, 15, 20, None],
                      'min_samples_leaf': [1, 2, 5]}
        grid_search = GridSearchCV(test_model, parameters, verbose=1)
        grid_search.fit(X, y)

        # print a report for the 10 best parameter sets
        # (grid_scores_ is the pre-0.18 scikit-learn API; newer versions expose cv_results_ instead)
        # http://scikit-learn.org/stable/auto_examples/randomized_search.html
        top_scores = sorted(grid_search.grid_scores_, key=itemgetter(1), reverse=True)[:10]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print("")

        # plot learning curves for each of the top models
        for i, score in enumerate(top_scores):
            best_model = RandomForestClassifier(n_estimators=80,
                                                oob_score=True,
                                                random_state=123,
                                                criterion=score.parameters['criterion'],
                                                max_features=score.parameters['max_features'],
                                                max_depth=score.parameters['max_depth'],
                                                min_samples_leaf=score.parameters['min_samples_leaf'])
            train_scores, test_scores = helpers.rf_accuracy_by_sample_size(best_model, X, y, n_iter=10)
            plot = helpers.plot_learning_curves(title="Model " + str(i + 1) + " Errors",
                                                train_scores=train_scores,
                                                test_scores=test_scores,
                                                with_variance=True,
                                                x_label="Observations")
            plot.savefig('./figures/Model_' + str(i + 1) + '_Accuracy_baseline.png')

    # 7. Kaggle submission for the best model, as determined by the grid search
    if args.best_model_to_kaggle:
        # this is our best model
        best_model = RandomForestClassifier(n_estimators=80,
                                            max_features=None,
                                            criterion="entropy",
                                            max_depth=10,
                                            min_samples_leaf=1,
                                            random_state=123)
        best_model.fit(X, y)
        print("Generating Kaggle submission")
        kaggle_set = pd.read_csv(args.kaggle_test_set, sep=',')
        # store the passenger ids
        passengers_ids = pd.Series(kaggle_set['PassengerId'])
        kaggle_set.drop('PassengerId', axis=1, inplace=True)
        kaggle_pred = pd.Series(best_model.predict(kaggle_set), name='Survived')
        result = pd.concat([passengers_ids, kaggle_pred], axis=1)
        # save to csv
        result.to_csv(os.path.join('.', 'kaggle', 'best_model.csv'), sep=',', encoding='utf-8', index=False)
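
# The helpers module is not shown in this section. Below is a minimal sketch of
# what helpers.rf_accuracy_by_sample_size could look like, assuming it refits the
# model on train/test splits of increasing size and returns, for each size, the
# list of scores over n_iter reshuffles (the shape consumed by
# plot_learning_curves is an assumption; the actual helper may differ).
def rf_accuracy_by_sample_size(model, X, y, n_iter=10):
    # pre-0.18 scikit-learn API, matching the rest of these scripts
    from sklearn.cross_validation import train_test_split
    train_scores, test_scores = [], []
    for size in np.linspace(0.1, 0.9, 9):
        train_run, test_run = [], []
        for i in range(n_iter):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, train_size=size, random_state=i)
            model.fit(X_train, y_train)
            train_run.append(model.score(X_train, y_train))
            test_run.append(model.score(X_test, y_test))
        n_obs = int(size * len(X))
        train_scores.append((n_obs, train_run))
        test_scores.append((n_obs, test_run))
    return train_scores, test_scores
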
def main():
    args = get_args()
    print("args : ", args)

    # Fix seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(args.seed)
        random.seed(args.seed)
        warnings.warn(
            'You have chosen to seed training. '
            'This will turn on the CUDNN deterministic setting, '
            'which can slow down your training considerably! '
            'You may see unexpected behavior when restarting from checkpoints.')

    assert args.crop_size[0] <= args.train_size[0] and args.crop_size[1] <= args.train_size[1], \
        'Crop size must be <= image size.'

    # Create directories to store run files
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path + '/images')
    if not os.path.isdir(args.save_path + '/results_color_val'):
        os.makedirs(args.save_path + '/results_color_val')
        os.makedirs(args.save_path + '/results_color_test')

    Dataset = MiniCity

    dataloaders = get_dataloader(Dataset, args)
    criterion = get_lossfunc(Dataset, args)
    model = get_model(Dataset, args)
    print(model)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_init,
                                momentum=args.lr_momentum,
                                weight_decay=args.lr_weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)

    # Initialize metrics
    best_miou = 0.0
    metrics = {'train_loss': [],
               'train_acc': [],
               'val_acc': [],
               'val_loss': [],
               'miou': []}
    start_epoch = 0

    # Resume training from a checkpoint
    if args.weights:
        print('Resuming training from {}.'.format(args.weights))
        checkpoint = torch.load(args.weights)
        model.load_state_dict(checkpoint['model_state_dict'], strict=True)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        metrics = checkpoint['metrics']
        best_miou = checkpoint['best_miou']
        start_epoch = checkpoint['epoch'] + 1

    # Push model to GPU
    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()
        print('Model pushed to {} GPU(s), type {}.'.format(
            torch.cuda.device_count(), torch.cuda.get_device_name(0)))

    # No training, only run prediction on the test set
    if args.predict:
        checkpoint = torch.load(args.save_path + '/best_weights.pth.tar')
        model.load_state_dict(checkpoint['model_state_dict'], strict=True)
        print('Loaded model weights from {}'.format(args.save_path + '/best_weights.pth.tar'))
        # Create results directories
        if not os.path.isdir(args.save_path + '/results_val'):
            os.makedirs(args.save_path + '/results_val')
        if not os.path.isdir(args.save_path + '/results_test'):
            os.makedirs(args.save_path + '/results_test')
        predict(dataloaders['test'], model, Dataset.mask_colors,
                folder=args.save_path, mode='test', args=args)
        predict(dataloaders['val'], model, Dataset.mask_colors,
                folder=args.save_path, mode='val', args=args)
        return

    # Generate log file
    with open(args.save_path + '/log_epoch.csv', 'a') as epoch_log:
        epoch_log.write('epoch, train loss, val loss, train acc, val acc, miou\n')

    since = time.time()

    for epoch in range(start_epoch, args.epochs):
        # Train
        print('--- Training ---')
        train_loss, train_acc = train_epoch(dataloaders['train'], model,
                                            criterion, optimizer, scheduler,
                                            epoch, void=Dataset.voidClass, args=args)
        metrics['train_loss'].append(train_loss)
        metrics['train_acc'].append(train_acc)
        print('Epoch {} train loss: {:.4f}, acc: {:.4f}'.format(epoch, train_loss, train_acc))

        # Validate
        print('--- Validation ---')
        val_acc, val_loss, miou = validate_epoch(dataloaders['val'], model,
                                                 criterion, epoch,
                                                 Dataset.classLabels,
                                                 Dataset.validClasses,
                                                 void=Dataset.voidClass,
                                                 maskColors=Dataset.mask_colors,
                                                 folder=args.save_path, args=args)
        metrics['val_acc'].append(val_acc)
        metrics['val_loss'].append(val_loss)
        metrics['miou'].append(miou)

        # Write logs
        with open(args.save_path + '/log_epoch.csv', 'a') as epoch_log:
            epoch_log.write('{}, {:.5f}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n'.format(
                epoch, train_loss, val_loss, train_acc, val_acc, miou))

        # Save checkpoint
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'best_miou': best_miou,
                    'metrics': metrics,
                    }, args.save_path + '/checkpoint.pth.tar')

        # Save the best model to file
        if miou > best_miou:
            print('mIoU improved from {:.4f} to {:.4f}.'.format(best_miou, miou))
            best_miou = miou
            torch.save({'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        }, args.save_path + '/best_weights.pth.tar')

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    plot_learning_curves(metrics, args)

    # Load the best model
    checkpoint = torch.load(args.save_path + '/best_weights.pth.tar')
    model.load_state_dict(checkpoint['model_state_dict'], strict=True)
    print('Loaded best model weights (epoch {}) from {}/best_weights.pth.tar'.format(
        checkpoint['epoch'], args.save_path))

    # Create results directories
    if not os.path.isdir(args.save_path + '/results_val'):
        os.makedirs(args.save_path + '/results_val')
    if not os.path.isdir(args.save_path + '/results_test'):
        os.makedirs(args.save_path + '/results_test')

    # Run prediction on the validation set. To predict on the test set, simply replace 'val' with 'test'
    predict(dataloaders['val'], model, Dataset.mask_colors,
            folder=args.save_path, mode='val', args=args)
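
# plot_learning_curves is defined elsewhere in the project. A minimal sketch,
# assuming it simply plots the per-epoch series collected in the metrics dict
# with matplotlib and writes the figure next to the other run files; the real
# function may format the figure differently.
def plot_learning_curves(metrics, args):
    import matplotlib.pyplot as plt
    epochs = range(len(metrics['train_loss']))
    fig, (ax_loss, ax_score) = plt.subplots(1, 2, figsize=(12, 4))
    # left panel: losses
    ax_loss.plot(epochs, metrics['train_loss'], label='train loss')
    ax_loss.plot(epochs, metrics['val_loss'], label='val loss')
    ax_loss.set_xlabel('epoch')
    ax_loss.legend()
    # right panel: accuracy and mIoU
    ax_score.plot(epochs, metrics['train_acc'], label='train acc')
    ax_score.plot(epochs, metrics['val_acc'], label='val acc')
    ax_score.plot(epochs, metrics['miou'], label='val mIoU')
    ax_score.set_xlabel('epoch')
    ax_score.legend()
    fig.savefig(args.save_path + '/learning_curves.png')
    plt.close(fig)
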
def main():
    # widen pd output for debugging
    pd.set_option('display.width', 1000)

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="Input file to parse (CSV)")
    parser.add_argument("--kaggle-test-file", help="Kaggle test set, preprocessed (CSV)")
    parser.add_argument("--baseline", help="Train a baseline model and report its error", action="store_true")
    parser.add_argument("--baseline-learning-curves", help="Build baseline learning curves with F1 score", action="store_true")
    parser.add_argument("--export-test", help="Export baseline predictions to CSV", action="store_true")
    parser.add_argument("--grid-search", help="Execute a grid search on the SVM parameters", action="store_true")
    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit()

    input_file = args.input

    # set a random seed (np.random.seed must be called; assigning to it would silently disable seeding)
    np.random.seed(123)

    # load data
    df = pd.read_csv(input_file, sep=',')

    # split X and y
    y = df['Survived']
    X = df.drop('Survived', axis=1)

    # hold out a test set with 1/4 of the data
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=123)

    # scale the continuous features; the scaler is fitted on the training set only to avoid leakage
    scaler = StandardScaler()
    to_scale = ['age', 'cabin_count', 'family_size', 'fare']
    scaler.fit(X_train[to_scale])
    X_train.loc[:, to_scale] = scaler.transform(X_train[to_scale])
    X_test.loc[:, to_scale] = scaler.transform(X_test[to_scale])

    # we will use this model for our analysis
    model = svm.SVC(random_state=123, verbose=True)

    # 1. Establish a baseline
    if args.baseline:
        # train a simple SVM model and report its scores
        model.fit(X_train, y_train)
        y_pred = pd.Series(model.predict(X_test), name='Survived')
        print(model.score(X_test, y_test))
        print(classification_report(y_test, y_pred))

        # run on the Kaggle test set if provided
        if args.kaggle_test_file:
            kaggle_set = pd.read_csv(args.kaggle_test_file, sep=',')
            # store the passenger ids
            passengers_ids = pd.Series(kaggle_set['PassengerId'])
            kaggle_set.drop('PassengerId', axis=1, inplace=True)
            # scale with the scaler fitted on the training set
            kaggle_set.loc[:, to_scale] = scaler.transform(kaggle_set[to_scale])
            kaggle_pred = pd.Series(model.predict(kaggle_set), name='Survived')
            result = pd.concat([passengers_ids, kaggle_pred], axis=1)
            # save to csv
            result.to_csv(os.path.join('.', 'kaggle', 'baseline_svm.csv'), sep=',', encoding='utf-8', index=False)

        if args.export_test:
            # rename the prediction column
            y_pred.name = "Predicted"
            result = pd.concat([X_test.reset_index(drop=True),
                                y_test.reset_index(drop=True),
                                y_pred.reset_index(drop=True)], axis=1)
            # save to csv
            result.to_csv(os.path.join('.', 'predictions', 'baseline_svm_predictions.csv'), sep=',', encoding='utf-8', index=False)

    # 2. Baseline learning curves
    if args.baseline_learning_curves:
        model = svm.SVC(random_state=123, verbose=True, C=1.4, kernel='poly')
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=100, test_size=0.2, random_state=123)
        plot = helpers.plot_learning_curves_cv(model, X_train, y_train, cv=cv, n_jobs=4)
        plot.savefig('./figures/svm_cv_F1_baseline.png')

    # 3. Grid search parameters
    if args.grid_search:
        test_model = svm.SVC(random_state=123)
        parameters = {'C': np.linspace(.1, 2, 10),
                      'kernel': ['rbf', 'poly'],
                      'class_weight': ['auto', None]}
        grid_search = GridSearchCV(test_model, parameters, verbose=1)
        # note: the grid search runs its internal CV on the full, unscaled dataset
        grid_search.fit(X, y)

        # print a report for the 10 best parameter sets
        # (grid_scores_ is the pre-0.18 scikit-learn API; newer versions expose cv_results_ instead)
        # http://scikit-learn.org/stable/auto_examples/randomized_search.html
        top_scores = sorted(grid_search.grid_scores_, key=itemgetter(1), reverse=True)[:10]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print("")

        # plot learning curves for each of the top models
        for i, score in enumerate(top_scores):
            best_model = svm.SVC(C=score.parameters['C'],
                                 kernel=score.parameters['kernel'],
                                 class_weight=score.parameters['class_weight'],
                                 random_state=123)
            train_scores, test_scores = helpers.f1_scores_by_sample_size(best_model, X_train, y_train,
                                                                         X_test, y_test, n_iter=10)
            plot = helpers.plot_learning_curves(title="Model " + str(i + 1) + " Errors",
                                                train_scores=train_scores,
                                                test_scores=test_scores,
                                                with_variance=True,
                                                x_label="Observations")
            plot.savefig('./figures/Model_SVM_' + str(i + 1) + '_Accuracy_baseline.png')
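
# helpers.f1_scores_by_sample_size is not shown in this section. A minimal
# sketch, assuming it refits the model on growing random subsets of the
# training set, scores both splits with F1, and returns (size, scores) pairs
# for plot_learning_curves; the number of reshuffles per size and the exact
# return shape are assumptions, and the actual helper may differ.
def f1_scores_by_sample_size(model, X_train, y_train, X_test, y_test, n_iter=10):
    from sklearn.metrics import f1_score
    train_scores, test_scores = [], []
    for size in np.linspace(0.1, 1.0, n_iter):
        n = int(size * X_train.shape[0])
        train_run, test_run = [], []
        # reshuffle the subset a few times to estimate the variance
        for seed in range(5):
            idx = np.random.RandomState(seed).permutation(X_train.shape[0])[:n]
            model.fit(X_train.iloc[idx], y_train.iloc[idx])
            train_run.append(f1_score(y_train.iloc[idx], model.predict(X_train.iloc[idx])))
            test_run.append(f1_score(y_test, model.predict(X_test)))
        train_scores.append((n, train_run))
        test_scores.append((n, test_run))
    return train_scores, test_scores
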