def _export_kaggle_predictions(model, test_csv_path, out_path):
    """Predict the Kaggle test set with *model* and write a submission CSV.

    Reads the preprocessed test CSV, strips ``PassengerId`` before predicting,
    then writes ``PassengerId,Survived`` rows to *out_path*.
    """
    kaggle_set = pd.read_csv(test_csv_path, sep=',')
    # store the passengers Ids so they can be re-attached to the predictions
    passengers_ids = pd.Series(kaggle_set['PassengerId'])
    kaggle_set.drop('PassengerId', axis=1, inplace=True)
    kaggle_pred = pd.Series(model.predict(kaggle_set), name='Survived')
    result = pd.concat([passengers_ids, kaggle_pred], axis=1)
    # save to csv
    result.to_csv(out_path, sep=',', encoding='utf-8', index=False)


def main():
    """Titanic random-forest analysis driver.

    Parses command-line flags, loads the preprocessed training CSV given by
    ``--input``, and runs whichever analysis steps were requested: baseline
    fit + feature ranking, several learning-curve plots, a grid search, or a
    Kaggle submission from the best known model.
    """
    # widen pd output for debugging
    pd.set_option('display.width', 1000)

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="Input file to parse (CSV)")
    parser.add_argument("--kaggle-test-set", help="Kaggle test set preprocessed (CSV)")
    parser.add_argument("--export-test", help="Export baseline predictions to CSV", action="store_true")
    parser.add_argument("--baseline", help="Train a baseline model and report error", action="store_true")
    parser.add_argument("--baseline-learning-curves", help="Build baseline learning curves with f1 score", action="store_true")
    parser.add_argument("--pca-learning-curves", help="Build learning curves based on pca", action="store_true")
    parser.add_argument("--feature-importance-learning-curves", help="Build learning curves based on feature importance", action="store_true")
    parser.add_argument("--estimators-learning-curves", help="Build learning curves based on the number of trees", action="store_true")
    parser.add_argument("--grid-search", help="Grid search for best model", action="store_true")
    parser.add_argument("--best-model-to-kaggle", help="Predict Kaggle test set using best model. Kaggle test set must be provided", action="store_true")
    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit()
    input_file = args.input

    # set a random seed
    # BUG FIX: the original did `np.random.seed = 123`, which rebinds the
    # seed function to the int 123 and never seeds the RNG — call it instead.
    np.random.seed(123)

    # load data
    df = pd.read_csv(input_file, sep=',')

    # split X and y
    y = df['Survived']
    X = df.drop('Survived', axis=1)

    # we will use this model for our analysis
    model = RandomForestClassifier(oob_score=True, random_state=123)

    # 1. Establish a baseline
    if args.baseline:
        # train a simple random forest model and get the output
        model.fit(X, y)
        print("Out of bag error : %f " % (model.oob_score_))
        print("Train error : %f " % (model.score(X, y)))
        # run on the kaggle test set if provided
        if args.kaggle_test_set:
            print("Generating Kaggle baseline")
            _export_kaggle_predictions(
                model, args.kaggle_test_set,
                os.path.join('.', 'kaggle', 'baseline.csv'))

        # check feature importance.
        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

        # store the feature list
        features_list = X.columns.values

        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]

        # Print the feature ranking (top 20 at most)
        print("Feature ranking:")

        for f in range(min(20, X.shape[1])):
            print("%d. feature %s (%f)" % (f + 1, features_list[indices[f]], importances[indices[f]]))

    # 2. Baseline learning curves
    if args.baseline_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_sample_size(model, X, y, n_iter=10)
        plot = helpers.plot_learning_curves(title="Baseline Errors", train_scores=train_scores, test_scores=test_scores, with_variance=True, x_label="Observations")
        plot.savefig('./figures/Accuracy_baseline.png')

    # 3. PCA learning curves
    if args.pca_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_pca(model, X, y, to_scale=['cabin_count', 'age', 'fare', 'family_size'])
        plot = helpers.plot_learning_curves(title="PCA and Error", train_scores=train_scores, test_scores=test_scores, with_variance=False, x_label="Variance (in %)")
        plot.savefig('./figures/Accuracy_PCA_learning_curves.png')

    # 4. Feature importance learning curves
    if args.feature_importance_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_feature_importance(model, X, y)
        plot = helpers.plot_learning_curves(title="Feature importance and Error", train_scores=train_scores, test_scores=test_scores, with_variance=False, x_label="Feature importance (in %)")
        plot.savefig('./figures/Accuracy_feature_importance_learning_curves.png')

    # 5. Number of trees learning curves
    if args.estimators_learning_curves:
        train_scores, test_scores = helpers.rf_accuracy_by_n_estimator(model, X, y)
        plot = helpers.plot_learning_curves(title="Number of trees and Error", train_scores=train_scores, test_scores=test_scores, with_variance=False, x_label="Number of trees")
        plot.savefig('./figures/Accuracy_n_estimator_learning_curves.png')

    # 6. Grid search parameters
    if args.grid_search:
        test_model = RandomForestClassifier(n_estimators=80, random_state=123)
        parameters = {'criterion': ['gini', 'entropy'],
                      'max_features': [.2, .5, .8, 'auto', None],
                      'max_depth': [3, 5, 10, 15, 20, None],
                      'min_samples_leaf': [1, 2, 5]
                      }
        grid_search = GridSearchCV(test_model, parameters, verbose=1)
        grid_search.fit(X, y)
        # print report
        # http://scikit-learn.org/stable/auto_examples/randomized_search.html
        top_scores = sorted(grid_search.grid_scores_, key=itemgetter(1), reverse=True)[:10]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)
            ))
            print("Parameters: {0}".format(score.parameters))
            print("")

        # print learning curves for the best models found above
        for i, score in enumerate(top_scores):
            best_model = RandomForestClassifier(n_estimators=80,
                                                oob_score=True,
                                                random_state=123,
                                                criterion=score.parameters['criterion'],
                                                max_features=score.parameters['max_features'],
                                                max_depth=score.parameters['max_depth'],
                                                min_samples_leaf=score.parameters['min_samples_leaf']
                                                )
            train_scores, test_scores = helpers.rf_accuracy_by_sample_size(best_model, X, y, n_iter=10)
            plot = helpers.plot_learning_curves(title="Model " + str(i+1) + " Errors", train_scores=train_scores, test_scores=test_scores, with_variance=True, x_label="Observations")
            plot.savefig('./figures/Model_' + str(i + 1) + '_Accuracy_baseline.png')

    # 7. Kaggle submission for best model as determined by the grid search
    if args.best_model_to_kaggle:
        # this is our best model (hyper-parameters found via the grid search)
        best_model = RandomForestClassifier(n_estimators=80, max_features=None, criterion="entropy", max_depth=10, min_samples_leaf=1, random_state=123)
        best_model.fit(X, y)
        print("Generating Kaggle submission")
        _export_kaggle_predictions(
            best_model, args.kaggle_test_set,
            os.path.join('.', 'kaggle', 'best_model.csv'))
def main():
    """Train a semantic-segmentation model on the MiniCity dataset.

    Reads CLI options via ``get_args()``, then: seeds all RNGs when requested,
    builds dataloaders/loss/model, optionally resumes from a checkpoint,
    runs the train/validate loop with CSV logging and checkpointing, keeps
    the best weights by validation mIoU, and finally writes colorized
    predictions. With ``--predict`` it skips training and only runs inference
    from the saved best weights.
    """
    args = get_args()
    print("args : ", args)

    # Fix seed
    # BUG FIX: the original referenced an undefined name `random_seed`,
    # which raised NameError as soon as --seed was supplied; use args.seed.
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        # Deterministic cuDNN kernels trade speed for reproducibility.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(args.seed)
        random.seed(args.seed)
        warnings.warn(
            'You have chosen to seed training. '
            'This will turn on the CUDNN deterministic setting, '
            'which can slow down your training considerably! '
            'You may see unexpected behavior when restarting from checkpoints.'
        )

    assert args.crop_size[0] <= args.train_size[0] and args.crop_size[1] <= args.train_size[1], \
    'Must be Crop size <= Image Size.'

    # Create directory to store run files
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path + '/images')
    if not os.path.isdir(args.save_path + '/results_color_val'):
        os.makedirs(args.save_path + '/results_color_val')
        os.makedirs(args.save_path + '/results_color_test')

    Dataset = MiniCity

    dataloaders = get_dataloader(Dataset, args)
    criterion = get_lossfunc(Dataset, args)
    model = get_model(Dataset, args)

    print(model)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr_init,
                                momentum=args.lr_momentum,
                                weight_decay=args.lr_weight_decay)
    # Cosine annealing over the full run, one scheduler step per epoch.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     T_max=args.epochs)

    # Initialize metrics
    best_miou = 0.0
    metrics = {
        'train_loss': [],
        'train_acc': [],
        'val_acc': [],
        'val_loss': [],
        'miou': []
    }
    start_epoch = 0

    # Resume training from checkpoint (restores model, optimizer, metric
    # history, the best mIoU so far, and the epoch counter)
    if args.weights:
        print('Resuming training from {}.'.format(args.weights))
        checkpoint = torch.load(args.weights)
        model.load_state_dict(checkpoint['model_state_dict'], strict=True)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        metrics = checkpoint['metrics']
        best_miou = checkpoint['best_miou']
        start_epoch = checkpoint['epoch'] + 1

    # Push model to GPU
    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()
        print('Model pushed to {} GPU(s), type {}.'.format(
            torch.cuda.device_count(), torch.cuda.get_device_name(0)))

    # No training, only running prediction on test set
    if args.predict:
        checkpoint = torch.load(args.save_path + '/best_weights.pth.tar')
        model.load_state_dict(checkpoint['model_state_dict'], strict=True)
        print('Loaded model weights from {}'.format(args.save_path +
                                                    '/best_weights.pth.tar'))
        # Create results directory
        if not os.path.isdir(args.save_path + '/results_val'):
            os.makedirs(args.save_path + '/results_val')
        if not os.path.isdir(args.save_path + '/results_test'):
            os.makedirs(args.save_path + '/results_test')

        predict(dataloaders['test'],
                model,
                Dataset.mask_colors,
                folder=args.save_path,
                mode='test',
                args=args)
        predict(dataloaders['val'],
                model,
                Dataset.mask_colors,
                folder=args.save_path,
                mode='val',
                args=args)
        return

    # Generate log file (header row; per-epoch rows appended below)
    with open(args.save_path + '/log_epoch.csv', 'a') as epoch_log:
        epoch_log.write(
            'epoch, train loss, val loss, train acc, val acc, miou\n')

    since = time.time()

    for epoch in range(start_epoch, args.epochs):
        # Train
        print('--- Training ---')
        train_loss, train_acc = train_epoch(dataloaders['train'],
                                            model,
                                            criterion,
                                            optimizer,
                                            scheduler,
                                            epoch,
                                            void=Dataset.voidClass,
                                            args=args)
        metrics['train_loss'].append(train_loss)
        metrics['train_acc'].append(train_acc)
        print('Epoch {} train loss: {:.4f}, acc: {:.4f}'.format(
            epoch, train_loss, train_acc))

        # Validate
        print('--- Validation ---')
        val_acc, val_loss, miou = validate_epoch(
            dataloaders['val'],
            model,
            criterion,
            epoch,
            Dataset.classLabels,
            Dataset.validClasses,
            void=Dataset.voidClass,
            maskColors=Dataset.mask_colors,
            folder=args.save_path,
            args=args)
        metrics['val_acc'].append(val_acc)
        metrics['val_loss'].append(val_loss)
        metrics['miou'].append(miou)

        # Write logs
        with open(args.save_path + '/log_epoch.csv', 'a') as epoch_log:
            epoch_log.write(
                '{}, {:.5f}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n'.format(
                    epoch, train_loss, val_loss, train_acc, val_acc, miou))

        # Save checkpoint (always, so an interrupted run can resume)
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_miou': best_miou,
                'metrics': metrics,
            }, args.save_path + '/checkpoint.pth.tar')

        # Save best model to file (tracked by validation mIoU)
        if miou > best_miou:
            print('mIoU improved from {:.4f} to {:.4f}.'.format(
                best_miou, miou))
            best_miou = miou
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                }, args.save_path + '/best_weights.pth.tar')

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    plot_learning_curves(metrics, args)

    # Load best model
    checkpoint = torch.load(args.save_path + '/best_weights.pth.tar')
    model.load_state_dict(checkpoint['model_state_dict'], strict=True)
    print('Loaded best model weights (epoch {}) from {}/best_weights.pth.tar'.
          format(checkpoint['epoch'], args.save_path))

    # Create results directory
    if not os.path.isdir(args.save_path + '/results_val'):
        os.makedirs(args.save_path + '/results_val')

    if not os.path.isdir(args.save_path + '/results_test'):
        os.makedirs(args.save_path + '/results_test')

    # Run prediction on validation set. For predicting on test set, simple replace 'val' by 'test'
    predict(dataloaders['val'],
            model,
            Dataset.mask_colors,
            folder=args.save_path,
            mode='val',
            args=args)
# Esempio n. 3 (scraped snippet separator — commented out so the file parses)
# 0
def main():
    """Titanic SVM analysis driver.

    Parses command-line flags, loads the preprocessed training CSV given by
    ``--input``, splits off a 25% test set, scales the numeric features, and
    runs whichever steps were requested: baseline SVM fit + report (with
    optional Kaggle / test-set exports), cross-validated learning curves, or
    a grid search over SVM hyper-parameters.
    """
    # widen pd output for debugging
    pd.set_option('display.width', 1000)

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="Input file to parse (CSV)")
    parser.add_argument("--kaggle-test-file", help="Kaggle test set preprocessed (CSV)")
    parser.add_argument("--baseline", help="Train a baseline model and report error", action="store_true")
    parser.add_argument("--baseline-learning-curves", help="Build baseline learning curves with f1 score", action="store_true")
    parser.add_argument("--export-test", help="Export baseline predictions to CSV", action="store_true")
    parser.add_argument("--grid-search", help="Execute grid search on SVM parameters", action="store_true")
    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit()
    input_file = args.input

    # set a random seed
    # BUG FIX: the original did `np.random.seed = 123`, which rebinds the
    # seed function to the int 123 and never seeds the RNG — call it instead.
    np.random.seed(123)

    # load data
    df = pd.read_csv(input_file, sep=',')

    # split X and y
    y = df['Survived']
    X = df.drop('Survived', axis=1)

    # prepare a test set with 1/4 of the data
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=123)

    # scale features (fit on the training split only to avoid leaking
    # test-set statistics into the scaler)
    scaler = StandardScaler()
    to_scale = ['age', 'cabin_count', 'family_size', 'fare']
    scaler.fit(X_train[to_scale])
    X_train.loc[:, to_scale] = scaler.transform(X_train[to_scale])
    X_test.loc[:, to_scale] = scaler.transform(X_test[to_scale])

    # we will use this model for our analysis
    model = svm.SVC(random_state=123, verbose=True)

    # 1. Establish a baseline
    if args.baseline:
        # train a simple SVM model and get the scores
        model.fit(X_train, y_train)
        y_pred = pd.Series(model.predict(X_test), name='Survived')
        print(model.score(X_test, y_test))
        print(classification_report(y_test, y_pred))
        # run on the kaggle test set if provided
        if args.kaggle_test_file:
            kaggle_set = pd.read_csv(args.kaggle_test_file, sep=',')
            # store the passengers Ids
            passengers_ids = pd.Series(kaggle_set['PassengerId'])
            kaggle_set.drop('PassengerId', axis=1, inplace=True)
            # scale with the scaler fitted on the training split
            kaggle_set.loc[:, to_scale] = scaler.transform(kaggle_set[to_scale])
            kaggle_pred = pd.Series(model.predict(kaggle_set), name='Survived')
            result = pd.concat([passengers_ids, kaggle_pred], axis=1)
            # save to csv
            result.to_csv(os.path.join('.', 'kaggle', 'baseline_svm.csv'), sep=',', encoding='utf-8', index=False)

        if args.export_test:
            # change the name of the new column
            y_pred.name = "Predicted"
            # reset indices so the three pieces align row-by-row
            result = pd.concat([X_test.reset_index(drop=True),
                                y_test.reset_index(drop=True),
                                y_pred.reset_index(drop=True)], axis=1)
            # save to csv
            result.to_csv(os.path.join('.', 'predictions', 'baseline_svm_predictions.csv'), sep=',', encoding='utf-8', index=False)

    # 2. Baseline learning curves
    if args.baseline_learning_curves:
        model = svm.SVC(random_state=123, verbose=True, C=1.4, kernel='poly')
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=100, test_size=0.2, random_state=123)
        plot = helpers.plot_learning_curves_cv(model, X_train, y_train, cv=cv, n_jobs=4)
        plot.savefig('./figures/svm_cv_F1_baseline.png')

    # 3. Grid search parameters
    if args.grid_search:
        test_model = svm.SVC(random_state=123)
        parameters = {'C': np.linspace(.1, 2, 10),
                      'kernel': ['rbf', 'poly'],
                      'class_weight': ['auto', None]
                      }
        grid_search = GridSearchCV(test_model, parameters, verbose=1)
        grid_search.fit(X, y)
        # print report
        # http://scikit-learn.org/stable/auto_examples/randomized_search.html
        top_scores = sorted(grid_search.grid_scores_, key=itemgetter(1), reverse=True)[:10]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)
            ))
            print("Parameters: {0}".format(score.parameters))
            print("")

        # print learning curves for the best models found above
        for i, score in enumerate(top_scores):
            best_model = svm.SVC(C=score.parameters['C'],
                                 kernel=score.parameters['kernel'],
                                 class_weight=score.parameters['class_weight'],
                                 random_state=123,
                                 )
            train_scores, test_scores = helpers.f1_scores_by_sample_size(best_model, X_train, y_train, X_test, y_test, n_iter=10)
            plot = helpers.plot_learning_curves(title="Model " + str(i+1) + " Errors", train_scores=train_scores, test_scores=test_scores, with_variance=True, x_label="Observations")
            plot.savefig('./figures/Model_SVM_' + str(i + 1) + '_Accuracy_baseline.png')