def main():
    """Run a skopt forest-minimize hyperparameter sweep, logging to Neptune."""
    print('loading data')
    features_path = os.path.join(
        FEATURES_DATA_PATH, 'train_features_' + FEATURE_NAME + '.csv')

    print('... train')
    full_train = pd.read_csv(features_path, nrows=TRAINING_PARAMS['nrows'])

    # Hold out the trailing fraction of rows for validation.
    split_idx = int(
        (1 - VALIDATION_PARAMS['validation_fraction']) * len(full_train))
    train_df = full_train[:split_idx]
    valid_df = full_train[split_idx:]

    # Downsample the negative class in the training split only.
    train_df = sample_negative_class(
        train_df,
        fraction=TRAINING_PARAMS['negative_sample_fraction'],
        seed=TRAINING_PARAMS['negative_sample_seed'])

    @skopt.utils.use_named_args(SPACE)
    def objective(**params):
        # skopt minimizes, so return the negated validation AUC.
        preds = fit_predict(train_df,
                            valid_df,
                            None,
                            {**params, **STATIC_PARAMS},
                            TRAINING_PARAMS,
                            fine_tuning=True)
        return -1.0 * roc_auc_score(valid_df['isFraud'], preds)

    experiment_params = {
        **STATIC_PARAMS,
        **TRAINING_PARAMS,
        **HPO_PARAMS,
    }

    with neptune.create_experiment(name='skopt forest sweep',
                                   params=experiment_params,
                                   tags=['skopt', 'forest', 'tune'],
                                   upload_source_files=get_filepaths()):
        print('logging data version')
        log_data_version(features_path, prefix='train_features_')

        sweep = skopt.forest_minimize(objective,
                                      SPACE,
                                      callback=[sk_utils.NeptuneMonitor()],
                                      **HPO_PARAMS)

        # sweep.fun is the (negated) best objective; undo the sign flip.
        neptune.send_metric('valid_auc', -1.0 * sweep.fun)
        neptune.set_property('best_parameters', str(sweep.x))

        sk_utils.send_best_parameters(sweep)
        sk_utils.send_plot_convergence(sweep, channel_name='diagnostics_hpo')
        sk_utils.send_plot_evaluations(sweep, channel_name='diagnostics_hpo')
        sk_utils.send_plot_objective(sweep, channel_name='diagnostics_hpo')
# --- Example #2 (scrape artifact: "Exemple #2" heading and a "0" vote
# count from the source page; not part of the code) ---
def main():
    """Feature engineering (v1): clean email columns and ordinal-encode categoricals.

    Loads the raw train/test splits, cleans the email-domain columns, fits an
    ordinal encoder on train and applies it to both splits, then saves the
    feature CSVs and logs their data versions to Neptune.
    """
    print('started experiment')  # typo fix: was 'experimnent'
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        print('loading data')
        train = load_and_merge(RAW_DATA_PATH, 'train',
                               NROWS)[ID_COLS + V1_COLS + ['isFraud']]
        test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS]

        categorical_cols = set(V1_CAT_COLS)
        print('cleaning data')
        email_cols = ['P_emaildomain', 'R_emaildomain']
        train, new_email_cols = clean_email(train, email_cols)
        test, _ = clean_email(test, email_cols)

        # Swap the raw email columns for their cleaned counterparts.
        categorical_cols.update(new_email_cols)
        for col in email_cols:
            # discard() (vs remove()) does not raise if a raw email column
            # was never listed in V1_CAT_COLS.
            categorical_cols.discard(col)
        categorical_cols = list(categorical_cols)
        neptune.set_property('categorical_columns', str(categorical_cols))

        print('encoding categoricals')
        # Fit on train only so test receives the same category mapping.
        encoder = OrdinalEncoder(cols=categorical_cols).fit(
            train[ID_COLS + categorical_cols])
        train[ID_COLS + categorical_cols] = encoder.transform(
            train[ID_COLS + categorical_cols])
        test[ID_COLS + categorical_cols] = encoder.transform(
            test[ID_COLS + categorical_cols])

        train_features_path = os.path.join(
            FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
        print('saving train to {}'.format(train_features_path))
        # index=False (was index=None, which pandas treats as falsy anyway).
        train.to_csv(train_features_path, index=False)
        log_data_version(train_features_path, prefix='train_features_')

        test_features_path = os.path.join(
            FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME))
        print('saving test to {}'.format(test_features_path))
        test.to_csv(test_features_path, index=False)
        log_data_version(test_features_path, prefix='test_features_')
def main():
    """Feature engineering (v0): build features per split and persist them.

    For each of the train/test splits: load and merge the raw data, run the
    v0 feature pipeline, drop unwanted columns, save the CSV, and log its
    data version to Neptune.
    """
    print('started experiment')  # typo fix: was 'experimnent'
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        # Copy, so extend() below does not mutate the module-level
        # V0_CAT_COLS list (the original code aliased it and grew it on
        # every run).
        cols_to_drop = list(V0_CAT_COLS)
        for split_name in ['train', 'test']:
            print('processing {}'.format(split_name))
            data = load_and_merge(RAW_DATA_PATH, split_name, NROWS)
            features = feature_engineering_v0(data)
            # Accumulates across splits so columns dropped from train are
            # also dropped from test.
            cols_to_drop.extend(get_cols_to_drop(features))
            features = drop_existing_cols(features, cols_to_drop)
            features_path = os.path.join(
                FEATURES_DATA_PATH,
                '{}_features_{}.csv'.format(split_name, FEATURE_NAME))
            # index=False (was index=None, which pandas treats as falsy anyway).
            features.to_csv(features_path, index=False)
            log_data_version(features_path,
                             prefix='{}_features_'.format(split_name))
    def __init__(self,
                 project_name,
                 params,
                 train_model: modellib.MaskRCNN,
                 inference_model: modellib.MaskRCNN,
                 dataset: utils.Dataset,
                 dataset_limit=None,
                 verbose=1):
        """Initialize the parent callback and open a Neptune experiment.

        Args:
            project_name: Neptune project to log to; also reused as the
                experiment name passed to ``neptune.create_experiment``.
            params: dict of hyperparameters recorded with the experiment.
            train_model: Mask R-CNN model being trained.
            inference_model: Mask R-CNN model used for evaluation.
            dataset: dataset forwarded to the parent class.
            dataset_limit: optional cap forwarded to the parent class.
            verbose: verbosity level forwarded to the parent class.
        """
        super().__init__(train_model=train_model,
                         inference_model=inference_model,
                         dataset=dataset,
                         dataset_limit=dataset_limit,
                         verbose=verbose)

        neptune.init(project_name)
        neptune.create_experiment(
            project_name,
            params=params,
            upload_source_files=['detector.py', 'utils.py'])
        # NOTE(review): `args` is not a parameter — presumably a module-level
        # CLI namespace; confirm it is defined before this class is built.
        log_data_version(args.dataset)
        # Best-so-far tracking, updated as training progresses.
        self.best_epoch = 0
        self.best_mAP = 0
        self.best_model = None
# --- Example #5 (scrape artifact: "Exemple #5" heading and a "0" vote
# count). NOTE(review): the MODEL_PARAMS definition the script below
# depends on was truncated in extraction; only its tail survived:
#     'n_estimators': 1500}

# Load data
train = pd.read_csv(TRAIN_PATH, nrows=NROWS)
test = pd.read_csv(TEST_PATH, nrows=NROWS)

# Every column except the target serves as a model input.
feature_cols = [col for col in train.columns if col != 'isFraud']

X_train = train[feature_cols]
y_train = train['isFraud']
X_test = test[feature_cols]
y_test = test['isFraud']

# Start experiment
neptune.init(PROJECT_NAME)
neptune.create_experiment(name='lightGBM training',
                          params=MODEL_PARAMS,
                          upload_source_files=['train.py', 'environment.yaml'])
for data_path, version_prefix in ((TRAIN_PATH, 'train_'), (TEST_PATH, 'test_')):
    log_data_version(data_path, prefix=version_prefix)

# Train model
model = lightgbm.LGBMClassifier(**MODEL_PARAMS)
model.fit(X_train, y_train)

# Evaluate model
y_test_pred = model.predict_proba(X_test)

log_binary_classification_metrics(y_test, y_test_pred)
pickle_and_send_artifact((y_test, y_test_pred), 'test_predictions.pkl')

neptune.stop()
def main():
    """Train with K-fold CV, log metrics, and write prediction artifacts.

    Loads the feature CSVs, trains via ``fit_predict`` under K-fold
    cross-validation, logs train/valid AUC and a classification report to
    Neptune, then writes out-of-fold train predictions, test predictions,
    and a submission file, uploading each as an artifact.
    """
    print('loading data')
    train_features_path = os.path.join(
        FEATURES_DATA_PATH, 'train_features_' + FEATURE_NAME + '.csv')
    test_features_path = os.path.join(FEATURES_DATA_PATH,
                                      'test_features_' + FEATURE_NAME + '.csv')

    print('... train')
    train = pd.read_csv(train_features_path, nrows=TRAINING_PARAMS['nrows'])
    # Sort chronologically before dropping identifiers, so folds respect
    # transaction time.
    X = train.sort_values('TransactionDT').drop(
        ['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
    y = train.sort_values('TransactionDT')['isFraud']
    # Keep only identifier columns for joining predictions back later.
    train = train[["TransactionDT", 'TransactionID']]

    print('... test')
    test = pd.read_csv(test_features_path, nrows=TRAINING_PARAMS['nrows'])
    X_test = test.sort_values('TransactionDT').drop(
        ['TransactionDT', 'TransactionID'], axis=1)
    test = test[["TransactionDT", 'TransactionID']]

    # NOTE(review): random_state has no effect when shuffle=False, and
    # recent scikit-learn raises ValueError for this combination — either
    # drop random_state or pass shuffle=True (which changes the folds).
    folds = KFold(n_splits=VALIDATION_PARAMS['n_splits'],
                  random_state=VALIDATION_PARAMS['validation_seed'])

    hyperparams = {**MODEL_PARAMS, **TRAINING_PARAMS, **VALIDATION_PARAMS}

    print('starting experiment')
    with neptune.create_experiment(
            name='model training',
            params=hyperparams,
            upload_source_files=get_filepaths(),
            # Bug fix: the tag was 'features_'.format(FEATURE_NAME) — no
            # placeholder, so the feature name was silently dropped.
            tags=[MODEL_NAME, 'features_{}'.format(FEATURE_NAME), 'training']):
        print('logging data version')
        log_data_version(train_features_path, prefix='train_features_')
        log_data_version(test_features_path, prefix='test_features_')

        print('training')
        in_fold, out_of_fold, test_preds = fit_predict(X, y, X_test, folds,
                                                       MODEL_PARAMS,
                                                       TRAINING_PARAMS)

        print('logging metrics')
        train_auc = roc_auc_score(y, in_fold)
        valid_auc = roc_auc_score(y, out_of_fold)
        neptune.send_metric('train_auc', train_auc)
        neptune.send_metric('valid_auc', valid_auc)
        send_binary_classification_report(
            y,
            fmt_preds(out_of_fold),
            channel_name='valid_classification_report')

        print('postprocessing predictions')
        train_predictions_path = os.path.join(
            PREDICTION_DATA_PATH,
            'train_prediction_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        test_predictions_path = os.path.join(
            PREDICTION_DATA_PATH,
            'test_prediction_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        submission_path = os.path.join(
            PREDICTION_DATA_PATH,
            'submission_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

        # NOTE(review): out_of_fold/test_preds follow the TransactionDT-sorted
        # order of X/X_test while these identifier frames keep original CSV
        # order — confirm fit_predict restores the original row order.
        train = pd.concat(
            [train, pd.DataFrame(out_of_fold, columns=['prediction'])], axis=1)
        test = pd.concat(
            [test, pd.DataFrame(test_preds, columns=['prediction'])], axis=1)
        submission['isFraud'] = pd.merge(submission, test,
                                         on='TransactionID')['prediction']
        # index=False (was index=None, which pandas treats as falsy anyway).
        train.to_csv(train_predictions_path, index=False)
        test.to_csv(test_predictions_path, index=False)
        submission.to_csv(submission_path, index=False)
        neptune.send_artifact(train_predictions_path)
        neptune.send_artifact(test_predictions_path)
        neptune.send_artifact(submission_path)
        print('experiment finished')