def main():
    """Run a scikit-optimize forest hyper-parameter sweep logged to Neptune.

    Loads the training features, holds out a tail fraction for validation,
    negatively down-samples the training part, then minimizes the negated
    validation AUC over SPACE with skopt's forest_minimize. Metrics, best
    parameters and diagnostic plots are all sent to a Neptune experiment.
    """
    print('loading data')
    train_features_path = os.path.join(
        FEATURES_DATA_PATH, 'train_features_' + FEATURE_NAME + '.csv')
    print('... train')
    full_data = pd.read_csv(train_features_path,
                            nrows=TRAINING_PARAMS['nrows'])

    # The tail fraction of the frame is held out for validation.
    split_at = int(
        (1 - VALIDATION_PARAMS['validation_fraction']) * len(full_data))
    train_df = full_data[:split_at]
    valid_df = full_data[split_at:]
    train_df = sample_negative_class(
        train_df,
        fraction=TRAINING_PARAMS['negative_sample_fraction'],
        seed=TRAINING_PARAMS['negative_sample_seed'])

    @skopt.utils.use_named_args(SPACE)
    def objective(**params):
        # skopt minimizes, so the validation AUC is negated.
        model_params = {**params, **STATIC_PARAMS}
        valid_preds = fit_predict(train_df, valid_df, None, model_params,
                                  TRAINING_PARAMS, fine_tuning=True)
        return -1.0 * roc_auc_score(valid_df['isFraud'], valid_preds)

    experiment_params = {**STATIC_PARAMS, **TRAINING_PARAMS, **HPO_PARAMS}
    with neptune.create_experiment(name='skopt forest sweep',
                                   params=experiment_params,
                                   tags=['skopt', 'forest', 'tune'],
                                   upload_source_files=get_filepaths()):
        print('logging data version')
        log_data_version(train_features_path, prefix='train_features_')

        results = skopt.forest_minimize(objective, SPACE,
                                        callback=[sk_utils.NeptuneMonitor()],
                                        **HPO_PARAMS)

        # results.fun is the minimized (negated) AUC; results.x the best point.
        neptune.send_metric('valid_auc', -1.0 * results.fun)
        neptune.set_property('best_parameters', str(results.x))
        sk_utils.send_best_parameters(results)
        for send_plot in (sk_utils.send_plot_convergence,
                          sk_utils.send_plot_evaluations,
                          sk_utils.send_plot_objective):
            send_plot(results, channel_name='diagnostics_hpo')
def main():
    """Build the v1 feature set: clean e-mail columns, ordinal-encode categoricals.

    Reads the raw train/test splits, cleans the e-mail domain columns, fits an
    OrdinalEncoder on the train categoricals and applies it to both splits,
    then writes the feature CSVs and logs their data versions to Neptune.
    """
    print('started experiment')  # bug fix: message was misspelled 'experimnent'
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        print('loading data')
        train = load_and_merge(RAW_DATA_PATH, 'train',
                               NROWS)[ID_COLS + V1_COLS + ['isFraud']]
        test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS]

        # A fresh set so the module-level V1_CAT_COLS is never mutated below.
        categorical_cols = set(V1_CAT_COLS)

        print('cleaning data')
        email_cols = ['P_emaildomain', 'R_emaildomain']
        train, new_email_cols = clean_email(train, email_cols)
        test, _ = clean_email(test, email_cols)
        # The raw e-mail columns are replaced by their cleaned variants.
        categorical_cols.update(new_email_cols)
        for col in email_cols:
            categorical_cols.remove(col)
        categorical_cols = list(categorical_cols)
        neptune.set_property('categorical_columns', str(categorical_cols))

        print('encoding categoricals')
        # Fit on train only and reuse on test so category codes stay aligned.
        encoder = OrdinalEncoder(cols=categorical_cols).fit(
            train[ID_COLS + categorical_cols])
        train[ID_COLS + categorical_cols] = encoder.transform(
            train[ID_COLS + categorical_cols])
        test[ID_COLS + categorical_cols] = encoder.transform(
            test[ID_COLS + categorical_cols])

        train_features_path = os.path.join(
            FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
        print('saving train to {}'.format(train_features_path))
        train.to_csv(train_features_path, index=None)
        log_data_version(train_features_path, prefix='train_features_')

        test_features_path = os.path.join(
            FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME))
        print('saving test to {}'.format(test_features_path))
        test.to_csv(test_features_path, index=None)
        log_data_version(test_features_path, prefix='test_features_')
def main():
    """Build the v0 feature set for both splits and log artifacts to Neptune.

    For each of the train/test splits: load and merge the raw data, run the
    v0 feature engineering, drop unwanted columns, save the feature CSV and
    log its data version.
    """
    print('started experiment')  # bug fix: message was misspelled 'experimnent'
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        # Bug fix: the original aliased the module-level V0_CAT_COLS list and
        # then extend()-ed it, permanently mutating the shared constant. Copy
        # it instead so repeated runs see the pristine list.
        cols_to_drop = list(V0_CAT_COLS)
        for split_name in ['train', 'test']:
            print('processing {}'.format(split_name))
            data = load_and_merge(RAW_DATA_PATH, split_name, NROWS)
            features = feature_engineering_v0(data)
            # Drops found on earlier splits carry over to later ones, keeping
            # the train/test feature sets column-consistent.
            cols_to_drop.extend(get_cols_to_drop(features))
            features = drop_existing_cols(features, cols_to_drop)
            features_path = os.path.join(
                FEATURES_DATA_PATH,
                '{}_features_{}.csv'.format(split_name, FEATURE_NAME))
            features.to_csv(features_path, index=None)
            log_data_version(features_path,
                             prefix='{}_features_'.format(split_name))
def __init__(self, project_name, params, train_model: modellib.MaskRCNN,
             inference_model: modellib.MaskRCNN, dataset: utils.Dataset,
             dataset_limit=None, verbose=1):
    """Initialize the callback and open a Neptune experiment for this run.

    Args:
        project_name: Neptune project to initialize; also used as the
            experiment name below.
        params: hyper-parameter dict logged as the experiment's params.
        train_model: Mask R-CNN model instance used for training.
        inference_model: Mask R-CNN model instance used for evaluation.
        dataset: evaluation dataset forwarded to the base callback.
        dataset_limit: optional cap on evaluation samples (forwarded).
        verbose: verbosity flag (forwarded to the base class).
    """
    super().__init__(train_model=train_model, inference_model=inference_model,
                     dataset=dataset, dataset_limit=dataset_limit,
                     verbose=verbose)
    neptune.init(project_name)
    neptune.create_experiment(project_name, params=params,
                              upload_source_files=['detector.py', 'utils.py'])
    # NOTE(review): reads a module-level `args` (presumably an argparse
    # namespace) instead of a constructor parameter — confirm `args` is
    # defined in every context this class is instantiated from.
    log_data_version(args.dataset)
    # Best-checkpoint tracking state, updated as epochs get evaluated.
    self.best_epoch = 0
    self.best_mAP = 0
    self.best_model = None
'n_estimators': 1500}  # tail of MODEL_PARAMS — the dict's start is outside this chunk

# Load data
train = pd.read_csv(TRAIN_PATH, nrows=NROWS)
test = pd.read_csv(TEST_PATH, nrows=NROWS)
# Every column except the label is treated as a model feature.
feature_names = [col for col in train.columns if col not in ['isFraud']]
X_train, y_train = train[feature_names], train['isFraud']
X_test, y_test = test[feature_names], test['isFraud']

# Start experiment
neptune.init(PROJECT_NAME)
neptune.create_experiment(name='lightGBM training',
                          params=MODEL_PARAMS,
                          upload_source_files=['train.py', 'environment.yaml'])
log_data_version(TRAIN_PATH, prefix='train_')
log_data_version(TEST_PATH, prefix='test_')

# Train model
model = lightgbm.LGBMClassifier(**MODEL_PARAMS)
model.fit(X_train, y_train)

# Evaluate model
# predict_proba returns an (n_samples, 2) array; presumably the logging
# helpers take it as-is — TODO confirm they select the positive-class column.
y_test_pred = model.predict_proba(X_test)
log_binary_classification_metrics(y_test, y_test_pred)
pickle_and_send_artifact((y_test, y_test_pred), 'test_predictions.pkl')
neptune.stop()
def main():
    """Train with K-fold CV, log metrics/artifacts to Neptune, build submission.

    Loads the train/test feature CSVs, fits via `fit_predict` across KFold
    splits, logs train/validation AUCs and a classification report, then
    writes out-of-fold/test predictions plus a merged submission file and
    uploads all three as experiment artifacts.
    """
    print('loading data')
    train_features_path = os.path.join(
        FEATURES_DATA_PATH, 'train_features_' + FEATURE_NAME + '.csv')
    test_features_path = os.path.join(
        FEATURES_DATA_PATH, 'test_features_' + FEATURE_NAME + '.csv')

    print('... train')
    train = pd.read_csv(train_features_path, nrows=TRAINING_PARAMS['nrows'])
    # Features/target are sorted by transaction time; the id columns are kept
    # aside (in original CSV order) for attaching predictions later.
    X = train.sort_values('TransactionDT').drop(
        ['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
    y = train.sort_values('TransactionDT')['isFraud']
    train = train[["TransactionDT", 'TransactionID']]

    print('... test')
    test = pd.read_csv(test_features_path, nrows=TRAINING_PARAMS['nrows'])
    X_test = test.sort_values('TransactionDT').drop(
        ['TransactionDT', 'TransactionID'], axis=1)
    test = test[["TransactionDT", 'TransactionID']]

    # NOTE(review): random_state has no effect while shuffle=False, and recent
    # scikit-learn raises ValueError on this combination — either drop the
    # seed or pass shuffle=True (which changes fold composition). Confirm
    # intent before changing, since the data is time-ordered.
    folds = KFold(n_splits=VALIDATION_PARAMS['n_splits'],
                  random_state=VALIDATION_PARAMS['validation_seed'])

    hyperparams = {**MODEL_PARAMS, **TRAINING_PARAMS, **VALIDATION_PARAMS}

    print('starting experiment')
    with neptune.create_experiment(
            name='model training',
            params=hyperparams,
            upload_source_files=get_filepaths(),
            # Bug fix: 'features_'.format(FEATURE_NAME) had no '{}'
            # placeholder, so the feature name was silently dropped and the
            # tag was always the literal 'features_'.
            tags=[MODEL_NAME, 'features_{}'.format(FEATURE_NAME), 'training']):
        print('logging data version')
        log_data_version(train_features_path, prefix='train_features_')
        log_data_version(test_features_path, prefix='test_features_')

        print('training')
        in_fold, out_of_fold, test_preds = fit_predict(
            X, y, X_test, folds, MODEL_PARAMS, TRAINING_PARAMS)

        print('logging metrics')
        train_auc = roc_auc_score(y, in_fold)
        valid_auc = roc_auc_score(y, out_of_fold)
        neptune.send_metric('train_auc', train_auc)
        neptune.send_metric('valid_auc', valid_auc)
        send_binary_classification_report(
            y, fmt_preds(out_of_fold),
            channel_name='valid_classification_report')

        print('postprocessing predictions')
        train_predictions_path = os.path.join(
            PREDICTION_DATA_PATH,
            'train_prediction_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        test_predictions_path = os.path.join(
            PREDICTION_DATA_PATH,
            'test_prediction_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        submission_path = os.path.join(
            PREDICTION_DATA_PATH,
            'submission_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))

        submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
        # NOTE(review): the prediction arrays follow TransactionDT-sorted row
        # order, while `train`/`test` keep the CSV's order; this concat aligns
        # positionally, which is only correct if the CSVs are already sorted
        # by TransactionDT — confirm against the feature-building step.
        train = pd.concat(
            [train, pd.DataFrame(out_of_fold, columns=['prediction'])], axis=1)
        test = pd.concat(
            [test, pd.DataFrame(test_preds, columns=['prediction'])], axis=1)
        # Merge on TransactionID so submission rows keep the sample order.
        submission['isFraud'] = pd.merge(
            submission, test, on='TransactionID')['prediction']

        train.to_csv(train_predictions_path, index=None)
        test.to_csv(test_predictions_path, index=None)
        submission.to_csv(submission_path, index=None)
        neptune.send_artifact(train_predictions_path)
        neptune.send_artifact(test_predictions_path)
        neptune.send_artifact(submission_path)
        print('experiment finished')