def train_evaluate_cv():
    """Train and evaluate the model on every CV fold, logging per-fold and mean AUC.

    Reads the metadata file, keeps only training rows, splits them into
    ``PARAMS.n_cv_splits`` folds sorted by depth, and runs
    ``fold_fit_evaluate_loop`` on each fold inside a Neptune experiment.
    """
    meta = pd.read_csv(PARAMS.metadata_filepath)
    if DEV_MODE:
        # Work on a small random sample to keep dev iterations fast.
        meta = meta.sample(PARAMS.dev_mode_size, random_state=SEED)
    meta_train = meta[meta['is_train'] == 1]

    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=TAGS + ['train', 'evaluate', 'on_cv_folds'],
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        cv = utils.KFoldBySortedValue(n_splits=PARAMS.n_cv_splits,
                                      shuffle=PARAMS.shuffle,
                                      random_state=SEED)

        fold_auc = []
        for fold_id, (train_idx, valid_idx) in enumerate(
                cv.split(meta_train[DEPTH_COLUMN].values.reshape(-1))):
            train_data_split, valid_data_split = meta_train.iloc[train_idx], meta_train.iloc[valid_idx]

            if USE_AUXILIARY_DATA:
                auxiliary = pd.read_csv(PARAMS.auxiliary_metadata_filepath)
                # BUGFIX: exclude (~isin) auxiliary rows whose ids belong to the
                # current validation split; previously they were *selected*
                # (missing ~), leaking validation samples into the fold's
                # training data.
                train_auxiliary = auxiliary[~auxiliary[ID_COLUMN].isin(valid_data_split[ID_COLUMN].tolist())]
                train_data_split = pd.concat([train_data_split, train_auxiliary], axis=0)

            LOGGER.info('Started fold {}'.format(fold_id))
            auc, _ = fold_fit_evaluate_loop(train_data_split, valid_data_split, fold_id)
            LOGGER.info('Fold {} AUC {}'.format(fold_id, auc))
            neptune.send_metric('Fold {} AUC'.format(fold_id), auc)
            fold_auc.append(auc)

        auc_mean, auc_std = np.mean(fold_auc), np.std(fold_auc)
        log_scores(auc_mean, auc_std)
def evaluate_cv():
    """Evaluate a previously trained model on every CV fold.

    Logs per-fold IOU/IOUT to Neptune and the aggregated means/stds via
    ``log_scores``.
    """
    meta = pd.read_csv(PARAMS.metadata_filepath)
    if DEV_MODE:
        meta = meta.sample(PARAMS.dev_mode_size, random_state=SEED)
    meta_train = meta[meta['is_train'] == 1]

    experiment_tags = TAGS + ['evaluate', 'on_cv_folds']
    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=experiment_tags,
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        cv = utils.KFoldBySortedValue(n_splits=PARAMS.n_cv_splits,
                                      shuffle=PARAMS.shuffle,
                                      random_state=SEED)

        fold_iou, fold_iout = [], []
        depths = meta_train[DEPTH_COLUMN].values.reshape(-1)
        for fold_id, (train_idx, valid_idx) in enumerate(cv.split(depths)):
            valid_data_split = meta_train.iloc[valid_idx]

            LOGGER.info('Started fold {}'.format(fold_id))
            iou, iout, _ = fold_evaluate_loop(valid_data_split, fold_id)
            for metric_name, score in (('IOU', iou), ('IOUT', iout)):
                LOGGER.info('Fold {} {} {}'.format(fold_id, metric_name, score))
                neptune.send_metric('Fold {} {}'.format(fold_id, metric_name), score)
            fold_iou.append(iou)
            fold_iout.append(iout)

        iou_mean, iou_std = np.mean(fold_iou), np.std(fold_iou)
        iout_mean, iout_std = np.mean(fold_iout), np.std(fold_iout)
        log_scores(iou_mean, iou_std, iout_mean, iout_std)
def train():
    """Fit the network pipeline on the first CV fold's train/valid split.

    The CV splitter is only used to carve out a single validation split
    (the loop breaks after the first fold).
    """
    meta = pd.read_csv(PARAMS.metadata_filepath)
    meta_train = meta[meta['is_train'] == 1]

    cv = utils.KFoldBySortedValue(n_splits=PARAMS.n_cv_splits,
                                  shuffle=PARAMS.shuffle,
                                  random_state=SEED)
    for train_idx, valid_idx in cv.split(meta_train[DEPTH_COLUMN].values.reshape(-1)):
        break  # only the first fold is needed
    meta_train_split, meta_valid_split = meta_train.iloc[train_idx], meta_train.iloc[valid_idx]

    if USE_AUXILIARY_DATA:
        auxiliary = pd.read_csv(PARAMS.auxiliary_metadata_filepath)
        # BUGFIX: exclude (~isin) auxiliary rows tied to validation ids;
        # selecting them (as before) leaks validation data into training.
        train_auxiliary = auxiliary[~auxiliary[ID_COLUMN].isin(meta_valid_split[ID_COLUMN].tolist())]
        meta_train_split = pd.concat([meta_train_split, train_auxiliary], axis=0)

    if DEV_MODE:
        # Shrink both splits for quick dev runs; validation gets half the size.
        meta_train_split = meta_train_split.sample(PARAMS.dev_mode_size, random_state=SEED)
        meta_valid_split = meta_valid_split.sample(int(PARAMS.dev_mode_size / 2), random_state=SEED)

    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=TAGS + ['train'],
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        data = {'input': {'meta': meta_train_split},
                'callback_input': {'meta_valid': meta_valid_split}}

        pipeline_network = network(config=CONFIG, train_mode=True)
        pipeline_network.clean_cache()
        pipeline_network.fit_transform(data)
        pipeline_network.clean_cache()
def main():
    """Run a skopt ``forest_minimize`` hyper-parameter sweep and log it to Neptune."""
    print('loading data')
    train_features_path = os.path.join(
        FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
    print('... train')
    train = pd.read_csv(train_features_path, nrows=TRAINING_PARAMS['nrows'])

    # Hold out the trailing fraction of rows as a fixed validation set.
    cut = int((1 - VALIDATION_PARAMS['validation_fraction']) * len(train))
    valid = train[cut:]
    train = train[:cut]
    train = sample_negative_class(train,
                                  fraction=TRAINING_PARAMS['negative_sample_fraction'],
                                  seed=TRAINING_PARAMS['negative_sample_seed'])

    @skopt.utils.use_named_args(SPACE)
    def objective(**params):
        # skopt minimizes, so return the negated validation AUC.
        model_params = {**params, **STATIC_PARAMS}
        valid_preds = fit_predict(train, valid, None, model_params,
                                  TRAINING_PARAMS, fine_tuning=True)
        return -1.0 * roc_auc_score(valid['isFraud'], valid_preds)

    experiment_params = {**STATIC_PARAMS, **TRAINING_PARAMS, **HPO_PARAMS}
    with neptune.create_experiment(name='skopt forest sweep',
                                   params=experiment_params,
                                   tags=['skopt', 'forest', 'tune'],
                                   upload_source_files=get_filepaths()):
        print('logging data version')
        log_data_version(train_features_path, prefix='train_features_')

        results = skopt.forest_minimize(objective, SPACE,
                                        callback=[sk_utils.NeptuneMonitor()],
                                        **HPO_PARAMS)
        best_auc = -1.0 * results.fun
        neptune.send_metric('valid_auc', best_auc)
        neptune.set_property('best_parameters', str(results.x))
        sk_utils.send_best_parameters(results)
        for send_plot in (sk_utils.send_plot_convergence,
                          sk_utils.send_plot_evaluations,
                          sk_utils.send_plot_objective):
            send_plot(results, channel_name='diagnostics_hpo')
def train_evaluate_predict_cv():
    """Train, evaluate and predict on every CV fold, then save all predictions.

    Collects per-fold out-of-fold and test predictions, logs IOU/IOUT per fold
    and in aggregate, and persists everything via ``save_predictions``.
    """
    meta = pd.read_csv(PARAMS.metadata_filepath)
    if DEV_MODE:
        meta = meta.sample(PARAMS.dev_mode_size, random_state=SEED)
    meta_train = meta[meta['is_train'] == 1]
    meta_test = meta[meta['is_train'] == 0]

    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=TAGS + ['train', 'evaluate', 'predict', 'on_cv_folds'],
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        cv = utils.KFoldBySortedValue(n_splits=PARAMS.n_cv_splits,
                                      shuffle=PARAMS.shuffle,
                                      random_state=SEED)

        fold_iou, fold_iout = [], []
        out_of_fold_train_predictions, out_of_fold_test_predictions = [], []
        for fold_id, (train_idx, valid_idx) in enumerate(
                cv.split(meta_train[DEPTH_COLUMN].values.reshape(-1))):
            train_data_split, valid_data_split = meta_train.iloc[train_idx], meta_train.iloc[valid_idx]

            if USE_AUXILIARY_DATA:
                auxiliary = pd.read_csv(PARAMS.auxiliary_metadata_filepath)
                # BUGFIX: exclude (~isin) auxiliary rows belonging to the
                # current validation split; previously they were selected,
                # leaking validation samples into the fold's training data.
                train_auxiliary = auxiliary[~auxiliary[ID_COLUMN].isin(valid_data_split[ID_COLUMN].tolist())]
                train_data_split = pd.concat([train_data_split, train_auxiliary], axis=0)

            LOGGER.info('Started fold {}'.format(fold_id))
            iou, iout, out_of_fold_prediction, test_prediction = fold_fit_evaluate_predict_loop(
                train_data_split, valid_data_split, meta_test, fold_id)

            LOGGER.info('Fold {} IOU {}'.format(fold_id, iou))
            neptune.send_metric('Fold {} IOU'.format(fold_id), iou)
            LOGGER.info('Fold {} IOUT {}'.format(fold_id, iout))
            neptune.send_metric('Fold {} IOUT'.format(fold_id), iout)

            fold_iou.append(iou)
            fold_iout.append(iout)
            out_of_fold_train_predictions.append(out_of_fold_prediction)
            out_of_fold_test_predictions.append(test_prediction)

        # Flatten the per-fold (ids, predictions) pairs into flat lists.
        train_ids, train_predictions = [], []
        for idx_fold, train_pred_fold in out_of_fold_train_predictions:
            train_ids.extend(idx_fold)
            train_predictions.extend(train_pred_fold)

        iou_mean, iou_std = np.mean(fold_iou), np.std(fold_iou)
        iout_mean, iout_std = np.mean(fold_iout), np.std(fold_iout)
        log_scores(iou_mean, iou_std, iout_mean, iout_std)
        save_predictions(train_ids, train_predictions, meta_test, out_of_fold_test_predictions)
def evaluate():
    """Evaluate the trained network on the first CV fold's validation split.

    Runs the network + postprocessing pipelines, logs IOU/IOUT to Neptune and
    dumps ``(meta, y_true, y_pred)`` to ``validation_results.pkl``.
    """
    meta = pd.read_csv(PARAMS.metadata_filepath)
    meta_train = meta[meta['is_train'] == 1]

    cv = utils.KFoldBySortedValue(n_splits=PARAMS.n_cv_splits,
                                  shuffle=PARAMS.shuffle,
                                  random_state=SEED)
    for train_idx, valid_idx in cv.split(meta_train[DEPTH_COLUMN].values.reshape(-1)):
        break  # only the first fold's validation split is used
    meta_valid_split = meta_train.iloc[valid_idx]

    if DEV_MODE:
        meta_valid_split = meta_valid_split.sample(PARAMS.dev_mode_size, random_state=SEED)
    # BUGFIX: read ground-truth masks *after* the dev-mode subsampling above.
    # Previously they were read before it, so in DEV_MODE the ground truth
    # covered the full validation split while predictions covered only the
    # sample, corrupting the IOU/IOUT scores and the dumped results.
    y_true_valid = utils.read_masks(meta_valid_split[Y_COLUMN].values)

    data = {'input': {'meta': meta_valid_split},
            'callback_input': {'meta_valid': None}}

    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=TAGS + ['evaluate'],
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        pipeline_network = network(config=CONFIG, train_mode=False)
        pipeline_postprocessing = pipelines.mask_postprocessing(config=CONFIG)

        pipeline_network.clean_cache()
        output = pipeline_network.transform(data)
        valid_masks = {'input_masks': output}
        output = pipeline_postprocessing.transform(valid_masks)
        pipeline_network.clean_cache()
        pipeline_postprocessing.clean_cache()
        y_pred_valid = output['binarized_images']

        LOGGER.info('Calculating IOU and IOUT Scores')
        iou_score, iout_score = calculate_scores(y_true_valid, y_pred_valid)
        LOGGER.info('IOU score on validation is {}'.format(iou_score))
        neptune.send_metric('IOU', iou_score)
        LOGGER.info('IOUT score on validation is {}'.format(iout_score))
        neptune.send_metric('IOUT', iout_score)

        results_filepath = os.path.join(EXPERIMENT_DIR, 'validation_results.pkl')
        LOGGER.info('Saving validation results to {}'.format(results_filepath))
        joblib.dump((meta_valid_split, y_true_valid, y_pred_valid), results_filepath)
def main():
    """Build v1 features: load raw data, clean e-mail columns, ordinally encode
    categoricals and save versioned train/test feature CSVs, logging to Neptune."""
    print('started experiment')  # BUGFIX: fixed 'experimnent' typo
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        print('loading data')
        train = load_and_merge(RAW_DATA_PATH, 'train', NROWS)[ID_COLS + V1_COLS + ['isFraud']]
        test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS]

        categorical_cols = set(V1_CAT_COLS)

        print('cleaning data')
        email_cols = ['P_emaildomain', 'R_emaildomain']
        train, new_email_cols = clean_email(train, email_cols)
        test, _ = clean_email(test, email_cols)
        # The raw e-mail columns are replaced by their cleaned derivatives.
        categorical_cols.update(new_email_cols)
        for col in email_cols:
            categorical_cols.discard(col)  # discard: no KeyError if col absent
        # BUGFIX: sort for a deterministic column order; list(set) depends on
        # hash randomization, so the logged property and encoder fit order
        # varied between runs.
        categorical_cols = sorted(categorical_cols)
        neptune.set_property('categorical_columns', str(categorical_cols))

        print('encoding categoricals')
        encoder = OrdinalEncoder(cols=categorical_cols).fit(
            train[ID_COLS + categorical_cols])
        train[ID_COLS + categorical_cols] = encoder.transform(
            train[ID_COLS + categorical_cols])
        test[ID_COLS + categorical_cols] = encoder.transform(
            test[ID_COLS + categorical_cols])

        train_features_path = os.path.join(
            FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
        print('saving train to {}'.format(train_features_path))
        train.to_csv(train_features_path, index=None)
        log_data_version(train_features_path, prefix='train_features_')

        test_features_path = os.path.join(
            FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME))
        print('saving test to {}'.format(test_features_path))
        test.to_csv(test_features_path, index=None)
        log_data_version(test_features_path, prefix='test_features_')
def main():
    """Build v0 features for the train and test splits and save them as CSVs."""
    print('started experiment')  # BUGFIX: fixed 'experimnent' typo
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        # BUGFIX: copy the module-level list. The previous
        # `cols_to_drop = V0_CAT_COLS` aliased it, so extend() below
        # permanently mutated the V0_CAT_COLS constant as a side effect.
        cols_to_drop = list(V0_CAT_COLS)
        for split_name in ['train', 'test']:
            print('processing {}'.format(split_name))
            data = load_and_merge(RAW_DATA_PATH, split_name, NROWS)
            features = feature_engineering_v0(data)
            # The drop list accumulates across splits, so columns flagged on
            # train are also dropped from test (drop_existing_cols tolerates
            # columns that are not present).
            cols_to_drop.extend(get_cols_to_drop(features))
            features = drop_existing_cols(features, cols_to_drop)
            features_path = os.path.join(
                FEATURES_DATA_PATH,
                '{}_features_{}.csv'.format(split_name, FEATURE_NAME))
            features.to_csv(features_path, index=None)
            log_data_version(features_path, prefix='{}_features_'.format(split_name))
def evaluate_predict_cv():
    """Evaluate a trained model on each CV fold, gather out-of-fold and test
    predictions, log per-fold AUC, and persist all predictions."""
    meta = pd.read_csv(PARAMS.metadata_filepath)
    if DEV_MODE:
        meta = meta.sample(PARAMS.dev_mode_size, random_state=SEED)
    meta_train = meta[meta['is_train'] == 1]
    meta_test = meta[meta['is_train'] == 0]

    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=TAGS + ['evaluate', 'predict', 'on_cv_folds'],
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        cv = utils.KFoldBySortedValue(n_splits=PARAMS.n_cv_splits,
                                      shuffle=PARAMS.shuffle,
                                      random_state=SEED)

        fold_auc = []
        oof_train_preds, oof_test_preds = [], []
        depths = meta_train[DEPTH_COLUMN].values.reshape(-1)
        for fold_id, (train_idx, valid_idx) in enumerate(cv.split(depths)):
            valid_data_split = meta_train.iloc[valid_idx]

            LOGGER.info('Started fold {}'.format(fold_id))
            auc, oof_prediction, test_prediction = fold_evaluate_predict_loop(
                valid_data_split, meta_test, fold_id)
            LOGGER.info('Fold {} AUC {}'.format(fold_id, auc))
            neptune.send_metric('Fold {} AUC'.format(fold_id), auc)

            fold_auc.append(auc)
            oof_train_preds.append(oof_prediction)
            oof_test_preds.append(test_prediction)

        # Flatten the per-fold (ids, predictions) pairs into flat lists.
        train_ids, train_predictions = [], []
        for fold_ids, fold_preds in oof_train_preds:
            train_ids.extend(fold_ids)
            train_predictions.extend(fold_preds)

        auc_mean, auc_std = np.mean(fold_auc), np.std(fold_auc)
        log_scores(auc_mean, auc_std)
        save_predictions(train_ids, train_predictions, meta_test, oof_test_preds)
def predict():
    """Run inference on the test split and write a submission CSV."""
    meta = pd.read_csv(PARAMS.metadata_filepath)
    meta_test = meta[meta['is_train'] == 0]
    if DEV_MODE:
        meta_test = meta_test.sample(PARAMS.dev_mode_size, random_state=SEED)

    data = {'input': {'meta': meta_test},
            'callback_input': {'meta_valid': None}}

    with neptune.create_experiment(name=EXPERIMENT_NAME,
                                   params=PARAMS,
                                   tags=TAGS + ['predict'],
                                   upload_source_files=get_filepaths(),
                                   properties={'experiment_dir': EXPERIMENT_DIR}):
        pipeline_network = network(config=CONFIG, train_mode=False)
        pipeline_postprocessing = pipelines.mask_postprocessing(config=CONFIG)

        pipeline_network.clean_cache()
        predicted_masks = pipeline_network.transform(data)
        output = pipeline_postprocessing.transform({'input_masks': predicted_masks})
        pipeline_network.clean_cache()
        pipeline_postprocessing.clean_cache()
        y_pred_test = output['binarized_images']

        submission = utils.create_submission(meta_test, y_pred_test)
        submission_filepath = os.path.join(EXPERIMENT_DIR, 'submission.csv')
        submission.to_csv(submission_filepath, index=None, encoding='utf-8')
        LOGGER.info('submission saved to {}'.format(submission_filepath))
        LOGGER.info('submission head \n\n{}'.format(submission.head()))
obverter_loss=entangled_loss, vision_module=Vision.from_pretrained( 'visual_data/vision_model.pth')) for _ in range(2) ] game = ObverterGame(agents=agents, max_len=2, vocab_size=opts.vocab_size, loss=entangled_loss) optimizer = torch.optim.Adam([{ 'params': agent.parameters(), 'lr': opts.lr } for agent in agents]) neptune.init(project_qualified_name='anonymous/anonymous', backend=neptune.OfflineBackend()) with neptune.create_experiment(params=vars(opts), upload_source_files=get_filepaths(), tags=['']) as experiment: trainer = core.Trainer( game=game, optimizer=optimizer, train_data=train_loader, validation_data=test_loader, callbacks=[ CompositionalityMetricObverter(full_dataset, agents[0], opts, opts.vocab_size, prefix='1_'), CompositionalityMetricObverter(full_dataset, agents[1], opts,
embed_dim=opts.sender_embedding, hidden_size=opts.sender_hidden, max_len=opts.max_len, temperature=3., trainable_temperature=True, force_eos=False, cell=opts.rnn_cell) receiver = RnnReceiverGS( agent=Receiver(opts.receiver_hidden, opts.n_features), vocab_size=opts.vocab_size, embed_dim=opts.receiver_embedding, hidden_size=opts.receiver_hidden, cell=opts.rnn_cell) neptune.init('anonymous/anonymous', backend=neptune.OfflineBackend()) with neptune.create_experiment(params=vars(opts), upload_source_files=get_filepaths(), tags=['']) as experiment: # Pretraining game if opts.pretrain: pretraining_game = PretrainingmGameGS(pretrained_senders, receiver, padding=opts.padding) sender_params = [{'params': sender.parameters(), 'lr': opts.pretraining_sender_lr} for sender in pretrained_senders] receiver_params = [{'params': receiver.parameters(), 'lr': opts.pretraining_receiver_lr}] optimizer = torch.optim.Adam(sender_params + receiver_params) trainer = core.Trainer( game=pretraining_game, optimizer=optimizer, train_data=train_loader, validation_data=test_loader, callbacks=[ CompositionalityMetricGS(full_dataset, pretrained_senders[0], opts, opts.vocab_size, prefix='1_'), CompositionalityMetricGS(full_dataset, pretrained_senders[1], opts, opts.vocab_size, prefix='2_'), NeptuneMonitor(prefix='pretrain'),
def main():
    """Train a model on precomputed features, log AUC metrics to Neptune and
    write out-of-fold / test predictions plus a submission file."""
    print('loading data')
    train_features_path = os.path.join(
        FEATURES_DATA_PATH, 'train_features_' + FEATURE_NAME + '.csv')
    test_features_path = os.path.join(FEATURES_DATA_PATH,
                                      'test_features_' + FEATURE_NAME + '.csv')

    print('... train')
    train = pd.read_csv(train_features_path, nrows=TRAINING_PARAMS['nrows'])
    X = train.sort_values('TransactionDT').drop(
        ['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
    y = train.sort_values('TransactionDT')['isFraud']
    # NOTE(review): `train` keeps its original (unsorted) row order here while
    # X/y are sorted by TransactionDT; the positional concat with out_of_fold
    # below is only correct if the CSV is already sorted by TransactionDT —
    # confirm against the feature-building step.
    train = train[["TransactionDT", 'TransactionID']]

    print('... test')
    test = pd.read_csv(test_features_path, nrows=TRAINING_PARAMS['nrows'])
    X_test = test.sort_values('TransactionDT').drop(
        ['TransactionDT', 'TransactionID'], axis=1)
    test = test[["TransactionDT", 'TransactionID']]

    # BUGFIX: the original passed random_state without shuffle=True; the seed
    # is ignored by older scikit-learn and raises ValueError on >=0.24.
    # Splits are contiguous (unshuffled) either way, so behaviour is kept.
    folds = KFold(n_splits=VALIDATION_PARAMS['n_splits'], shuffle=False)

    hyperparams = {**MODEL_PARAMS, **TRAINING_PARAMS, **VALIDATION_PARAMS}
    print('starting experiment')
    with neptune.create_experiment(
            name='model training',
            params=hyperparams,
            upload_source_files=get_filepaths(),
            # BUGFIX: 'features_'.format(FEATURE_NAME) had no {} placeholder,
            # so the tag was always the literal 'features_'.
            tags=[MODEL_NAME, 'features_{}'.format(FEATURE_NAME), 'training']):
        print('logging data version')
        log_data_version(train_features_path, prefix='train_features_')
        log_data_version(test_features_path, prefix='test_features_')

        print('training')
        in_fold, out_of_fold, test_preds = fit_predict(
            X, y, X_test, folds, MODEL_PARAMS, TRAINING_PARAMS)

        print('logging metrics')
        train_auc, valid_auc = roc_auc_score(y, in_fold), roc_auc_score(y, out_of_fold)
        neptune.send_metric('train_auc', train_auc)
        neptune.send_metric('valid_auc', valid_auc)
        send_binary_classification_report(
            y, fmt_preds(out_of_fold), channel_name='valid_classification_report')

        print('postprocessing predictions')
        train_predictions_path = os.path.join(
            PREDICTION_DATA_PATH,
            'train_prediction_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        test_predictions_path = os.path.join(
            PREDICTION_DATA_PATH,
            'test_prediction_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))
        submission_path = os.path.join(
            PREDICTION_DATA_PATH,
            'submission_{}_{}.csv'.format(FEATURE_NAME, MODEL_NAME))

        submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
        train = pd.concat(
            [train, pd.DataFrame(out_of_fold, columns=['prediction'])], axis=1)
        test = pd.concat(
            [test, pd.DataFrame(test_preds, columns=['prediction'])], axis=1)
        # NOTE(review): this assumes pd.merge preserves the left (submission)
        # row order so predictions land on the right rows — verify.
        submission['isFraud'] = pd.merge(submission, test,
                                         on='TransactionID')['prediction']

        train.to_csv(train_predictions_path, index=None)
        test.to_csv(test_predictions_path, index=None)
        submission.to_csv(submission_path, index=None)
        neptune.send_artifact(train_predictions_path)
        neptune.send_artifact(test_predictions_path)
        neptune.send_artifact(submission_path)
        print('experiment finished')
trainable_temperature=True, force_eos=False, cell=opts.rnn_cell) receiver = RnnReceiverGS(agent=Receiver(opts.receiver_hidden, opts.n_features), vocab_size=opts.vocab_size, embed_dim=opts.receiver_embedding, hidden_size=opts.receiver_hidden, cell=opts.rnn_cell) neptune.init( project_qualified_name=opts.neptune_project or 'anonymous/anonymous', backend=neptune.OfflineBackend() if not opts.neptune_project else None) with neptune.create_experiment( params=vars(opts), upload_source_files=get_filepaths()) as experiment: # Pretraining game if not opts.no_transfer: if opts.reinforce: pretraining_game = PretrainingmGameReinforce( pretrained_senders, receiver) else: pretraining_game = PretrainingmGameGS(pretrained_senders, receiver) sender_params = [{ 'params': sender.parameters(), 'lr': opts.pretraining_sender_lr } for sender in pretrained_senders] receiver_params = [{ 'params': receiver.parameters(),