def _train(pipeline_name, dev_mode):
    """Fit the training pipeline on day/hour-chunked train data.

    Reads train and validation chunks, optionally subsamples them in dev
    mode, shuffles both splits, and runs ``fit_transform`` on the pipeline
    registered under ``pipeline_name``.
    """
    # Wipe stale experiment artifacts when overwriting is requested.
    if bool(params.overwrite) and os.path.isdir(params.experiment_dir):
        shutil.rmtree(params.experiment_dir)

    logger.info('reading data in')
    if dev_mode:
        TRAIN_DAYS, TRAIN_HOURS = cfg.DEV_TRAIN_DAYS, cfg.DEV_TRAIN_HOURS
        VALID_DAYS, VALID_HOURS = cfg.DEV_VALID_DAYS, cfg.DEV_VALID_HOURS
    else:
        # NOTE(review): eval() on config strings — acceptable only because
        # params come from a trusted experiment config, not user input.
        TRAIN_DAYS = eval(params.train_days)
        TRAIN_HOURS = eval(params.train_hours)
        VALID_DAYS = eval(params.valid_days)
        VALID_HOURS = eval(params.valid_hours)

    columns = cfg.FEATURE_COLUMNS + cfg.TARGET_COLUMNS
    meta_train_split = read_csv_time_chunks(params.train_chunks_dir,
                                            days=TRAIN_DAYS,
                                            hours=TRAIN_HOURS,
                                            usecols=columns,
                                            dtype=cfg.COLUMN_TYPES['train'],
                                            logger=logger)
    meta_valid_split = read_csv_time_chunks(params.train_chunks_dir,
                                            days=VALID_DAYS,
                                            hours=VALID_HOURS,
                                            usecols=columns,
                                            dtype=cfg.COLUMN_TYPES['train'],
                                            logger=logger)

    data_hash_channel_send(ctx, 'Training Data Hash', meta_train_split)
    data_hash_channel_send(ctx, 'Validation Data Hash', meta_valid_split)

    if dev_mode:
        # Subsample without replacement so dev runs stay fast.
        meta_train_split = meta_train_split.sample(cfg.DEV_SAMPLE_TRAIN_SIZE, replace=False)
        meta_valid_split = meta_valid_split.sample(cfg.DEV_SAMPLE_VALID_SIZE, replace=False)

    logger.info('Target distribution in train: {}'.format(meta_train_split['is_attributed'].mean()))
    logger.info('Target distribution in valid: {}'.format(meta_valid_split['is_attributed'].mean()))

    logger.info('shuffling data')
    # sample(frac=1) returns a full random permutation of the rows.
    meta_train_split = meta_train_split.sample(frac=1)
    meta_valid_split = meta_valid_split.sample(frac=1)

    data = {'input': {'X': meta_train_split[cfg.FEATURE_COLUMNS],
                      'y': meta_train_split[cfg.TARGET_COLUMNS],
                      'X_valid': meta_valid_split[cfg.FEATURE_COLUMNS],
                      'y_valid': meta_valid_split[cfg.TARGET_COLUMNS]}}

    pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    pipeline.fit_transform(data)
    pipeline.clean_cache()
def _predict(pipeline_name, dev_mode):
    """Run the inference pipeline on the test file and write submission.csv.

    Parameters
    ----------
    pipeline_name: key into the PIPELINES registry.
    dev_mode: when truthy, only the first ``cfg.DEV_SAMPLE_TEST_SIZE`` rows
        of the test file are read.
    """
    logger.info('reading data in')
    # Single read: the two original branches differed only in `nrows`;
    # nrows=None reads the whole file, so dev mode just caps the row count.
    meta_test = pd.read_csv(params.test_filepath,
                            usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                            dtype=cfg.COLUMN_TYPES['inference'],
                            nrows=cfg.DEV_SAMPLE_TEST_SIZE if dev_mode else None)

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test)

    data = {
        'input': {
            'X': meta_test[cfg.FEATURE_COLUMNS],
            'y': None,
        },
    }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']

    logger.info('creating submission')
    submission = create_submission(meta_test, y_pred)

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    # index=False is the documented spelling; the original index=None
    # relied on None being treated as falsy by pandas.
    submission.to_csv(submission_filepath, index=False, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
def _evaluate(pipeline_name, dev_mode):
    """Score the inference pipeline on the validation time chunks.

    Computes ROC AUC on the full validation split and on the subset of rows
    that fall within the configured submission hours; both scores are sent
    to the experiment context channel.
    """
    logger.info('reading data in')
    if dev_mode:
        VALID_DAYS_HOURS = cfg.DEV_VALID_DAYS_HOURS
    else:
        # NOTE(review): eval() on a config string — only safe because params
        # come from a trusted experiment config, not user input.
        VALID_DAYS_HOURS = eval(params.valid_days_hours)

    meta_valid_split = read_csv_time_chunks(params.train_chunks_dir,
                                            prefix='train',
                                            days_hours=VALID_DAYS_HOURS,
                                            usecols=cfg.FEATURE_COLUMNS +
                                            cfg.TARGET_COLUMNS,
                                            dtype=cfg.COLUMN_TYPES['train'],
                                            logger=logger)

    data_hash_channel_send(ctx, 'Evaluation Data Hash', meta_valid_split)

    if dev_mode:
        # Subsample without replacement so dev runs stay fast.
        meta_valid_split = meta_valid_split.sample(cfg.DEV_SAMPLE_VALID_SIZE,
                                                   replace=False)

    logger.info('Target distribution in valid: {}'.format(
        meta_valid_split['is_attributed'].mean()))

    # Inference input: features only, no labels.
    data = {
        'input': {
            'X': meta_valid_split[cfg.FEATURE_COLUMNS],
            'y': None,
        },
    }
    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']
    # Flatten the (n, 1) target column to a 1-D vector for scoring.
    y_true = meta_valid_split[cfg.TARGET_COLUMNS].values.reshape(-1)

    logger.info('Saving evaluation predictions')
    save_evaluation_predictions(params.experiment_dir, y_true, y_pred,
                                meta_valid_split)

    logger.info('Calculating ROC_AUC Full Scores')
    score = roc_auc_score(y_true, y_pred)
    logger.info('ROC_AUC score on full_validation is {}'.format(score))
    ctx.channel_send('ROC_AUC FULL', 0, score)

    logger.info('Subsetting on submission times')
    # Restrict scoring to the hours the leaderboard submission covers.
    index_for_submission_hours = get_submission_hours_index(
        meta_valid_split, cfg.TIMESTAMP_COLUMN, eval(params.submission_hours))
    y_pred_ = y_pred[index_for_submission_hours]
    y_true_ = y_true[index_for_submission_hours]

    logger.info('Calculating ROC_AUC Submission Scores')
    score = roc_auc_score(y_true_, y_pred_)
    logger.info(
        'ROC_AUC score on submission subset of validation is {}'.format(score))
    ctx.channel_send('ROC_AUC SUBSET', 0, score)
def _evaluate(pipeline_name, dev_mode):
    """Evaluate the inference pipeline on the validation time chunks.

    Sends ROC AUC for the full validation split and for the rows within
    the configured submission hours to the experiment channel.
    """
    logger.info('reading data in')
    if dev_mode:
        valid_days_hours = cfg.DEV_VALID_DAYS_HOURS
    else:
        # NOTE(review): eval() on a config string — trusted params only.
        valid_days_hours = eval(params.valid_days_hours)

    columns = cfg.FEATURE_COLUMNS + cfg.TARGET_COLUMNS
    meta_valid_split = read_csv_time_chunks(params.train_chunks_dir,
                                            prefix='train',
                                            days_hours=valid_days_hours,
                                            usecols=columns,
                                            dtype=cfg.COLUMN_TYPES['train'],
                                            logger=logger)

    data_hash_channel_send(ctx, 'Evaluation Data Hash', meta_valid_split)

    if dev_mode:
        # Subsample without replacement for fast dev runs.
        meta_valid_split = meta_valid_split.sample(cfg.DEV_SAMPLE_VALID_SIZE,
                                                   replace=False)

    logger.info('Target distribution in valid: {}'.format(
        meta_valid_split['is_attributed'].mean()))

    # Inference input: features only, no labels.
    data = {
        'input': {
            'X': meta_valid_split[cfg.FEATURE_COLUMNS],
            'y': None,
        },
    }
    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']
    # Flatten the (n, 1) target column into a 1-D vector for scoring.
    y_true = meta_valid_split[cfg.TARGET_COLUMNS].values.reshape(-1)

    logger.info('Saving evaluation predictions')
    save_evaluation_predictions(params.experiment_dir, y_true, y_pred,
                                meta_valid_split)

    logger.info('Calculating ROC_AUC Full Scores')
    score = roc_auc_score(y_true, y_pred)
    logger.info('ROC_AUC score on full_validation is {}'.format(score))
    ctx.channel_send('ROC_AUC FULL', 0, score)

    logger.info('Subsetting on submission times')
    subset_index = get_submission_hours_index(meta_valid_split,
                                              cfg.TIMESTAMP_COLUMN,
                                              eval(params.submission_hours))

    logger.info('Calculating ROC_AUC Submission Scores')
    score = roc_auc_score(y_true[subset_index], y_pred[subset_index])
    logger.info('ROC_AUC score on submission subset of validation is {}'.format(score))
    ctx.channel_send('ROC_AUC SUBSET', 0, score)
def _train(pipeline_name, dev_mode):
    """Fit the training pipeline on day-hour keyed train/valid chunks."""
    # Remove stale experiment artifacts when overwriting is enabled.
    if bool(params.overwrite) and os.path.isdir(params.experiment_dir):
        shutil.rmtree(params.experiment_dir)

    logger.info('reading data in')
    if dev_mode:
        train_days_hours = cfg.DEV_TRAIN_DAYS_HOURS
        valid_days_hours = cfg.DEV_VALID_DAYS_HOURS
    else:
        # NOTE(review): eval() on config strings — trusted params only.
        train_days_hours = eval(params.train_days_hours)
        valid_days_hours = eval(params.valid_days_hours)

    columns = cfg.FEATURE_COLUMNS + cfg.TARGET_COLUMNS
    meta_train_split = read_csv_time_chunks(params.train_chunks_dir,
                                            prefix='train',
                                            days_hours=train_days_hours,
                                            usecols=columns,
                                            dtype=cfg.COLUMN_TYPES['train'],
                                            logger=logger)
    meta_valid_split = read_csv_time_chunks(params.train_chunks_dir,
                                            prefix='train',
                                            days_hours=valid_days_hours,
                                            usecols=columns,
                                            dtype=cfg.COLUMN_TYPES['train'],
                                            logger=logger)

    data_hash_channel_send(ctx, 'Training Data Hash', meta_train_split)
    data_hash_channel_send(ctx, 'Validation Data Hash', meta_valid_split)

    if dev_mode:
        # Shrink both splits for quick dev iterations.
        meta_train_split = meta_train_split.sample(cfg.DEV_SAMPLE_TRAIN_SIZE,
                                                   replace=False)
        meta_valid_split = meta_valid_split.sample(cfg.DEV_SAMPLE_VALID_SIZE,
                                                   replace=False)

    logger.info('Target distribution in train: {}'.format(
        meta_train_split['is_attributed'].mean()))
    logger.info('Target distribution in valid: {}'.format(
        meta_valid_split['is_attributed'].mean()))

    logger.info('shuffling data')
    # sample(frac=1) returns a full random permutation of the rows.
    meta_train_split = meta_train_split.sample(frac=1)
    meta_valid_split = meta_valid_split.sample(frac=1)

    data = {
        'input': {
            'X': meta_train_split[cfg.FEATURE_COLUMNS],
            'y': meta_train_split[cfg.TARGET_COLUMNS],
            'X_valid': meta_valid_split[cfg.FEATURE_COLUMNS],
            'y_valid': meta_valid_split[cfg.TARGET_COLUMNS],
        },
    }

    pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    pipeline.fit_transform(data)
    pipeline.clean_cache()
def _predict(pipeline_name, dev_mode):
    """Predict on the combined test data (chunked supplement plus the
    official test file) and write both the full and the subset submission
    CSVs to the experiment directory.
    """
    logger.info('reading data in')
    if dev_mode:
        TEST_DAYS_HOURS = cfg.DEV_TEST_DAYS_HOURS
    else:
        # NOTE(review): eval() on a config string — only safe because params
        # come from a trusted experiment config, not user input.
        TEST_DAYS_HOURS = eval(params.test_days_hours)

    meta_test_suplement = read_csv_time_chunks(params.test_chunks_dir,
                                               prefix='test',
                                               days_hours=TEST_DAYS_HOURS,
                                               usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                                               dtype=cfg.COLUMN_TYPES['inference'],
                                               logger=logger)
    meta_test = pd.read_csv(params.test_filepath,
                            usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                            dtype=cfg.COLUMN_TYPES['inference'])
    # Union of supplement and official rows; keep='last' means the official
    # test file (concatenated second) wins on duplicate ids.
    meta_test_full = pd.concat([meta_test_suplement, meta_test], axis=0).reset_index(drop=True)
    meta_test_full.drop_duplicates(subset=cfg.ID_COLUMN, keep='last', inplace=True)
    meta_test_full['click_time'] = pd.to_datetime(meta_test_full['click_time'], format='%Y-%m-%d %H:%M:%S')

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test_full)

    if dev_mode:
        # Subsample without replacement so dev runs stay fast.
        meta_test_full = meta_test_full.sample(cfg.DEV_SAMPLE_TEST_SIZE, replace=False)

    # Inference input: features only, no labels.
    data = {'input': {'X': meta_test_full[cfg.FEATURE_COLUMNS],
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']

    logger.info('creating submission full test')
    full_submission = create_submission(meta_test_full, y_pred)
    full_submission_filepath = os.path.join(params.experiment_dir, 'full_submission.csv')
    full_submission.to_csv(full_submission_filepath, index=None, encoding='utf-8')

    logger.info('subsetting submission')
    # Keep only ids that appear in the official test file.
    submission = pd.merge(full_submission, meta_test[cfg.ID_COLUMN], on=cfg.ID_COLUMN, how='inner')

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
# Beispiel #7 (0)
def _train(pipeline_name, dev_mode):
    """Fit the training pipeline on a stratified train/valid split.

    Reads the (optionally English-translated) train file, splits it with
    stratification on binned targets, shuffles both splits, and runs
    ``fit_transform`` on the pipeline registered under ``pipeline_name``.
    """
    # Choose the translated file when configured to use English text.
    if params.use_english:
        train_filepath = params.train_en_filepath
    else:
        train_filepath = params.train_filepath

    # Remove stale experiment artifacts when overwriting is requested.
    if bool(params.overwrite) and os.path.isdir(params.experiment_dir):
        shutil.rmtree(params.experiment_dir)

    logger.info('reading data in')
    # Single read_csv call: the two original branches differed only in
    # `nrows`; nrows=None reads the whole file, so dev mode just caps rows.
    meta_train = pd.read_csv(train_filepath,
                             usecols=cfg.FEATURE_COLUMNS + cfg.TARGET_COLUMNS + cfg.ITEM_ID_COLUMN,
                             dtype=cfg.COLUMN_TYPES['train'],
                             nrows=cfg.DEV_SAMPLE_SIZE if dev_mode else None)

    # Fixed seed keeps the split reproducible across runs.
    meta_train_split, meta_valid_split = stratified_train_valid_split(meta_train,
                                                                      target_column=cfg.TARGET_COLUMNS,
                                                                      target_bins=params.target_bins,
                                                                      valid_size=params.validation_size,
                                                                      random_state=1234)

    data_hash_channel_send(ctx, 'Training Data Hash', meta_train_split)
    data_hash_channel_send(ctx, 'Validation Data Hash', meta_valid_split)

    logger.info('Target distribution in train: {}'.format(meta_train_split[cfg.TARGET_COLUMNS].mean()))
    logger.info('Target distribution in valid: {}'.format(meta_valid_split[cfg.TARGET_COLUMNS].mean()))

    logger.info('shuffling data')
    # sample(frac=1) returns a full random permutation of the rows.
    meta_train_split = meta_train_split.sample(frac=1)
    meta_valid_split = meta_valid_split.sample(frac=1)

    data = {'input': {'X': meta_train_split[cfg.FEATURE_COLUMNS],
                      'y': meta_train_split[cfg.TARGET_COLUMNS],
                      'X_valid': meta_valid_split[cfg.FEATURE_COLUMNS],
                      'y_valid': meta_valid_split[cfg.TARGET_COLUMNS],
                      },
            'specs': {'is_train': True}
            }

    pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    pipeline.fit_transform(data)
    pipeline.clean_cache()
# Beispiel #8 (0)
def _evaluate(pipeline_name, dev_mode):
    """Score the inference pipeline on the validation split with RMSE.

    Recreates the same stratified split as ``_train`` (same parameters and
    seed, see the split call below), runs inference on the validation fold,
    saves predictions, and sends the RMSE to the experiment channel.
    """
    logger.info('reading data in')
    # Choose the translated file when configured to use English text.
    if params.use_english:
        train_filepath = params.train_en_filepath
    else:
        train_filepath = params.train_filepath

    # Single read_csv call: the two original branches differed only in
    # `nrows`; nrows=None reads the whole file, so dev mode just caps rows.
    meta_train = pd.read_csv(train_filepath,
                             usecols=cfg.FEATURE_COLUMNS + cfg.TARGET_COLUMNS + cfg.ITEM_ID_COLUMN,
                             dtype=cfg.COLUMN_TYPES['train'],
                             nrows=cfg.DEV_SAMPLE_SIZE if dev_mode else None)

    # Same split parameters and random_state as training, so the
    # validation fold matches the one held out during _train.
    _, meta_valid_split = stratified_train_valid_split(meta_train,
                                                       target_column=cfg.TARGET_COLUMNS,
                                                       target_bins=params.target_bins,
                                                       valid_size=params.validation_size,
                                                       random_state=1234)

    data_hash_channel_send(ctx, 'Evaluation Data Hash', meta_valid_split)

    logger.info('Target distribution in valid: {}'.format(meta_valid_split[cfg.TARGET_COLUMNS].mean()))

    # Inference input: features only, no labels.
    data = {'input': {'X': meta_valid_split[cfg.FEATURE_COLUMNS],
                      'y': None,
                      },
            'specs': {'is_train': True}
            }
    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']
    # Flatten the (n, 1) target column into a 1-D vector for scoring.
    y_true = meta_valid_split[cfg.TARGET_COLUMNS].values.reshape(-1)

    logger.info('Saving evaluation predictions')
    save_evaluation_predictions(params.experiment_dir, y_true, y_pred, meta_valid_split)

    logger.info('Calculating RMSE')
    score = root_mean_squared_error(y_true, y_pred)
    logger.info('RMSE score on validation is {}'.format(score))
    ctx.channel_send('RMSE', 0, score)
def _predict(pipeline_name, dev_mode):
    """Predict on the full test data (official file plus chunked
    supplement) and write full_submission.csv and submission.csv.
    """
    logger.info('reading data in')
    if dev_mode:
        test_days_hours = cfg.DEV_TEST_DAYS_HOURS
    else:
        # NOTE(review): eval() on a config string — trusted params only.
        test_days_hours = eval(params.test_days_hours)

    meta_test_suplement = read_csv_time_chunks(params.test_chunks_dir,
                                               prefix='test',
                                               days_hours=test_days_hours,
                                               usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                                               dtype=cfg.COLUMN_TYPES['inference'],
                                               logger=logger)
    meta_test = pd.read_csv(params.test_filepath,
                            usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                            dtype=cfg.COLUMN_TYPES['inference'])
    # Union of supplement and official rows; keep='last' means the official
    # test file (concatenated second) wins on duplicate ids.
    meta_test_full = pd.concat([meta_test_suplement, meta_test], axis=0).reset_index(drop=True)
    meta_test_full.drop_duplicates(subset=cfg.ID_COLUMN, keep='last', inplace=True)
    meta_test_full['click_time'] = pd.to_datetime(meta_test_full['click_time'],
                                                  format='%Y-%m-%d %H:%M:%S')

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test_full)

    if dev_mode:
        # Subsample without replacement so dev runs stay fast.
        meta_test_full = meta_test_full.sample(cfg.DEV_SAMPLE_TEST_SIZE, replace=False)

    # Inference input: features only, no labels.
    data = {'input': {'X': meta_test_full[cfg.FEATURE_COLUMNS],
                      'y': None}}

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']

    logger.info('creating submission full test')
    full_submission = create_submission(meta_test_full, y_pred)
    full_submission_filepath = os.path.join(params.experiment_dir, 'full_submission.csv')
    full_submission.to_csv(full_submission_filepath, index=None, encoding='utf-8')

    logger.info('subsetting submission')
    # Keep only ids that appear in the official test file.
    submission = pd.merge(full_submission, meta_test[cfg.ID_COLUMN],
                          on=cfg.ID_COLUMN, how='inner')

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))