Example #1
def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Get config
    with open(args.config) as f:
        config = json.load(f)
    config.update({'args': {'config': args.config, 'debug': args.debug}})

    if config["model"]["name"] == "lightgbm":
        config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (pathlib.Path(config['dataset']['output_directory']) /
                        model_no)
    model_output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({'model_output_dir': str(model_output_dir)})

    # =========================================
    # === Loading data
    # =========================================
    logger.info('Loading data')

    # Get train and test
    input_dir = pathlib.Path(config['dataset']['input_directory'])
    train = pd.read_csv(input_dir / 'train.csv')
    test = pd.read_csv(input_dir / 'test.csv')

    # Get target values
    target_column = config['data_type']['target']
    y_train = train[target_column].values

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')

    # Get features
    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # =========================================
    # === Adversarial Validation
    # =========================================
    logger.info("adversarial validation")
    train_adv = x_train
    test_adv = x_test
    train_adv['target'] = 0
    test_adv['target'] = 1
    train_test_adv = pd.concat([train_adv, test_adv], axis=0,
                               sort=False).reset_index(drop=True)

    train_set, val_set = train_test_split(train_test_adv,
                                          test_size=0.33,
                                          random_state=71,
                                          shuffle=True)
    x_train_adv = train_set[feature_name]
    y_train_adv = train_set['target']
    x_val_adv = val_set[feature_name]
    y_val_adv = val_set['target']
    logger.debug(f'the number of train set: {len(x_train_adv)}')
    logger.debug(f'the number of valid set: {len(x_val_adv)}')

    train_lgb = lgb.Dataset(x_train_adv, label=y_train_adv)
    val_lgb = lgb.Dataset(x_val_adv, label=y_val_adv)
    lgb_model_params = config["adversarial_validation"]["lgb_model_params"]
    lgb_train_params = config["adversarial_validation"]["lgb_train_params"]
    clf = lgb.train(lgb_model_params,
                    train_lgb,
                    valid_sets=[train_lgb, val_lgb],
                    valid_names=['train', 'valid'],
                    **lgb_train_params)

    feature_imp = pd.DataFrame(
        sorted(zip(clf.feature_importance(importance_type='gain'),
                   feature_name)),
        columns=['value', 'feature'])
    plt.figure(figsize=(20, 10))
    sns.barplot(x='value',
                y='feature',
                data=feature_imp.sort_values(by='value',
                                             ascending=False).head(20))
    plt.title('LightGBM Features')
    plt.tight_layout()
    plt.savefig(model_output_dir / "feature_importance_adv.png")
    plt.close()  # release the figure so long runs do not accumulate memory

    config.update({
        'adversarial_validation_result': {
            'score': clf.best_score,
            'feature_importances': feature_imp.set_index("feature")
                                              .sort_values(by="value", ascending=False)
                                              .head(20)
                                              .to_dict()["value"]
        }
    })
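    # Interpretation: if train and test come from the same distribution,
    # this classifier can do no better than chance (AUC around 0.5). A
    # clearly higher score means the top features in the plot above drift
    # between train and test and may deserve removal or rework.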

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Get features
    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # Get folds
    folds_ids = Fold(
        n_splits=config['cv']['n_splits'],
        shuffle=config['cv']['shuffle'],
        random_state=config['cv']['random_state']).get_stratifiedkfold(
            x_train, y_train)
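    # folds_ids is assumed to be a list of (train_idx, valid_idx) pairs,
    # one per fold, in the same shape StratifiedKFold.split would yield.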

    # Train and predict
    model_name = config['model']['name']
    model_cls = model_map[model_name]
    params = config['model']
    runner = Runner(model_cls, params, model_output_dir,
                    f'Train_{model_cls.__name__}')

    oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids)
    config.update(evals_result)
    test_preds = runner.predict_cv(x_test)

    # =========================================
    # === Make submission file
    # =========================================
    sub = create_submission(test, test_preds, target_column)
    sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True)

    # =========================================
    # === Save files
    # =========================================
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)

    pd.DataFrame(oof_preds,
                 columns=["target"]).to_csv(model_output_dir / 'oof.csv',
                                            index=False,
                                            header=True)
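
Example #1 drives everything from one JSON file, and `load_features(config)` is assumed to return train/test feature frames whose rows align with train.csv and test.csv. A hedged reconstruction of what configs/model_0.json might contain, expressed as the Python dict json.load would return; every key below is read somewhere in the code above, but all values are illustrative:

example_config = {
    "dataset": {
        "input_directory": "./data/input",
        "output_directory": "./data/output",
    },
    "data_type": {"target": "target"},
    "model": {
        "name": "lightgbm",
        # model_params is forwarded to the model wrapper; nthread is
        # overwritten with cpu_count() at runtime
        "model_params": {"objective": "binary", "learning_rate": 0.1},
    },
    "cv": {"n_splits": 5, "shuffle": True, "random_state": 71},
    "adversarial_validation": {
        "lgb_model_params": {"objective": "binary", "metric": "auc"},
        # passed to lgb.train as keyword arguments
        "lgb_train_params": {"num_boost_round": 100},
    },
}
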
Example #2
def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config', default='model_lgb_hakubishin_20200317/configs/model_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Get config
    with open(args.config) as f:
        config = json.load(f)
    config.update({
        'args': {
            'config': args.config,
            'debug': args.debug
        }
    })
    config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (
        pathlib.Path(config['model_dir_name']) /
        pathlib.Path(config['dataset']['output_directory']) / model_no
    )
    model_output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({
        'model_output_dir': str(model_output_dir)
    })

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')
    logger.info(f'targets: {config["target"]}')
    logger.info(f'features: {config["features"]}')
    logger.info(f'keys: {config["key"]}')
    logger.info(f'folds: {config["folds"]}')

    # features
    x_train = FeatureLoader(
        data_type="training", debugging=args.debug
        ).load_features(config["features"])
    x_test = FeatureLoader(
        data_type=config["test_data_type"], debugging=args.debug
        ).load_features(config["features"])

    # targets
    y_train_set = FeatureLoader(
        data_type="training", debugging=args.debug
        ).load_features(config["target"])

    # keys
    key_test = FeatureLoader(
        data_type=config["test_data_type"], debugging=args.debug
        ).load_features(config["key"])

    # folds
    folds_train = FeatureLoader(
        data_type="training", debugging=args.debug
        ).load_features(config["folds"])

    logger.debug(f'test_data_type: {config["test_data_type"]}')
    logger.debug(f'y_train_set: {y_train_set.shape}')
    logger.debug(f'x_train: {x_train.shape}')
    logger.debug(f'x_test: {x_test.shape}')
    logger.debug(f'key_test: {key_test.shape}')

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Modeling
    target_columns = [
        "reply_engagement",
        "retweet_engagement",
        "retweet_with_comment_engagement",
        "like_engagement",
    ]
    for cat in target_columns:
        logger.info(f'============= {cat} =============')

        # Get target values
        y_train = y_train_set[f"TargetCategories_{cat}"].values

        # Get folds
        folds_col = ["StratifiedGroupKFold_retweet_with_comment_engagement"]
        assert len(folds_col) == 1, "Exactly one fold column is expected"
        fold_series = folds_train[folds_col[0]]
        n_fold = int(fold_series.max()) + 1
        folds_ids = []

        logger.debug(f"total pos: {y_train.sum()}")
        for i in range(n_fold):
            trn_idx = fold_series[fold_series != i].index
            val_idx = fold_series[fold_series == i].index
            folds_ids.append((trn_idx, val_idx))
            logger.debug(f"{i+1}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")
            logger.debug(f"{i+1}fold: trn_pos={y_train[trn_idx].sum()}, val_pos={y_train[val_idx].sum()}")

        # Train and predict
        model_cls = model_map[config['model']['name']]
        model_params = config['model']
        runner = Runner(
            model_cls, model_params, model_output_dir, f'Train_{model_cls.__name__}_{cat}'
        )
        oof_preds, test_preds, evals_result = runner.train_cv(
            x_train, y_train, x_test, folds_ids, config)

        evals_result[f"evals_result_{cat}"] = evals_result["evals_result"]
        evals_result.pop("evals_result")
        config.update(evals_result)

        # Save oof-pred file
        oof_preds_file_name = f"{cat}_oof_pred"
        np.save(model_output_dir / oof_preds_file_name, oof_preds)
        logger.info(f'Save oof-pred file: {model_output_dir / oof_preds_file_name}')

        # Make submission file
        sub = pd.concat([key_test, pd.Series(test_preds).rename("pred")], axis=1)
        sub = sub[["KeyCategories_tweet_id", "KeyCategories_engaging_user_id", "pred"]]
        sub_file_name = f"{cat}_submission_{config['test_data_type']}.csv"
        sub.to_csv(model_output_dir / sub_file_name, index=False, header=False)
        logger.info(f'Save submission file: {model_output_dir / sub_file_name}')

        # Save files (override)
        logger.info('Save files')
        save_path = model_output_dir / 'output.json'
        json_dump(config, save_path)
        logger.info(f'Save model log: {save_path}')

    # =========================================
    # === Upload to GCS
    # =========================================
    if not args.debug:
        logger.info('Upload to GCS')

        bucket_dir_name = config["model_dir_name"] + "/" + model_no
        logger.info(f'bucket_dir_name: {bucket_dir_name}')

        files = list(model_output_dir.iterdir())
        upload_to_gcs(bucket_dir_name, files)
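
Examples #2 and #3 load everything through a FeatureLoader that is not shown. A minimal sketch of what it might look like, assuming each feature group is stored as one feather file per data split under a features/ directory (the path, file format, and naming scheme are all assumptions):

import pathlib

import pandas as pd

FEATURE_DIR = pathlib.Path("./features")  # assumed feature-store location


class FeatureLoader:
    def __init__(self, data_type, debugging=False):
        self.data_type = data_type
        self.debugging = debugging

    def load_features(self, feature_names):
        # Read one file per feature group and join them column-wise.
        dfs = [
            pd.read_feather(FEATURE_DIR / f"{name}_{self.data_type}.ftr")
            for name in feature_names
        ]
        df = pd.concat(dfs, axis=1)
        # In debug mode, keep only a small sample for fast iteration.
        return df.head(10000) if self.debugging else df
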
Example #3
def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config', default='model_lgb_hakubishin_20200317/configs/model_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Get config
    with open(args.config) as f:
        config = json.load(f)
    config.update({
        'args': {
            'config': args.config,
            'debug': args.debug
        }
    })
    config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (
        pathlib.Path(config['model_dir_name']) /
        pathlib.Path(config['dataset']['output_directory']) / model_no
    )
    model_output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({
        'model_output_dir': str(model_output_dir)
    })

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')
    logger.info(f'targets: {config["target"]}')
    logger.info(f'features: {config["features"]}')

    # features
    x_train = FeatureLoader(
        data_type="training", debugging=args.debug
        ).load_features(config["features"])

    # targets
    y_train_set = FeatureLoader(
        data_type="training", debugging=args.debug
        ).load_features(config["target"])

    # folds
    folds_train = FeatureLoader(
        data_type="training", debugging=args.debug
        ).load_features(config["folds"])

    logger.debug(f'y_train_set: {y_train_set.shape}')
    logger.debug(f'x_train: {x_train.shape}')

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Get target values
    y_train = y_train_set["Target_answered_correctly"].values

    # Get folds
    trn_idx = folds_train.query("Fold_val != 1").index
    val_idx = folds_train.query("Fold_val == 1").index
    folds_ids = [(trn_idx, val_idx)]
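    # A single holdout split, expressed in the same list-of-(train_idx,
    # valid_idx) format as k-fold CV so Runner (called with n_fold=1) can
    # treat both cases uniformly.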
    logger.debug(f"n_trn={len(trn_idx)}, n_val={len(val_idx)}")
    logger.debug(f"trn_pos={y_train[trn_idx].sum()}, val_pos={y_train[val_idx].sum()}")

    # Train and predict
    model_cls = model_map[config['model']['name']]
    model_params = config['model']
    runner = Runner(
        model_cls, model_params, model_output_dir, model_cls.__name__, n_fold=1,
    )
    oof_preds, evals_result, importances = runner.train_cv(
        x_train, y_train, folds_ids)
    config.update(evals_result)

    # Save importances
    importances.mean(axis=1).reset_index().rename(
        columns={"index": "feature", 0: "value"}
    ).sort_values("value", ascending=False).to_csv(
        model_output_dir / "importances.csv", index=False
    )

    # Save oof-pred file
    oof_preds_file_name = "oof_pred"
    np.save(model_output_dir / oof_preds_file_name, oof_preds)
    logger.info(f'Save oof-pred file: {model_output_dir / oof_preds_file_name}')

    # Save files (override)
    logger.info('Save files')
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)
    logger.info(f'Save model log: {save_path}')

    # =========================================
    # === Upload to GCS
    # =========================================
    if not args.debug:
        logger.info('Upload to GCS')

        bucket_dir_name = config["model_dir_name"] + "/" + model_no
        logger.info(f'bucket_dir_name: {bucket_dir_name}')

        files = list(model_output_dir.iterdir())
        upload_to_gcs(bucket_dir_name, files)
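
Examples #2 and #3 finish by uploading the model output directory to Google Cloud Storage. A minimal sketch of what upload_to_gcs could look like with the google-cloud-storage client; the bucket name is an assumption, since it is not shown in the examples:

from google.cloud import storage

BUCKET_NAME = "my-competition-bucket"  # assumption: project bucket not shown


def upload_to_gcs(bucket_dir_name, files):
    # Copy each local file to gs://<BUCKET_NAME>/<bucket_dir_name>/<name>.
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
    for file_path in files:
        blob = bucket.blob(f"{bucket_dir_name}/{file_path.name}")
        blob.upload_from_filename(str(file_path))
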
Example #4
def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_1dcnn_0.json')
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logger.info(f'config: {args.config}')
    logger.info(f'debug: {args.debug}')

    # Get config
    with open(args.config) as f:
        config = json.load(f)
    config.update({'args': {'config': args.config, 'debug': args.debug}})

    if config["model"]["name"] == "lightgbm":
        config["model"]["model_params"]["nthread"] = cpu_count()

    # Create a directory for model output
    model_no = pathlib.Path(args.config).stem
    model_output_dir = (pathlib.Path(config['dataset']['output_directory']) /
                        model_no)
    model_output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'model_output_dir: {str(model_output_dir)}')
    logger.debug(f'model_output_dir exists: {model_output_dir.exists()}')
    config.update({'model_output_dir': str(model_output_dir)})

    # =========================================
    # === Loading data
    # =========================================
    logger.info('Loading data')

    # Get train and test
    input_dir = pathlib.Path(config['dataset']['input_directory'])
    train = pd.read_csv(input_dir / 'train.csv')
    test = pd.read_csv(input_dir / 'test.csv')

    spectrum = pd.read_csv(input_dir / 'spectrum_stack.csv')
    spectrum_fitting = pd.read_csv(input_dir / 'spectrum_fitting_stack.csv')
    wv_cols = [f"wavelength_{i}" for i in range(512)]
    wv_fit_cols = [f"fitting_wavelength_{i}" for i in range(512)]

    train_spectrum = pd.merge(train,
                              spectrum,
                              on="spectrum_filename",
                              how="left")
    test_spectrum = pd.merge(test,
                             spectrum,
                             on="spectrum_filename",
                             how="left")
    train_spectrum = pd.merge(train_spectrum,
                              spectrum_fitting,
                              on="spectrum_filename",
                              how="left")
    test_spectrum = pd.merge(test_spectrum,
                             spectrum_fitting,
                             on="spectrum_filename",
                             how="left")

    train_std = np.std(train_spectrum[wv_cols].values, axis=1, keepdims=True)
    test_std = np.std(test_spectrum[wv_cols].values, axis=1, keepdims=True)
    train_spectrum[wv_cols] = train_spectrum[wv_cols].values / train_std
    test_spectrum[wv_cols] = test_spectrum[wv_cols].values / test_std
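    # Each raw spectrum is divided by its own standard deviation so that
    # intensities are comparable across samples (note a zero-variance row
    # would divide by zero here); the fitted spectra keep their scale.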

    spectrum_cols = wv_cols + wv_fit_cols
    train_spectrum = train_spectrum[spectrum_cols]
    test_spectrum = test_spectrum[spectrum_cols]

    # Get target values
    target_column = config['data_type']['target']
    y_train = train[target_column].values

    # =========================================
    # === Loading features
    # =========================================
    logger.info('Loading features')

    # Get features
    x_train, x_test = load_features(config)
    feature_name = x_test.columns
    logger.debug(f'number of features: {len(feature_name)}')

    # =========================================
    # === Feature preprocessing
    # =========================================
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    x_total = pd.concat([x_train, x_test]).reset_index(drop=True)
    remove_features = [c for c in x_total.columns
                       if "layout_x" in c or "layout_y" in c]
    x_total.drop(columns=remove_features, inplace=True)

    x_total = pd.get_dummies(
        x_total, columns=["LabelEncoding_exc_wl", "LabelEncoding_layout_a"])
    x_total.fillna(0, inplace=True)

    from sklearn.preprocessing import StandardScaler
    numeric_features = [
        c for c in x_total.columns if "LabelEncoding_" not in c
    ]
    sc = StandardScaler()
    x_total[numeric_features] = sc.fit_transform(x_total[numeric_features])
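    # Note: dummies and the scaler are fit on train and test combined, a
    # common competition shortcut that keeps columns aligned but would
    # count as leakage in a production pipeline.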

    x_train = x_total.iloc[:len(train)]
    x_test = x_total.iloc[len(train):].reset_index(drop=True)

    x_train = pd.concat([x_train, train_spectrum], axis=1)
    x_test = pd.concat([x_test, test_spectrum], axis=1)
    logger.debug(f'number of features with spec in train: {x_train.shape}')
    logger.debug(f'number of features with spec in test: {x_test.shape}')

    # =========================================
    # === Train model and predict
    # =========================================
    logger.info('Train model and predict')

    # Get folds
    folds_ids = Fold(
        n_splits=config['cv']['n_splits'],
        shuffle=config['cv']['shuffle'],
        random_state=config['cv']['random_state']).get_stratifiedkfold(
            x_train, y_train)

    # Train and predict
    model_name = config['model']['name']
    model_cls = model_map[model_name]
    params = config['model']
    runner = Runner(model_cls, params, model_output_dir,
                    f'Train_{model_cls.__name__}')

    oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids)
    config.update(evals_result)
    test_preds = runner.predict_cv(x_test)

    # =========================================
    # === Make submission file
    # =========================================
    sub = create_submission(test, test_preds, target_column)
    sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True)

    # =========================================
    # === Save files
    # =========================================
    save_path = model_output_dir / 'output.json'
    json_dump(config, save_path)

    pd.DataFrame(oof_preds,
                 columns=["target"]).to_csv(model_output_dir / 'oof.csv',
                                            index=False,
                                            header=True)
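
All four examples share a few small helpers that are never shown. A minimal sketch of get_logger, json_dump, and create_submission, assuming standard-library logging/json and an id column named 'id' in test.csv (the id column name in particular is a guess):

import json
import logging
import sys


def get_logger(name):
    # Console logger with a timestamped format; idempotent so repeated
    # calls do not stack handlers.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(
            logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger


def json_dump(obj, path):
    # Write the config to disk as pretty-printed JSON; default=str papers
    # over values (e.g. numpy scalars) that json cannot encode natively.
    with open(path, 'w') as f:
        json.dump(obj, f, indent=4, default=str)


def create_submission(test, test_preds, target_column):
    # Pair test identifiers with predictions; the 'id' column is assumed.
    sub = test[['id']].copy()
    sub[target_column] = test_preds
    return sub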