Ejemplo n.º 1
0
def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    if is_rank_zero():
        logger.setLevel(cfg.log_level)
        logging.info(OmegaConf.to_yaml(cfg))

    test(cfg)
Ejemplo n.º 2
0
def run(cfg: DictConfig) -> None:
    os.chdir(hydra.utils.get_original_cwd())
    log.info(OmegaConf.to_yaml(cfg))
    cfg['device'] = ('cuda' if torch.cuda.is_available() else 'cpu')
    cfg['list_seed'] = [i for i in range(cfg.model.nseed)]
    verbose = 0
    local_path = '../'
    path = f'{local_path}input/lish-moa'
    path_model = f'{local_path}models'
    cfg['path_model'] = path_model
    # print(os.listdir(f'{local_path}../'))
    os.chdir(hydra.utils.get_original_cwd())
    log.info(OmegaConf.to_yaml(cfg))
    cfg['device'] = ('cuda' if torch.cuda.is_available() else 'cpu')
    cfg['list_seed'] = [i for i in range(cfg.model.nseed)]
    verbose = 1
    local_path = '../'
    path = f'{local_path}input/lish-moa'
    path_model = f'{local_path}models'
    cfg['path_model'] = path_model
    # print(os.listdir(f'{local_path}../'))

    ######################################
    # data_load and preprocess
    ######################################

    pretrain_model = False
    data_dict = load_and_preprocess_data_index(
        cfg,
        path,
        pca_append_test=True,
        variancethreshold_append_test=False,
        verbose=1)

    CV = DrugAwareMultilabelStratifiedKFold(n_splits=cfg.model.nfolds,
                                            shuffle=False,
                                            random_state=42)
    ##################################################
    # Train
    ##################################################
    SEED = [0]
    oof = np.zeros((len(data_dict['train']), len(data_dict['target_cols'])))
    predictions = np.zeros(
        (len(data_dict['test']), len(data_dict['target_cols'])))
    for seed in tqdm([0], leave=verbose):
        xgb_params = {
            'booster': 'gbtree',
            'tree_method': 'gpu_hist',
            'min_child_weight': 31.58,
            'learning_rate': 0.05,
            'colsample_bytree': 0.65,
            'gamma': 3.69,
            'max_delta_step': 2.07,
            'max_depth': 10,
            'n_estimators': 10,
            'subsample': 0.86,
            'verbosity': 1,
        }
        return_run_k_fold = get_xgboost(data_dict,
                                        cfg,
                                        xgb_params,
                                        CV,
                                        seed=seed,
                                        file_prefix='x1',
                                        optimization=False,
                                        verbose=0)

        if cfg.model.train_models:
            oof_, predictions_ = return_run_k_fold
            oof += oof_ / len(SEED)
        else:
            predictions_ = return_run_k_fold
        predictions += predictions_ / len(SEED)
        gc.collect()

    train = data_dict['train'].copy()
    test = data_dict['test'].copy()
    target = data_dict['target'].copy()
    feature_cols = data_dict['feature_cols']
    target_cols = data_dict['target_cols']
    train_targets_scored = data_dict['train_targets_scored']
    test_features = data_dict['test_features']

    if not pretrain_model:
        train[target_cols] = oof
    test[target_cols] = predictions

    ##################################################
    # valodation and save
    ##################################################

    if not pretrain_model:
        y_true = train_targets_scored[target_cols].values
        valid_results = train_targets_scored.drop(columns=target_cols).merge(
            train[target_cols], on='sig_id', how='left').fillna(0)
        y_pred = valid_results[target_cols].values

        score = 0
        for i in range(len(target_cols)):
            score_ = log_loss(y_true[:, i], y_pred[:, i])
            score += score_ / len(target_cols)

        print(f"CV log_loss: {score}")
        log.info(f"CV log_loss: {score}")
        log.info(f"y_true.shape: {y_true.shape}")
        log.info(f"y_pred.shape: {y_pred.shape}")

    # sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id'] + target_cols], on='sig_id',
    #                                                         how='left').fillna(0)
    # sub.to_csv('submission.csv', index=False)
    # log.info(f"sub.shape: {sub.shape}")

    res = test[target_cols]
    corner_case = test_features[test_features['cp_type'] == 'ctl_vehicle']
    zeros = np.zeros((corner_case.shape[0], len(target_cols)))
    corner_case[target_cols] = zeros
    corner_case = corner_case[target_cols]
    res = pd.concat([res, corner_case], axis=0)

    res.to_csv('submission.csv')
    log.info(f"res.shape: {res.shape}")
    log.info(f"test[target_cols].shape: {test[target_cols].shape}")

    if not pretrain_model:
        return score
    else:
        return 0
Ejemplo n.º 3
0
def run(cfg: DictConfig) -> None:
    os.chdir(hydra.utils.get_original_cwd())
    log.info(OmegaConf.to_yaml(cfg))
    cfg['device'] = ('cuda' if torch.cuda.is_available() else 'cpu')
    cfg['list_seed'] = [i for i in range(cfg.model.nseed)]
    verbose = 1
    local_path = '../'
    path = f'{local_path}input/lish-moa'
    path_model = f'{local_path}models'
    cfg['path_model'] = path_model
    # print(os.listdir(f'{local_path}../'))

    ######################################
    # data_load and preprocess
    ######################################

    pretrain_model = False
    data_dict = load_and_preprocess_data(cfg,
                                         path,
                                         pca_append_test=False,
                                         variancethreshold_append_test=False,
                                         verbose=1)

    ######################################
    # cv
    ######################################
    CV = MultilabelStratifiedKFold(n_splits=cfg.model.nfolds, random_state=42)

    ##################################################
    # Train
    ##################################################
    oof = np.zeros((len(data_dict['train']), len(data_dict['target_cols'])))
    predictions = np.zeros(
        (len(data_dict['test']), len(data_dict['target_cols'])))
    for seed in tqdm(cfg['list_seed'], leave=verbose):
        return_run_k_fold = run_k_fold_nn(data_dict,
                                          cfg,
                                          cv=CV,
                                          seed=seed,
                                          file_prefix='m1',
                                          pretrain_model=pretrain_model,
                                          verbose=verbose)
        if not pretrain_model:
            oof_, predictions_ = return_run_k_fold
            oof += oof_ / cfg.model.nseed
        else:
            predictions_ = return_run_k_fold
        predictions += predictions_ / cfg.model.nseed
        gc.collect()

    train = data_dict['train'].copy()
    test = data_dict['test'].copy()
    target = data_dict['target'].copy()
    feature_cols = data_dict['feature_cols']
    target_cols = data_dict['target_cols']
    train_targets_scored = data_dict['train_targets_scored']
    test_features = data_dict['test_features']

    if not pretrain_model:
        train[target_cols] = oof
    test[target_cols] = predictions

    ##################################################
    # valodation and save
    ##################################################

    if not pretrain_model:
        y_true = train_targets_scored[target_cols].values
        valid_results = train_targets_scored.drop(columns=target_cols).merge(
            train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)
        y_pred = valid_results[target_cols].values

        score = 0
        for i in range(len(target_cols)):
            score_ = log_loss(y_true[:, i], y_pred[:, i])
            score += score_ / len(target_cols)

        print(f"CV log_loss: {score}")
        log.info(f"CV log_loss: {score}")
        log.info(f"y_true.shape: {y_true.shape}")
        log.info(f"y_pred.shape: {y_pred.shape}")

    # sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id'] + target_cols], on='sig_id',
    #                                                         how='left').fillna(0)
    # sub.to_csv('submission.csv', index=False)
    # log.info(f"sub.shape: {sub.shape}")

    res = test[['sig_id'] + target_cols]
    corner_case = test_features[test_features['cp_type'] == 'ctl_vehicle']
    zeros = np.zeros((corner_case.shape[0], len(target_cols)))
    corner_case[target_cols] = zeros
    corner_case = corner_case[['sig_id'] + target_cols]
    res = pd.concat([res, corner_case], axis=0)

    res.to_csv('submission.csv', index=False)

    if not pretrain_model:
        return score
    else:
        return 0
Ejemplo n.º 4
0
def run(cfg: DictConfig) -> None:
    os.chdir(hydra.utils.get_original_cwd())
    log.info(OmegaConf.to_yaml(cfg))
    cfg['device'] = ('cuda' if torch.cuda.is_available() else 'cpu')
    cfg['list_seed'] = [i for i in range(cfg.model.nseed)]
    verbose = 1
    local_path = '../'
    path = f'{local_path}input/lish-moa'
    path_model = f'{local_path}models'
    cfg['path_model'] = path_model
    # print(os.listdir(f'{local_path}../'))

    # data_load
    train_features = pd.read_csv(f'{path}/train_features.csv')
    test_features = pd.read_csv(f'{path}/test_features.csv')
    train_targets_scored = pd.read_csv(f'{path}/train_targets_scored.csv')
    train_targets_nonscored = pd.read_csv(
        f'{path}/train_targets_nonscored.csv')

    train_features = change_type(train_features)
    test_features = change_type(test_features)
    train_targets_scored = change_type(train_targets_scored)
    log.info(f"train_targets_scored.shape: {train_targets_scored.shape}")
    sample_submission = pd.read_csv(f'{path}/sample_submission.csv')
    # sub = pd.read_csv(f'{path}/sample_submission.csv')

    log.info(
        f"n_comp_genes: {cfg.model.n_comp_genes}, n_comp_cells: {cfg.model.n_comp_cells}, total: "
        f"{cfg.model.n_comp_genes + cfg.model.n_comp_cells}.")

    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]

    train_features_return, test_features_return = \
        quantile_transformer(train_features, test_features, features=GENES+CELLS,
                             n_quantiles=cfg.quantile_transformer.n_quantiles,
                             output_distribution=cfg.quantile_transformer.output_distribution)
    del train_features, test_features
    gc.collect()
    train_features = train_features_return
    test_features = test_features_return
    log.info(f"End prearation data transform.\n"
             f"train_features.shape: {train_features.shape}\n"
             f"test_features.shape: {test_features.shape}\n"
             f"{'_' * 80}\n")

    ##################################################
    # PCA
    ##################################################

    train_features_return, test_features_return = \
        get_pca_transform(train_features, test_features, features=GENES, n_components=cfg.model.n_comp_genes,
                          flag='GENES', test_append=False)
    train_features = pd.concat((train_features, train_features_return), axis=1)
    test_features = pd.concat((test_features, test_features_return), axis=1)
    del train_features_return, test_features_return
    gc.collect()

    train_features_return, test_features_return = \
        get_pca_transform(train_features, test_features, features=CELLS, n_components=cfg.model.n_comp_cells,
                          flag='CELLS', test_append=False)
    train_features = pd.concat((train_features, train_features_return), axis=1)
    test_features = pd.concat((test_features, test_features_return), axis=1)
    del train_features_return, test_features_return
    gc.collect()
    ##################################################
    # Start: Feature selection
    ##################################################
    train_features_return, test_features_return = \
        split_with_variancethreshold(train_features, test_features,
                                     variance_threshold_for_fs=cfg.model.variance_threshold_for_fs,
                                     categorical=['sig_id', 'cp_type', 'cp_time', 'cp_dose'],
                                     test_append=False)
    del train_features, test_features
    gc.collect()
    train_features = train_features_return
    test_features = test_features_return

    ##################################################
    # Start: Zero hack target & prepare train test
    ##################################################
    if verbose:
        print(f"Preparation of train & test:")

    train = train_features.merge(train_targets_scored, on='sig_id')
    train = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
    test = test_features[
        test_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

    target = train[train_targets_scored.columns]
    train = train.drop('cp_type', axis=1)
    test = test.drop('cp_type', axis=1)

    target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

    log.debug(f"Preparation of train & test.\n"
              f"train.shape: {train.shape}\n"
              f"test.shape: {test.shape}\n"
              f"{'_' * 80}\n")

    ##################################################
    # cv folds
    ##################################################
    folds = train.copy()
    mskf = MultilabelStratifiedKFold(n_splits=cfg.model.nfolds,
                                     random_state=cfg['list_seed'][0])

    for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
        folds.loc[v_idx, 'kfold'] = int(f)

    folds['kfold'] = folds['kfold'].astype(int)

    log.debug(f"train.shape: {train.shape}"
              f"folds.shape: {folds.shape}"
              f"test.shape: {test.shape}"
              f"target.shape: {target.shape}")

    gc.collect()
    ##################################################
    # Preprocessing feature_cols
    ##################################################
    feature_cols = [
        c for c in preprocess_data(folds, cfg.model.patch1).columns
        if c not in target_cols
    ]
    feature_cols = [c for c in feature_cols if c not in ['kfold', 'sig_id']]
    num_features = len(feature_cols)
    num_targets = len(target_cols)

    ##################################################
    # END PREPROCESS
    ##################################################

    CV = MultilabelStratifiedKFold(n_splits=cfg.model.nfolds, random_state=42)
    data_dict = {
        'train': preprocess_data(train),
        'target': target,
        'test': preprocess_data(test),
        'feature_cols': feature_cols,
        'target_cols': target_cols
    }
    # base_model_def(data_dict, params, cv=cv, optimization=False, verbose=0):

    ##################################################
    # Train
    ##################################################
    SEED = cfg['list_seed']
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for seed in tqdm(SEED, leave=verbose):
        return_run_k_fold = run_k_fold(cfg.model.nfolds, seed, cfg, folds,
                                       train, test, feature_cols, target_cols,
                                       num_features, num_targets, target,
                                       verbose)
        if cfg.model.train_models:
            oof_, predictions_ = return_run_k_fold
            oof += oof_ / len(SEED)
        else:
            predictions_ = return_run_k_fold
        predictions += predictions_ / len(SEED)
        gc.collect()

    if cfg.model.train_models:
        train[target_cols] = oof
    test[target_cols] = predictions

    ##################################################
    # valodation and save
    ##################################################

    if cfg.model.train_models:
        y_true = train_targets_scored[target_cols].values
        valid_results = train_targets_scored.drop(columns=target_cols).merge(
            train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)
        y_pred = valid_results[target_cols].values

        score = 0
        for i in range(len(target_cols)):
            score_ = log_loss(y_true[:, i], y_pred[:, i])
            score += score_ / num_targets

        print(f"CV log_loss: {score}")
        log.info(f"CV log_loss: {score}")
        log.info(f"y_true.shape: {y_true.shape}")
        log.info(f"y_pred.shape: {y_pred.shape}")

    # sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id'] + target_cols], on='sig_id',
    #                                                         how='left').fillna(0)
    # sub.to_csv('submission.csv', index=False)
    # log.info(f"sub.shape: {sub.shape}")

    res = test[['sig_id'] + target_cols]
    corner_case = test_features[test_features['cp_type'] == 'ctl_vehicle']
    zeros = np.zeros((corner_case.shape[0], len(target_cols)))
    corner_case[target_cols] = zeros
    corner_case = corner_case[['sig_id'] + target_cols]
    res = pd.concat([res, corner_case], axis=0)

    res.to_csv('submission.csv', index=False)

    if cfg.model.train_models:
        return score
    else:
        return 0