Example #1
def make_relative_position():
    with utils.timer("read data"):
        train_df = compe_data.read_train()
        test_df = compe_data.read_test()
        st_df = compe_data.read_structures()
    # merge structure info into train and test
    with utils.timer("merge atom_st"):
        train_df = map_atom_info(train_df, st_df, 0)
        train_df = map_atom_info(train_df, st_df, 1)
        test_df = map_atom_info(test_df, st_df, 0)
        test_df = map_atom_info(test_df, st_df, 1)
    # make structure dict
    #with utils.timer("make st_dict"):
    #    st_dict = make_st_dict(st_df)
    #    del st_df
    #    gc.collect()
    # build the relative-position matrix for train and test separately
    train_df = each_make(train_df, st_df)
    test_df = each_make(test_df, st_df)
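
map_atom_info is a project helper that is not shown in this listing. Below is a minimal sketch of a plausible implementation, following the merge pattern shared in public CHAMPS kernels; the column names and merge keys are assumptions, not the author's confirmed code.

import pandas as pd

def map_atom_info(df, structures, atom_idx):
    # Join the element symbol and xyz coordinates of atom `atom_idx` (0 or 1)
    # of each coupling pair onto the pair-level frame.
    df = pd.merge(df, structures, how="left",
                  left_on=["molecule_name", f"atom_index_{atom_idx}"],
                  right_on=["molecule_name", "atom_index"])
    df = df.drop("atom_index", axis=1)
    df = df.rename(columns={"atom": f"atom_{atom_idx}",
                            "x": f"x_{atom_idx}",
                            "y": f"y_{atom_idx}",
                            "z": f"z_{atom_idx}"})
    return df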
Example #2
def train(train_df=None,
          y=None,
          use_feats=None,
          train_type=None,
          permutation=False,
          cat_cols=[],
          single_fold=False,
          log_dir="./",
          target_mode="target",
          meta_col=None):
    cat_cols = [c for c in cat_cols if c in use_feats]
    full_train = train_df[use_feats]
    train_df_for_score = train_df[["type", "scalar_coupling_constant"]]
    logger = logging.getLogger(__name__)
    log_callback = log_evaluation(logger, period=n_log_period)
    #if target_mode == "target":
    #    lgb_param = lgb_param_target
    #else:
    #    lgb_param = lgb_param_meta
    lgb_param = get_param(train_type, meta_col)
    clfs = []
    importances = pd.DataFrame()
    oof_preds = np.zeros(len(
        compe_data.read_train()))  #np.zeros(len(full_train))
    oof_preds[:] = np.nan
    # save use feature
    utils.save_list(os.path.join(log_dir, "train_feats.csv"),
                    full_train.columns.tolist())
    # get validation split index list
    split_idx_list = validation.get_split_list()[train_type]
    # fold training
    logging.info("------ {} fold learning --------".format(
        len(split_idx_list)))
    for fold_, (trn_, val_) in enumerate(split_idx_list):
        logging.info("-----------------")
        logging.info("{}th Fold Start".format(fold_ + 1))
        if 0:  #target_mode == "target":
            train_x, train_y = index_augment(full_train.loc[trn_], y.loc[trn_])
        else:
            train_x = full_train.loc[trn_]
            train_y = y.loc[trn_]
        trn_data = lgb.Dataset(train_x,
                               label=train_y,
                               categorical_feature=cat_cols)
        val_data = lgb.Dataset(full_train.loc[val_],
                               label=y.loc[val_],
                               categorical_feature=cat_cols)
        logging.info("train shape:({0},{1})".format(full_train.shape[1],
                                                    len(trn_)))

        clf = lgb.train(
            lgb_param,
            trn_data,
            n_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=n_verbose,
            early_stopping_rounds=100,
            #feval = eval_f1,
            callbacks=[log_callback],
        )
        # oof prediction
        #mini_idx_srs = full_train.reset_index().iloc[full_train.loc[val_]]
        oof_preds[val_] = \
                    clf.predict(full_train.loc[val_], num_iteration=clf.best_iteration)
        # permutation importance
        if permutation:
            fold_cv_score = competition_metric(train_df_for_score.loc[val_],
                                               oof_preds[val_])
            feature_util.permutation_importance(
                model=clf,
                val_x=full_train.loc[val_],
                val_y=train_df_for_score.loc[val_],
                cv_score=fold_cv_score,
                cols=full_train.columns,
                metric_func=competition_metric,
                suffix=train_type,
                output=True,
                out_dir=log_dir)
        # feature importance
        imp_df = pd.DataFrame()
        imp_df['feature'] = full_train.columns
        imp_df['gain'] = clf.feature_importance(importance_type="gain")
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0, sort=False)
        clfs.append(clf)
        if permutation:
            # tentatively fill oof_preds at the train indices with 0 as well
            oof_preds[trn_] = 0
            break
        if single_fold:
            break
    # oof_preds is sized for the full train set before the per-type split, so drop the NaNs to get back to the original size
    oof_preds = oof_preds[~np.isnan(oof_preds)]
    # total CV value
    if permutation:
        cv_score = 0
        logging.info("This is permutation importance so cv score skip!!")
    elif target_mode == "target":
        cv_score = competition_metric(train_df_for_score, oof_preds)
        logging.info("CV Score {}".format(cv_score))
    else:
        cv_score = mean_absolute_error(y.values, oof_preds)
        logging.info("CV Score {}".format(cv_score))
    # CV prediction target = the train period of TimeValid
    return clfs, importances, cv_score, oof_preds
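
competition_metric is also project-specific. The sketch below assumes it implements the competition's group log-MAE over the type column, using the column names seen in train_df_for_score above; treat it as an illustration, not the author's exact code.

import numpy as np

def competition_metric(df, preds):
    # Group log-MAE: per coupling type, take the log of the MAE between
    # scalar_coupling_constant and the prediction, then average over types.
    df = df.copy()
    df["pred"] = preds
    per_type_mae = df.groupby("type").apply(
        lambda g: np.abs(g["scalar_coupling_constant"] - g["pred"]).mean())
    return float(np.log(per_type_mae.clip(lower=1e-9)).mean())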
Example #3
def learn():
    with utils.timer("load train and test data"):
        train_pair = compe_data.read_train()
        test_pair = compe_data.read_test()
        st_df = compe_data.read_structures()
    with utils.timer("make atomic data frame"):
        train_df, test_df = make_atomic_data(train_pair, test_pair, st_df)
    # set validation fold
    validation.set_fold(
            fold_type = "GroupKFold",
            fold_num = settings.fold_num,
            random_state = settings.fold_seed,
            shuffle_flg = True,
    )
    # align the validation splits with the index of the original target
    validation.make_splits(
            train_pair, 
            train_pair[features.TARGET_COL],
            group_col=features.GROUP_COL
    )
    validation.make_atom_splits(train_df, train_pair)
    # make feature
    with utils.timer("feature make"):
        train_df, test_df, cat_cols = atom_features.make(
            train_df,
            test_df,
            train_pair,
            test_pair
        )
    pd.set_option("max_columns",100)
    ## for debug
    train_df.head(10).to_csv(
            os.path.join(log_dir,"check_atom_train_df.csv"),index=False)
    # predict only train feature
    # training
    
    with utils.timer("atomic_meta feature training"):
        meta_train_df = pd.DataFrame()
        meta_test_df = pd.DataFrame()
        meta_features = compe_data.read_mulliken_charges()  # provisional; may change
        for meta_col in ["mulliken_charge"]:
            meta_feat = meta_features[meta_col]
            logging.info("-------- learning {} ---------".format(meta_col))
            use_feats = [x for x in train_df.columns 
                            if not x in atom_features.EXCEPT_FEATURES]
            train_y = meta_feat
            # training per type
            test_preds = np.zeros(len(test_df))
            clfs, importances, val_score, oof_preds = \
                atom_training.train(
                    train_df,
                    train_y, 
                    use_feats = use_feats,
                    permutation=False, 
                    cat_cols = cat_cols,
                    log_dir = log_dir,
                    target_mode = "meta"
            )
            utils.save_importances(importances_=importances, 
                    save_dir=log_dir, prefix="meta_col_")
            for clf in clfs:
                test_preds += clf.predict(test_df.loc[:, use_feats], 
                                    num_iteration=clf.best_iteration) / len(clfs)
            meta_train_df[meta_col] = oof_preds
            meta_test_df[meta_col] = test_preds
            meta_train_df["molecule_name"] = train_df["molecule_name"]
            meta_train_df["atom_index"] = train_df["atom_index"]
            meta_test_df["molecule_name"] = test_df["molecule_name"]
            meta_test_df["atom_index"] = test_df["atom_index"]
        # merge train and test_df
        meta_train_df.to_pickle("../pickle/atomic_meta_train.pkl")
        meta_test_df.to_pickle("../pickle/atomic_meta_test.pkl")
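
The saved pickles are keyed by (molecule_name, atom_index), while the pair-level frames key each pair's atoms as atom_index_0/atom_index_1. A hypothetical sketch of how the atomic meta feature could be merged back onto a pair-level frame; the helper name and column suffixes are assumptions.

import pandas as pd

def merge_atomic_meta(pair_df, atomic_meta_df, meta_col="mulliken_charge"):
    # Attach the atom-level meta prediction to both atoms of every pair.
    out = pair_df
    for i in (0, 1):
        out = pd.merge(
            out,
            atomic_meta_df.rename(columns={"atom_index": f"atom_index_{i}",
                                           meta_col: f"{meta_col}_{i}"}),
            on=["molecule_name", f"atom_index_{i}"],
            how="left")
    return out

# e.g.
# meta_train = pd.read_pickle("../pickle/atomic_meta_train.pkl")
# train_pair = merge_atomic_meta(train_pair, meta_train)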
Example #4
def execute():
    with utils.timer("load train and test data"):
        train_df = compe_data.read_train()
        test_df = compe_data.read_test()
        st_df = compe_data.read_structures()
    # set validation fold
    validation.set_fold(
        fold_type="GroupKFold",
        fold_num=2,  #settings.fold_num,
        random_state=2222,
        shuffle_flg=True,
    )
    validation.make_splits(train_df,
                           train_df[features.TARGET_COL],
                           group_col=features.GROUP_COL)
    # make feature
    with utils.timer("feature make"):
        train_df, test_df, cat_cols = features.make(train_df, test_df, st_df)
    del test_df
    gc.collect()
    # predict only train feature
    # training
    #with utils.timer("reduce memory"):
    #    train_df = utils.reduce_memory(train_df)
    #    #test_df = utils.reduce_memory(test_df)
    flg_use_scc_meta = True  #False
    if flg_use_scc_meta:
        with utils.timer("meta feature training"):
            flg_meta_learn = True
            feature_selection = True
            if flg_meta_learn:
                meta_train_df = pd.DataFrame()
                meta_features = compe_data.read_scalar_coupling_contributions()
                for meta_col in ["fc", "sd", "pso", "dso"]:
                    meta_feat = meta_features[meta_col]
                    logging.info("******************************")
                    logging.info(
                        "******** learning {} *********".format(meta_col))
                    logging.info("******************************")
                    use_feats = [
                        x for x in train_df.columns
                        if not x in features.EXCEPT_FEATURES
                    ]
                    train_y = meta_feat  #train_df[features.TARGET_COL]
                    # training per type
                    types = train_df["type"].unique()
                    oof = np.zeros(len(train_df))
                    types_clfs = {}
                    types_scores = {}
                    for type_name in types:
                        logging.info("----- training type == {} -----".format(
                            type_name))
                        type_idx = train_df.type == type_name
                        if feature_selection:
                            type_use_feats = feature_util.feature_select(
                                use_feats,
                                importance_df=pd.read_csv(
                                    f"./importance/permu_importance{type_name}.csv",
                                    names=["feature", "importance"]),
                                threshold=-0.05,
                                reverse=True,
                            )
                        else:
                            type_use_feats = use_feats.copy()
                        select_feats = features.select_type_feats(
                            use_feats, type_name)
                        param_tuning(
                            train_df=train_df.loc[type_idx],
                            y=train_y.loc[type_idx],
                            use_feats=select_feats,
                            objective_metric=
                            mean_absolute_error,  #competition_metric,
                            train_type=type_name,
                            type_name=meta_col + "_" + type_name,
                            cat_cols=cat_cols,
                        )
            else:
                meta_train_df = pd.read_pickle("../pickle/meta_train.pkl")
            #train_df = utils.fast_concat(train_df, meta_train_df)
            #test_df = utils.fast_concat(test_df, meta_test_df)
    """
Example #5
def learn():
    with utils.timer("load train and test data"):
        train_df = compe_data.read_train()
        test_df = compe_data.read_test()
        st_df = compe_data.read_structures()
    # set validation fold
    validation.set_fold(
        fold_type="GroupKFold",
        fold_num=settings.fold_num,
        random_state=settings.fold_seed,
        shuffle_flg=True,
    )
    validation.make_splits(train_df,
                           train_df[features.TARGET_COL],
                           group_col=features.GROUP_COL)
    # make feature
    with utils.timer("feature make"):
        train_df, test_df, cat_cols = features.make(train_df, test_df, st_df)
    ## for debug
    train_df.head(30).to_csv(os.path.join(log_dir, "check_train_df.csv"),
                             index=False)

    # predict only train feature
    # training
    flg_use_scc_meta = True  #False
    if flg_use_scc_meta:
        with utils.timer("meta feature training"):
            flg_meta_learn = False  #True#True
            feature_selection = True
            if flg_meta_learn:
                meta_train_df = pd.DataFrame()
                meta_test_df = pd.DataFrame()
                meta_features = compe_data.read_scalar_coupling_contributions()
                for meta_col in ["fc", "sd", "pso", "dso"]:
                    meta_feat = meta_features[meta_col]
                    logging.info("******************************")
                    logging.info(
                        "******** learning {} *********".format(meta_col))
                    logging.info("******************************")
                    use_feats = [
                        x for x in train_df.columns
                        if not x in features.EXCEPT_FEATURES
                    ]
                    train_y = meta_feat  #train_df[features.TARGET_COL]
                    # training per type
                    types = train_df["type"].unique()
                    oof = np.zeros(len(train_df))
                    test_preds = np.zeros(len(test_df))
                    types_clfs = {}
                    types_scores = {}
                    for type_name in types:
                        logging.info("----- training type == {} -----".format(
                            type_name))
                        type_idx = train_df.type == type_name
                        test_type_idx = test_df.type == type_name
                        if feature_selection:
                            type_use_feats = feature_util.feature_select(
                                use_feats,
                                importance_df=pd.read_csv(
                                    f"./importance/permu_importance{type_name}.csv",
                                    names=["feature", "importance"]),
                                threshold=-0.05,
                                reverse=True,
                            )
                        else:
                            type_use_feats = use_feats.copy()
                        select_feats = features.select_type_feats(
                            use_feats, type_name)
                        clfs, importances, val_score, oof_preds = \
                            training.train(
                                train_df.loc[type_idx],
                                train_y.loc[type_idx],
                                use_feats = select_feats,
                                train_type = type_name,
                                permutation=False,
                                cat_cols = cat_cols,
                                log_dir = log_dir,
                                target_mode = "meta",
                                meta_col = meta_col
                            )
                        types_clfs[type_name] = clfs
                        types_scores[type_name] = val_score
                        utils.save_importances(
                            importances_=importances,
                            save_dir=log_dir,
                            prefix=f"{meta_col}_{type_name}")
                        oof[type_idx] = oof_preds
                        for clf in clfs:
                            test_preds[test_type_idx] += \
                                clf.predict(test_df.loc[test_type_idx, select_feats],
                                                num_iteration=clf.best_iteration) / len(clfs)
                    #total_cv = training.metric(train_df, oof)
                    #print("TotalCV = {:.5f}".format(total_cv))
                    meta_train_df[meta_col] = oof
                    meta_test_df[meta_col] = test_preds
                    logging.info(
                        "---------- meta {} types val score ----------".format(
                            meta_col))
                    for type_name, score in types_scores.items():
                        logging.info("{0} : {1}".format(type_name, score))
                # merge train and test_df
                meta_train_df.to_pickle("../pickle/meta_train.pkl")
                meta_test_df.to_pickle("../pickle/meta_test.pkl")
            else:
                meta_train_df = pd.read_pickle("../pickle/gnn_meta_train.pkl")
                meta_test_df = pd.read_pickle("../pickle/gnn_meta_test.pkl")
            train_df = utils.fast_concat(train_df, meta_train_df)
            test_df = utils.fast_concat(test_df, meta_test_df)
    #with utils.timer("reduce memory"):
    #    train_df = utils.reduce_memory(train_df)
    #    test_df = utils.reduce_memory(test_df)
    with utils.timer("training"):
        feature_selection = True
        use_feats = [
            x for x in train_df.columns if not x in features.EXCEPT_FEATURES
        ]
        train_y = train_df[features.TARGET_COL]
        # training per type
        types = train_df["type"].unique()
        #types = train_df["new_type"].unique()  # hinokki type
        oof = np.zeros(len(train_df))
        types_clfs = {}
        types_scores = {}
        use_feats_dict = {}
        for type_name in types:
            logging.info("----- training type == {} -----".format(type_name))
            type_idx = train_df.type == type_name
            if feature_selection:
                type_use_feats = feature_util.feature_select(
                    use_feats,
                    importance_df=pd.read_csv(
                        f"./importance/permu_importance{type_name}.csv",
                        names=["feature", "importance"]),
                    #importance_path = f"./importance/{type_name}feature_importance_summary.csv",
                    threshold=-0.05,
                    reverse=True,
                )
            else:
                type_use_feats = use_feats.copy()
            select_use_feats = features.select_type_feats(
                type_use_feats, type_name)
            use_feats_dict[type_name] = select_use_feats
            clfs, importances, val_score, oof_preds = \
                training.train(
                    train_df.loc[type_idx],
                    train_y.loc[type_idx],
                    use_feats = select_use_feats,
                    train_type = type_name,
                    #permutation=True,
                    permutation=False,
                    cat_cols = cat_cols,
                    log_dir = log_dir,
            )
            types_clfs[type_name] = clfs
            types_scores[type_name] = val_score
            utils.save_importances(importances_=importances,
                                   save_dir=log_dir,
                                   prefix=type_name)
            oof[type_idx] = oof_preds
            oof_df = pd.DataFrame({
                "id": train_df.loc[type_idx, "id"],
                "oof_preds": oof_preds
            })
            oof_df.to_csv(os.path.join(log_dir,
                                       "oof_{}.csv".format(type_name)),
                          index=False)
        for type_name, score in types_scores.items():
            logging.info("{0} : {1}".format(type_name, score))
        total_cv = competition_metric(train_df, oof)
        logging.info("TotalCV = {:.5f}".format(total_cv))
    del train_df
    gc.collect()
    # prediction
    with utils.timer("prediction"):
        prediction.predict(types_clfs,
                           test_df,
                           use_feats_dict=use_feats_dict,
                           val_score=total_cv,
                           log_dir=log_dir)
    return total_cv
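
prediction.predict is not included in this listing. The sketch below shows what such a per-type prediction step might look like; the function body, submission file name, and fold-averaging scheme are assumptions kept consistent with the per-type test prediction in the meta-feature loop above.

import os
import numpy as np
import pandas as pd

def predict(types_clfs, test_df, use_feats_dict, val_score, log_dir):
    # Average each type's fold models over the matching test rows,
    # then write a submission file named after the CV score.
    preds = np.zeros(len(test_df))
    for type_name, clfs in types_clfs.items():
        mask = (test_df["type"] == type_name).values
        feats = use_feats_dict[type_name]
        for clf in clfs:
            preds[mask] += clf.predict(
                test_df.loc[mask, feats],
                num_iteration=clf.best_iteration) / len(clfs)
    sub = pd.DataFrame({"id": test_df["id"],
                        "scalar_coupling_constant": preds})
    sub.to_csv(os.path.join(log_dir,
                            "submission_{:.5f}.csv".format(val_score)),
               index=False)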