Example #1
def load_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if to_rank:
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
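These examples all go through ayniy.utils.Data for serialization. A minimal sketch of such a helper, assuming it is a thin joblib wrapper (the real implementation may differ):

import os

import joblib


class Data:
    # Assumed minimal I/O helper: joblib-based dump/load with directory creation.
    @classmethod
    def dump(cls, value, path: str) -> None:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path: str):
        return joblib.load(path)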
Example #2
    def run_predict_cv(self) -> None:
        """クロスバリデーションで学習した各foldのモデルの平均により、テストデータの予測を行う

        あらかじめrun_train_cvを実行しておく必要がある
        """

        logger.info(f'{self.run_name} - start prediction cv')
        X_test = self.X_test
        preds = []

        show_feature_importance = 'LGBM' in str(self.model_cls)
        if show_feature_importance:
            feature_importances = pd.DataFrame()

        # Predict with each fold's model
        for i_fold in range(self.cv.n_splits):
            logger.info(f'{self.run_name} - start prediction fold:{i_fold}')
            model = self.build_model(i_fold)
            model.load_model()
            pred = model.predict(X_test)
            preds.append(pred)
            logger.info(f'{self.run_name} - end prediction fold:{i_fold}')
            if show_feature_importance:
                feature_importances = pd.concat(
                    [feature_importances,
                     model.feature_importance(X_test)],
                    axis=0)

        # Average the fold predictions
        pred_avg = np.mean(preds, axis=0)

        # Save the predictions
        Data.dump(pred_avg, f'../output/pred/{self.run_name}-test.pkl')

        logger.info(f'{self.run_name} - end prediction cv')

        # Feature importance
        if show_feature_importance:
            aggs = feature_importances.groupby('Feature').mean().sort_values(
                by="importance", ascending=False)
            cols = aggs[:200].index
            pd.DataFrame(aggs.index).to_csv(
                f'../output/importance/{self.run_name}-fi.csv', index=False)

            best_features = feature_importances.loc[
                feature_importances.Feature.isin(cols)]
            plt.figure(figsize=(14, 26))
            sns.barplot(x="importance",
                        y="Feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LightGBM Features (averaged over folds)')
            plt.tight_layout()
            plt.savefig(f'../output/importance/{self.run_name}-fi.png')
            plt.show()

            # mlflow
            mlflow.start_run(run_id=self.run_id)
            log_artifact(f'../output/importance/{self.run_name}-fi.png')
            mlflow.end_run()
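For context, run_predict_cv assumes run_train_cv has already saved the fold models. A hypothetical end-to-end usage sketch (the config path and CV splitter are illustrative, not from the source):

import yaml
from sklearn.model_selection import KFold

from ayniy.model.runner import Runner

# Hypothetical config file in the style of the later examples.
with open('configs/run001.yml', 'r') as f:
    configs = yaml.safe_load(f)

cv = KFold(n_splits=5, shuffle=True, random_state=7)
runner = Runner(configs, cv)
runner.run_train_cv()     # saves fold models and ../output/pred/{run_name}-train.pkl
runner.run_predict_cv()   # averages fold models into ../output/pred/{run_name}-test.pkl
runner.submission()       # writes the submission CSV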
Example #3
 def load(self):
     X_train = Data.load(
         join(self.output_dir, f"X_train_{self.run_name}.pkl"))
     y_train = Data.load(
         join(self.output_dir, f"y_train_{self.run_name}.pkl"))
     X_test = Data.load(join(self.output_dir,
                             f"X_test_{self.run_name}.pkl"))
     return X_train, X_test, y_train
Example #4
def make_predictions(data: list, weights: list):
    pred = 0
    for i, d in enumerate(data):
        if i < len(weights):
            pred += d[1] * weights[i]
        else:
            pred += d[1] * (1 - sum(weights))
    Data.dump(pred, f'../output/pred/{run_name}-test.pkl')
    return pred
Example #5
def load_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run015',):
        oof = oof.reshape(-1, )
        pred = pred.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
Example #6
def f(x):
    pred = 0
    for i, d in enumerate(data):
        if i < len(x):
            pred += d[0] * x[i]
        else:
            pred += d[0] * (1 - sum(x))
    score = np.sqrt(mean_squared_error(y_train, pred))
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
Example #7
def f(x):
    pred = 0
    for i, d in enumerate(data):
        if i < len(x):
            pred += d[0] * x[i]
        else:
            pred += d[0] * (1 - sum(x))
    score = -1 * roc_auc_score(y_train, pred)
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
Example #8
 def submission(self) -> None:
     pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
     sub = pd.read_csv(self.sample_submission)
     if self.evaluation_metric == "log_loss":
         sub[self.cols_definition["target_col"]] = np.argmax(pred, axis=1)
     else:
         oof = Data.load(f"../output/pred/{self.run_name}-train.pkl")
         oof = np.array([convert(v) for v in oof])
         pred = np.array([convert(v) for v in pred])
         sub[self.cols_definition["target_col"]] = pred
     sub[self.cols_definition["target_col"]] = sub[
         self.cols_definition["target_col"]].astype(float)
     sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
                index=False)
Example #9
def load_oof_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    if run_id in ('run091', 'run092', 'run097'):
        oof = oof.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
    return oof
Example #10
def load_pred_from_run_id(run_id: str, to_rank: bool = False):
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run091', 'run092', 'run097'):
        pred = pred.reshape(-1, )
    if to_rank:
        pred = rankdata(pred) / len(pred)
    return pred
Example #11
 def submission(self):
     pred = Data.load(f'../output/pred/{self.run_name}-test.pkl')
     sub = pd.read_csv(self.sample_submission)
     if self.advanced and 'predict_exp' in self.advanced:
         sub[self.cols_definition['target_col']] = np.expm1(pred)
     else:
         sub[self.cols_definition['target_col']] = pred
     sub.to_csv(f'../output/submissions/submission_{self.run_name}.csv', index=False)
Example #12
 def submission(self) -> None:
     pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
     sub = pd.read_csv(self.sample_submission)
     if self.advanced and "predict_exp" in self.advanced:
         sub[self.cols_definition["target_col"]] = np.exp(pred)
     else:
         sub[self.cols_definition["target_col"]] = pred
     sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
                index=False)
Example #13
    def run_train_cv(self) -> None:
        """クロスバリデーションでの学習・評価を行う

        学習・評価とともに、各foldのモデルの保存、スコアのログ出力についても行う
        """
        logger.info(f'{self.run_name} - start training cv')

        scores = []
        va_idxes = []
        preds = []

        # Train on each fold
        for i_fold in range(self.n_fold):
            # Train
            logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f'{self.run_name} fold {i_fold} - end training - score {score}'
            )

            # Save the model
            model.save_model()

            # Keep the results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Combine the results of each fold
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        logger.info(
            f'{self.run_name} - end training cv - score {np.mean(scores)}')

        # Save the predictions
        Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

        # Save the evaluation scores
        logger.result_scores(self.run_name, scores)
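The argsort reordering above is what turns the per-fold validation predictions back into a single out-of-fold array aligned with the original training rows. A toy illustration with made-up fold indices:

import numpy as np

# Validation indices and predictions collected from two hypothetical folds.
va_idxes = np.concatenate([np.array([3, 0, 4]), np.array([1, 2])])
preds = np.concatenate([np.array([0.9, 0.1, 0.7]), np.array([0.2, 0.5])])

# argsort of the row indices is the permutation that restores training order,
# so oof[i] is the prediction for original row i.
order = np.argsort(va_idxes)
oof = preds[order]
print(oof)  # [0.1 0.2 0.5 0.9 0.7]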
Example #14
    def __init__(self, configs: Dict, cv) -> None:  # type: ignore
        self.exp_name = configs["exp_name"]
        self.run_name = configs["run_name"]
        self.run_id = None
        self.fe_name = configs["fe_name"]
        self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
        self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
        self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
        self.evaluation_metric = configs["evaluation_metric"]
        self.params = configs["params"]
        self.cols_definition = configs["cols_definition"]
        self.cv = cv
        self.sample_submission = configs["data"]["sample_submission"]
        self.description = configs["description"]
        self.advanced = configs["advanced"] if "advanced" in configs else None

        if configs["model_name"] in models_map.keys():
            self.model_cls = models_map[configs["model_name"]]
        else:
            raise ValueError
Example #15
    def __init__(self, configs: dict, cv):
        self.exp_name = configs['exp_name']
        self.run_name = configs['run_name']
        self.run_id = None
        self.fe_name = configs['fe_name']
        self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
        self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
        self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
        self.evaluation_metric = configs['evaluation_metric']
        self.params = configs['params']
        self.cols_definition = configs['cols_definition']
        self.cv = cv
        self.sample_submission = configs['data']['sample_submission']
        self.description = configs['description']
        self.advanced = configs['advanced'] if 'advanced' in configs else None

        if configs['model_name'] in models_map.keys():
            self.model_cls = models_map[configs['model_name']]
        else:
            raise ValueError
Example #16
def save_as_pickle(
        train: pd.DataFrame,
        test: pd.DataFrame,
        target_col: str,
        exp_id: str,
        output_dir: str = '../input') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Save X_train, X_test and y_train as pickel format

    Args:
        train (pd.DataFrame): train
        test (pd.DataFrame): test
        target_col (str): target column
        exp_id (str): experiment id
        output_dir (str, optional): output directory. Defaults to '../input'.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: train, test
    """
    X_train = train.drop(target_col, axis=1)
    y_train = train[target_col]
    if target_col in test.columns:
        X_test = test.drop(target_col, axis=1)
    else:
        X_test = test

    Data.dump(X_train, join(output_dir, f"X_train_{exp_id}.pkl"))
    Data.dump(y_train, join(output_dir, f"y_train_{exp_id}.pkl"))
    Data.dump(X_test, join(output_dir, f"X_test_{exp_id}.pkl"))
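A short usage sketch for save_as_pickle, assuming plain CSV inputs (file names and the target column are illustrative):

import pandas as pd

# Hypothetical train/test files sharing a 'target' column.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

save_as_pickle(train, test, target_col='target', exp_id='fe000', output_dir='../input')
# -> ../input/X_train_fe000.pkl, ../input/y_train_fe000.pkl, ../input/X_test_fe000.pkl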
Example #17
def save_as_pickle(train: pd.DataFrame, test: pd.DataFrame,
                   col_definition: dict, option: dict):
    """
    col_definition: target_col
    option: exp_id
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test

    Data.dump(X_train, join('../input', f"X_train{option['exp_id']}.pkl"))
    Data.dump(y_train, join('../input', 'y_train.pkl'))
    Data.dump(X_test, join('../input', f"X_test{option['exp_id']}.pkl"))
Example #18
def save_as_pickle(train: pd.DataFrame, test: pd.DataFrame,
                   col_definition: dict, option: dict):
    """
    col_definition: target_col
    option: output_dir, exp_id
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test

    Data.dump(X_train,
              join(option['output_dir'], f"X_train_{option['exp_id']}.pkl"))
    Data.dump(y_train,
              join(option['output_dir'], f"y_train_{option['exp_id']}.pkl"))
    Data.dump(X_test,
              join(option['output_dir'], f"X_test_{option['exp_id']}.pkl"))
Example #19
import japanize_matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

from ayniy.utils import Data

if __name__ == '__main__':
    oof = Data.load('../output/pred/run003-train.pkl')

    train_player = pd.read_csv('../input/train_player.csv')
    train_pitch = pd.read_csv('../input/train_pitch.csv')
    train_pitch = train_pitch[train_pitch['試合種別詳細'] != 'パ・リーグ公式戦'].reset_index(
        drop=True)

    # Join pitcher information
    train = pd.merge(train_pitch,
                     train_player,
                     left_on=['年度', '投手ID'],
                     right_on=['年度', '選手ID'],
                     how='inner')

    # Join batter information
    train = pd.merge(train,
                     train_player,
                     left_on=['年度', '打者ID'],
                     right_on=['年度', '選手ID'],
                     how='inner',
                     suffixes=('_p', '_b'))

    X_train, _, _, _ = train_test_split(train.drop('試合種別詳細', axis=1),
Example #20
        else:
            pred += d[1] * (1 - sum(weights))
    Data.dump(pred, f'../output/pred/{run_name}-test.pkl')
    return pred


def make_submission(pred, run_name: str):
    sub = pd.read_csv('../input/solafune-light/UploadFileTemplate.csv')
    sub['LandPrice'] = np.expm1(pred)
    sub.to_csv(f'../output/submissions/submission_{run_name}.csv', index=False)


run_ids = [
    'run004',
    'run005',
]
run_name = 'weight001'

if __name__ == '__main__':
    y_train = Data.load('../input/pickle/y_train_fe000.pkl')
    data = [load_from_run_id(ri, to_rank=False) for ri in run_ids]

    for d in data:
        print(np.sqrt(mean_squared_error(y_train, d[0])))

    init_state = [round(1 / len(data), 3) for _ in range(len(data) - 1)]
    result = minimize(f, init_state, method='Nelder-Mead')
    print('optimized CV: ', result['fun'])
    print('w: ', result['x'])
    make_submission(make_predictions(data, result['x']), run_name)
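The script above searches blend weights with Nelder-Mead: only n-1 weights are free and the last one is 1 - sum(x), so the blend always sums to one. A self-contained sketch of the same idea on toy data (names and data are illustrative):

import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error

# Toy OOF predictions from three hypothetical models plus the true target.
rng = np.random.RandomState(0)
y_true = rng.rand(100)
oofs = [y_true + rng.normal(0, s, 100) for s in (0.05, 0.10, 0.20)]


def objective(x):
    # n-1 free weights; the last weight is fixed to 1 - sum(x).
    weights = np.append(x, 1 - np.sum(x))
    blend = sum(w * o for w, o in zip(weights, oofs))
    return np.sqrt(mean_squared_error(y_true, blend))


init_state = [round(1 / len(oofs), 3)] * (len(oofs) - 1)
result = minimize(objective, init_state, method='Nelder-Mead')
print('optimized CV:', result['fun'])
print('weights:', np.append(result['x'], 1 - np.sum(result['x'])))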
Example #21
    def train_fold(self, i_fold: int) -> Tuple[Any, Any, Any, Any]:
        """クロスバリデーションでのfoldを指定して学習・評価を行う

        他のメソッドから呼び出すほか、単体でも確認やパラメータ調整に用いる

        :param i_fold: foldの番号(すべてのときには'all'とする)
        :return: (モデルのインスタンス、レコードのインデックス、予測値、評価によるスコア)のタプル
        """
        # Load the training data
        X_train = self.X_train
        y_train = self.y_train

        # Set up residuals
        if self.advanced and "ResRunner" in self.advanced:
            oof = Data.load(self.advanced["ResRunner"]["oof"])
            X_train["res"] = (y_train - oof).abs()

        # Set the training and validation data
        tr_idx, va_idx = self.load_index_fold(i_fold)
        X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
        X_val, y_val = X_train.iloc[va_idx], y_train.iloc[va_idx]

        # Downsample by residual
        if self.advanced and "ResRunner" in self.advanced:
            res_mask = (X_tr["res"] <
                        self.advanced["ResRunner"]["res_threshold"]).values
            X_tr = X_tr.loc[res_mask]
            y_tr = y_tr.loc[res_mask]
            print(X_tr.shape)
            X_tr.drop("res", axis=1, inplace=True)
            X_val.drop("res", axis=1, inplace=True)

        # Pseudo labeling
        if self.advanced and "PseudoRunner" in self.advanced:
            y_test_pred = Data.load(
                self.advanced["PseudoRunner"]["y_test_pred"])
            if "pl_threshold" in self.advanced["PseudoRunner"]:
                X_add = self.X_test.loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold"])
                    | (y_test_pred > 1 -
                       self.advanced["PseudoRunner"]["pl_threshold"])]
                y_add = pd.DataFrame(y_test_pred).loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold"])
                    | (y_test_pred > 1 -
                       self.advanced["PseudoRunner"]["pl_threshold"])]
                y_add = pd.DataFrame(
                    ([1 if ya > 0.5 else 0 for ya in y_add[0]]))
            elif "pl_threshold_neg" in self.advanced["PseudoRunner"]:
                X_add = self.X_test.loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold_neg"])
                    | (y_test_pred > self.
                       advanced["PseudoRunner"]["pl_threshold_pos"])]
                y_add = pd.DataFrame(y_test_pred).loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold_neg"])
                    | (y_test_pred > self.
                       advanced["PseudoRunner"]["pl_threshold_pos"])]
                y_add = pd.DataFrame(
                    ([1 if ya > 0.5 else 0 for ya in y_add[0]]))
            else:
                X_add = self.X_test
                y_add = pd.DataFrame(y_test_pred)
            print(f"added X_test: {len(X_add)}")
            X_tr = pd.concat([X_tr, X_add])
            y_tr = pd.concat([y_tr, y_add])

        # Train the model
        model = self.build_model(i_fold)
        model.train(X_tr, y_tr, X_val, y_val, self.X_test)  # type: ignore

        # Predict and evaluate on the validation data
        pred_val = model.predict(X_val)

        if self.evaluation_metric == "log_loss":
            score = log_loss(y_val, pred_val, eps=1e-15, normalize=True)
        elif self.evaluation_metric == "mean_absolute_error":
            score = mean_absolute_error(y_val, pred_val)
        elif self.evaluation_metric == "rmse":
            score = np.sqrt(mean_squared_error(y_val, pred_val))
        elif self.evaluation_metric == "auc":
            score = roc_auc_score(y_val, pred_val)
        elif self.evaluation_metric == "prauc":
            score = average_precision_score(y_val, pred_val)

        # Return the model, indices, predictions and score
        return model, va_idx, pred_val, score
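The PseudoRunner branch in train_fold appends confidently predicted test rows to the training fold. A minimal standalone sketch of that thresholding, with made-up data:

import numpy as np
import pandas as pd

# Hypothetical test features and a previous run's predicted probabilities.
X_test = pd.DataFrame({'f0': np.arange(6)})
y_test_pred = np.array([0.02, 0.55, 0.97, 0.40, 0.99, 0.01])
pl_threshold = 0.05

# Keep only rows the previous model was confident about on either side,
# then hard-label them 0/1 before appending to X_tr / y_tr.
mask = (y_test_pred < pl_threshold) | (y_test_pred > 1 - pl_threshold)
X_add = X_test.loc[mask]
y_add = pd.Series((y_test_pred[mask] > 0.5).astype(int), index=X_add.index)
print(X_add.index.tolist(), y_add.tolist())  # [0, 2, 4, 5] [0, 1, 1, 0]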
Example #22
    def run_train_cv(self) -> None:
        """クロスバリデーションでの学習・評価を行う

        学習・評価とともに、各foldのモデルの保存、スコアのログ出力についても行う
        """
        # mlflow
        mlflow.set_experiment(self.exp_name)
        mlflow.start_run(run_name=self.run_name)
        logger.info(f"{self.run_name} - start training cv")

        scores = []
        va_idxes = []
        preds = []

        # Adversarial validation
        if self.advanced and "adversarial_validation" in self.advanced:
            X_train = self.X_train
            X_test = self.X_test
            X_train["target"] = 0
            X_test["target"] = 1
            X_train = pd.concat([X_train, X_test],
                                sort=False).reset_index(drop=True)
            y_train = X_train["target"]
            X_train.drop("target", axis=1, inplace=True)
            X_test.drop("target", axis=1, inplace=True)
            self.X_train = X_train
            self.y_train = y_train

        # Train on each fold
        for i_fold in range(self.cv.n_splits):
            # Train
            logger.info(f"{self.run_name} fold {i_fold} - start training")
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f"{self.run_name} fold {i_fold} - end training - score {score}"
            )

            # Save the model
            model.save_model()

            # Keep the results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Combine the results of each fold
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        if self.evaluation_metric == "log_loss":
            cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
        elif self.evaluation_metric == "mean_absolute_error":
            cv_score = mean_absolute_error(self.y_train, preds)
        elif self.evaluation_metric == "rmse":
            cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
        elif self.evaluation_metric == "auc":
            cv_score = roc_auc_score(self.y_train, preds)
        elif self.evaluation_metric == "prauc":
            cv_score = average_precision_score(self.y_train, preds)

        logger.info(f"{self.run_name} - end training cv - score {cv_score}")

        # Save the predictions
        Data.dump(preds, f"../output/pred/{self.run_name}-train.pkl")

        # mlflow
        self.run_id = mlflow.active_run().info.run_id
        log_param("model_name", self.model_cls.__class__.__name__)
        log_param("fe_name", self.fe_name)
        log_param("train_params", self.params)
        log_param("cv_strategy", str(self.cv))
        log_param("evaluation_metric", self.evaluation_metric)
        log_metric("cv_score", cv_score)
        log_param(
            "fold_scores",
            dict(
                zip([f"fold_{i}" for i in range(len(scores))],
                    [round(s, 4) for s in scores])),
        )
        log_param("cols_definition", self.cols_definition)
        log_param("description", self.description)
        mlflow.end_run()
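The adversarial_validation branch above relabels train rows 0 and test rows 1, so the usual CV loop ends up measuring how separable the two sets are; a high AUC signals train/test distribution shift. A minimal sketch of the idea outside the Runner, on synthetic data:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Synthetic train/test frames with a deliberately shifted feature.
rng = np.random.RandomState(0)
X_train = pd.DataFrame({'f0': rng.normal(0.0, 1.0, 500)})
X_test = pd.DataFrame({'f0': rng.normal(0.5, 1.0, 500)})

X_adv = pd.concat([X_train, X_test], sort=False).reset_index(drop=True)
y_adv = np.r_[np.zeros(len(X_train)), np.ones(len(X_test))]

# AUC well above 0.5 means the classifier can tell train from test (covariate shift).
auc = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=0),
                      X_adv, y_adv, cv=5, scoring='roc_auc').mean()
print(f'adversarial AUC: {auc:.3f}')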
Example #23
            lbl = OrdinalEncoder(dtype='int')
            train[col] = lbl.fit_transform(train[col].astype('str').fillna('-1').values.reshape(-1, 1))
            test[col] = lbl.transform(test[col].astype('str').fillna('-1').values.reshape(-1, 1))
        temp = pd.concat([train[[col]], test[[col]]], axis=0)
        temp_mapping = temp.groupby(col).size() / len(temp)
        temp['enc'] = temp[col].map(temp_mapping)
        temp['enc'] = stats.rankdata(temp['enc'])
        temp = temp.reset_index(drop=True)
        train[f'rank_frqenc_{col}'] = temp[['enc']].values[:train.shape[0]]
        test[f'rank_frqenc_{col}'] = temp[['enc']].values[train.shape[0]:]
        test[col] = test[col].astype('category')
        train[col] = train[col].astype('category')

    drop_cols = list(set(drop_cols))
    print(len(drop_cols))
    train = train.drop(drop_cols, axis=1)
    test = test.drop(drop_cols, axis=1)

    train = reduce_mem_usage(train)
    test = reduce_mem_usage(test)
    gc.collect()
    print(train.shape, test.shape)

    test['encounter_id'] = test_id
    test = test.sort_values('encounter_id').reset_index(drop=True)

    fe_name = 'fe_siavrez'
    Data.dump(train, f'../input/pickle/X_train_{fe_name}.pkl')
    # Data.dump(y, f'../input/pickle/y_train_{fe_name}.pkl')
    Data.dump(test.drop('encounter_id', axis=1), f'../input/pickle/X_test_{fe_name}.pkl')
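The feature-engineering snippet above builds a rank-of-frequency encoding: count each category's share over train+test, then rank those shares. The same transform on a toy column (column name is illustrative):

import pandas as pd
from scipy import stats

train = pd.DataFrame({'cat': ['a', 'a', 'b', 'c']})
test = pd.DataFrame({'cat': ['a', 'b', 'b']})

temp = pd.concat([train[['cat']], test[['cat']]], axis=0)
temp_mapping = temp.groupby('cat').size() / len(temp)   # category frequency
temp['enc'] = temp['cat'].map(temp_mapping)
temp['enc'] = stats.rankdata(temp['enc'])                # rank of the frequency (ties averaged)
temp = temp.reset_index(drop=True)

train['rank_frqenc_cat'] = temp[['enc']].values[:train.shape[0]]
test['rank_frqenc_cat'] = temp[['enc']].values[train.shape[0]:]
print(train)
print(test)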
Example #24
    def run_train_cv(self) -> None:
        """クロスバリデーションでの学習・評価を行う

        学習・評価とともに、各foldのモデルの保存、スコアのログ出力についても行う
        """
        # mlflow
        mlflow.set_experiment(self.exp_name)
        mlflow.start_run(run_name=self.run_name)
        logger.info(f'{self.run_name} - start training cv')

        scores = []
        va_idxes = []
        preds = []

        # Adversarial validation
        if self.advanced and 'adversarial_validation' in self.advanced:
            X_train = self.X_train
            X_test = self.X_test
            X_train['target'] = 0
            X_test['target'] = 1
            X_train = pd.concat([X_train, X_test],
                                sort=False).reset_index(drop=True)
            y_train = X_train['target']
            X_train.drop('target', axis=1, inplace=True)
            X_test.drop('target', axis=1, inplace=True)
            self.X_train = X_train
            self.y_train = y_train

        # Train on each fold
        for i_fold in range(self.cv.n_splits):
            # Train
            logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f'{self.run_name} fold {i_fold} - end training - score {score}'
            )

            # Save the model
            model.save_model()

            # Keep the results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Combine the results of each fold
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        if self.evaluation_metric == 'log_loss':
            cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
        elif self.evaluation_metric == 'mean_absolute_error':
            cv_score = mean_absolute_error(self.y_train, preds)
        elif self.evaluation_metric == 'rmse':
            cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
        elif self.evaluation_metric == 'auc':
            cv_score = roc_auc_score(self.y_train, preds)
        elif self.evaluation_metric == 'prauc':
            cv_score = average_precision_score(self.y_train, preds)

        logger.info(f'{self.run_name} - end training cv - score {cv_score}')

        # Save the predictions
        Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

        # mlflow
        self.run_id = mlflow.active_run().info.run_id
        log_param('model_name', str(self.model_cls).split('.')[-1][:-2])
        log_param('fe_name', self.fe_name)
        log_param('train_params', self.params)
        log_param('cv_strategy', str(self.cv))
        log_param('evaluation_metric', self.evaluation_metric)
        log_metric('cv_score', cv_score)
        log_param(
            'fold_scores',
            dict(
                zip([f'fold_{i}' for i in range(len(scores))],
                    [round(s, 4) for s in scores])))
        log_param('cols_definition', self.cols_definition)
        log_param('description', self.description)
        mlflow.end_run()
Example #25
 def load_model(self) -> None:
     model_path = os.path.join("../output/model", f"{self.run_fold_name}.model")
     self.model = Data.load(model_path)
Example #26
import numpy as np
import pandas as pd
import yaml

from ayniy.model.model_cat import ModelCatRegressor
from ayniy.model.runner import Runner
from ayniy.utils import Data


X_train = Data.load('../input/X_train_00.pkl')
y_train = Data.load('../input/y_train.pkl')
X_test = Data.load('../input/X_test_00.pkl')

X_train.drop(['fiscal_year'], axis=1, inplace=True)
X_test.drop(['fiscal_year'], axis=1, inplace=True)
y_train = np.log(np.sqrt(y_train))

f = open("configs/fe_00.yml", "r+")
configs = yaml.safe_load(f)
categorical_cols = configs['cols_definition']['categorical_col']

params_cat = {
    'depth': 6,
    'learning_rate': 0.1,
    'iterations': 10000,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 777,
    'allow_writing_files': False,
    'task_type': "CPU",
    'early_stopping_rounds': 50
Example #27
 def save_model(self) -> None:
     model_path = os.path.join("../output/model", f"{self.run_fold_name}.model")
     os.makedirs(os.path.dirname(model_path), exist_ok=True)
     Data.dump(self.model, model_path)
Example #28
                'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std']
            },
        ],
        nunique_dict=[
            {
                'key': ['Sex'],
                'var': ['SibSp'],
                'agg': ['nunique']
            },
            {
                'key': ['Sex'],
                'var': ['Cabin'],
                'agg': ['nunique']
            },
        ])

    print(X_train.shape, X_test.shape)
    unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
        X_train, X_test, escape_col=categorical_cols, threshold=0.99)
    X_train.drop(unique_cols + duplicated_cols + high_corr_cols,
                 axis=1,
                 inplace=True)
    X_test.drop(unique_cols + duplicated_cols + high_corr_cols,
                axis=1,
                inplace=True)

    print(X_train.shape, X_test.shape)
    Data.dump(X_train, output_dir + 'X_train_fe000.pkl')
    Data.dump(X_test, output_dir + 'X_test_fe000.pkl')
    Data.dump(y_train, output_dir + 'y_train_fe000.pkl')
Example #29
"""python select_features.py --n 100
"""
import argparse

import pandas as pd

from ayniy.utils import Data

parser = argparse.ArgumentParser()
parser.add_argument('--n')
args = parser.parse_args()

fe_id = 'fe005'
run_id = 'run046'
N_FEATURES = int(args.n)
fe_name = f'fe005_top{N_FEATURES}'

X_train = Data.load(f'../input/X_train_{fe_id}.pkl')
y_train = Data.load(f'../input/y_train_{fe_id}.pkl')
X_test = Data.load(f'../input/X_test_{fe_id}.pkl')

fi = pd.read_csv(
    f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES]

X_train = X_train[fi]
X_test = X_test[fi]

Data.dump(X_train, f'../input/X_train_{fe_name}.pkl')
Data.dump(y_train, f'../input/y_train_{fe_name}.pkl')
Data.dump(X_test, f'../input/X_test_{fe_name}.pkl')
Example #30
 def save_model(self) -> None:
     model_path = os.path.join("../output/model",
                               f"{self.run_fold_name}.model")
     os.makedirs(os.path.dirname(model_path), exist_ok=True)
     # Saved with pickle to avoid losing best_ntree_limit
     Data.dump(self.model, model_path)