Example #1
    def run_predict_cv(self) -> None:
        """クロスバリデーションで学習した各foldのモデルの平均により、テストデータの予測を行う

        あらかじめrun_train_cvを実行しておく必要がある
        """

        logger.info(f'{self.run_name} - start prediction cv')
        X_test = self.X_test
        preds = []

        show_feature_importance = 'LGBM' in str(self.model_cls)
        if show_feature_importance:
            feature_importances = pd.DataFrame()

        # Predict with each fold's saved model
        for i_fold in range(self.cv.n_splits):
            logger.info(f'{self.run_name} - start prediction fold:{i_fold}')
            model = self.build_model(i_fold)
            model.load_model()
            pred = model.predict(X_test)
            preds.append(pred)
            logger.info(f'{self.run_name} - end prediction fold:{i_fold}')
            if show_feature_importance:
                feature_importances = pd.concat(
                    [feature_importances,
                     model.feature_importance(X_test)],
                    axis=0)

        # Average the predictions across folds
        pred_avg = np.mean(preds, axis=0)

        # Save the averaged predictions
        Data.dump(pred_avg, f'../output/pred/{self.run_name}-test.pkl')

        logger.info(f'{self.run_name} - end prediction cv')

        # Feature importance
        if show_feature_importance:
            aggs = feature_importances.groupby('Feature').mean().sort_values(
                by="importance", ascending=False)
            cols = aggs[:200].index
            pd.DataFrame(aggs.index).to_csv(
                f'../output/importance/{self.run_name}-fi.csv', index=False)

            best_features = feature_importances.loc[
                feature_importances.Feature.isin(cols)]
            plt.figure(figsize=(14, 26))
            sns.barplot(x="importance",
                        y="Feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LightGBM Features (averaged over folds)')
            plt.tight_layout()
            plt.savefig(f'../output/importance/{self.run_name}-fi.png')
            plt.show()

            # mlflow
            mlflow.start_run(run_id=self.run_id)
            log_artifact(f'../output/importance/{self.run_name}-fi.png')
            mlflow.end_run()
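The averaging above reduces the list of per-fold prediction arrays to one element-wise mean; a tiny illustration with made-up values:

import numpy as np

preds = [np.array([0.2, 0.9]), np.array([0.4, 0.7])]  # two folds' test predictions
np.mean(preds, axis=0)                                # array([0.3, 0.8])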
Example #2
def make_predictions(data: list, weights: list):
    """Weighted blend of test predictions: the i-th model gets weights[i],
    and the final model gets the leftover 1 - sum(weights)."""
    pred = 0
    for i, d in enumerate(data):
        if i < len(weights):
            pred += d[1] * weights[i]
        else:
            pred += d[1] * (1 - sum(weights))
    # NOTE: run_name is read from module scope, not passed as an argument
    Data.dump(pred, f'../output/pred/{run_name}-test.pkl')
    return pred
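A hypothetical call, assuming each element of data is a (train_pred, test_pred) pair so that d[1] is the test-side prediction, and that run_name and Data are available at module scope:

import numpy as np

run_name = 'blend000'  # hypothetical; read by make_predictions from module scope
data = [
    (None, np.array([0.2, 0.8])),  # model A test predictions
    (None, np.array([0.3, 0.7])),  # model B test predictions
    (None, np.array([0.1, 0.9])),  # model C test predictions
]
weights = [0.5, 0.3]  # model C implicitly gets 1 - 0.5 - 0.3 = 0.2
pred = make_predictions(data, weights)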
Example #3
def f(x):
    """Objective for blending-weight search: mix the out-of-fold predictions
    in `data` with weights x (the last weight is implied as 1 - sum(x)) and
    return the negated AUC, since the optimizer minimizes.
    `data`, `y_train` and `run_name` are module-level globals."""
    pred = 0
    for i, d in enumerate(data):
        if i < len(x):
            pred += d[0] * x[i]
        else:
            pred += d[0] * (1 - sum(x))
    score = -1 * roc_auc_score(y_train, pred)
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
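A minimal sketch (assumed, not shown in the source) of driving this objective with scipy to search for blending weights; the same pattern applies to the RMSE variant in the next example:

from scipy.optimize import minimize

init_weights = [1.0 / len(data)] * (len(data) - 1)  # the last weight is implied
result = minimize(f, init_weights, method='Nelder-Mead')
print(result.x, -result.fun)  # best explicit weights and the corresponding AUC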
Example #4
def f(x):
    """Regression variant of the objective above: RMSE is already a
    minimization target, so no sign flip is needed."""
    pred = 0
    for i, d in enumerate(data):
        if i < len(x):
            pred += d[0] * x[i]
        else:
            pred += d[0] * (1 - sum(x))
    score = np.sqrt(mean_squared_error(y_train, pred))
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
Example #5
    def run_train_cv(self) -> None:
        """クロスバリデーションでの学習・評価を行う

        学習・評価とともに、各foldのモデルの保存、スコアのログ出力についても行う
        """
        logger.info(f'{self.run_name} - start training cv')

        scores = []
        va_idxes = []
        preds = []

        # Train on each fold
        for i_fold in range(self.n_fold):
            # Train
            logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f'{self.run_name} fold {i_fold} - end training - score {score}'
            )

            # Save the model
            model.save_model()

            # Keep the fold results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Merge the per-fold results
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        logger.info(
            f'{self.run_name} - end training cv - score {np.mean(scores)}')

        # Save the out-of-fold predictions
        Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

        # Save the evaluation scores
        logger.result_scores(self.run_name, scores)
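The argsort step is what stitches the out-of-fold predictions back into the original row order; a small illustration:

import numpy as np

va_idxes = np.array([2, 0, 3, 1])       # validation indices, concatenated fold by fold
preds = np.array([0.9, 0.1, 0.8, 0.3])  # predictions in the same concatenated order
order = np.argsort(va_idxes)
preds[order]                            # array([0.1, 0.3, 0.9, 0.8]), aligned to rows 0..3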
Example #6
def save_as_pickle(
        train: pd.DataFrame,
        test: pd.DataFrame,
        target_col: str,
        exp_id: str,
        output_dir: str = '../input') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Save X_train, X_test and y_train as pickel format

    Args:
        train (pd.DataFrame): train
        test (pd.DataFrame): test
        target_col (str): target column
        exp_id (str): experiment id
        output_dir (str, optional): output directory. Defaults to '../input'.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: X_train, X_test
    """
    X_train = train.drop(target_col, axis=1)
    y_train = train[target_col]
    if target_col in test.columns:
        X_test = test.drop(target_col, axis=1)
    else:
        X_test = test

    Data.dump(X_train, join(output_dir, f"X_train_{exp_id}.pkl"))
    Data.dump(y_train, join(output_dir, f"y_train_{exp_id}.pkl"))
    Data.dump(X_test, join(output_dir, f"X_test_{exp_id}.pkl"))
    return X_train, X_test
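A hypothetical call with a Titanic-style dataset (the target column name and paths are illustrative):

import pandas as pd

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
X_train, X_test = save_as_pickle(train, test, target_col='Survived', exp_id='fe000')
# writes X_train_fe000.pkl, y_train_fe000.pkl and X_test_fe000.pkl under ../input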
Example #7
def save_as_pickle(train: pd.DataFrame, test: pd.DataFrame,
                   col_definition: dict, option: dict):
    """
    col_definition: target_col
    option: exp_id
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test

    # NOTE: unlike the other variants here, there is no underscore before
    # exp_id, and y_train is saved without an exp_id suffix
    Data.dump(X_train, join('../input', f"X_train{option['exp_id']}.pkl"))
    Data.dump(y_train, join('../input', 'y_train.pkl'))
    Data.dump(X_test, join('../input', f"X_test{option['exp_id']}.pkl"))
Example #8
def save_as_pickle(train: pd.DataFrame, test: pd.DataFrame,
                   col_definition: dict, option: dict):
    """
    col_definition: target_col
    option: output_dir, exp_id
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test

    Data.dump(X_train,
              join(option['output_dir'], f"X_train_{option['exp_id']}.pkl"))
    Data.dump(y_train,
              join(option['output_dir'], f"y_train_{option['exp_id']}.pkl"))
    Data.dump(X_test,
              join(option['output_dir'], f"X_test_{option['exp_id']}.pkl"))
Example #9
    def save_model(self) -> None:
        model_path = os.path.join("../output/model",
                                  f"{self.run_fold_name}.model")
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        # saved via pickle so that best_ntree_limit is not lost
        Data.dump(self.model, model_path)
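A matching load_model sketch (assumed; the counterpart method is not shown in this excerpt):

    def load_model(self) -> None:
        model_path = os.path.join("../output/model",
                                  f"{self.run_fold_name}.model")
        self.model = Data.load(model_path)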
Example #10
                'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std']
            },
        ],
        nunique_dict=[
            {
                'key': ['Sex'],
                'var': ['SibSp'],
                'agg': ['nunique']
            },
            {
                'key': ['Sex'],
                'var': ['Cabin'],
                'agg': ['nunique']
            },
        ])

    print(X_train.shape, X_test.shape)
    # drop constant, duplicated, and highly correlated (> 0.99) columns
    unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
        X_train, X_test, escape_col=categorical_cols, threshold=0.99)
    X_train.drop(unique_cols + duplicated_cols + high_corr_cols,
                 axis=1,
                 inplace=True)
    X_test.drop(unique_cols + duplicated_cols + high_corr_cols,
                axis=1,
                inplace=True)

    print(X_train.shape, X_test.shape)
    Data.dump(X_train, output_dir + 'X_train_fe000.pkl')
    Data.dump(X_test, output_dir + 'X_test_fe000.pkl')
    Data.dump(y_train, output_dir + 'y_train_fe000.pkl')
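A rough sketch of what detect_delete_cols plausibly computes, inferred only from its name, arguments, and the way its three return values are used above; the real ayniy implementation may differ:

import numpy as np
import pandas as pd

def detect_delete_cols_sketch(train: pd.DataFrame, test: pd.DataFrame,
                              escape_col: list, threshold: float):
    data = pd.concat([train, test], sort=False)
    cols = [c for c in data.columns if c not in escape_col]
    # constant columns carry no signal
    unique_cols = [c for c in cols if data[c].nunique() <= 1]
    # columns identical to an earlier column
    duplicated_cols = [c for c in data.columns[data.T.duplicated()] if c in cols]
    # flag the later column of each near-perfectly correlated numeric pair
    corr = data[cols].select_dtypes(include='number').corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    high_corr_cols = [c for c in upper.columns if (upper[c] > threshold).any()]
    return unique_cols, duplicated_cols, high_corr_cols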
Example #11
import pandas as pd

from ayniy.utils import Data

INPUT_DIR = "../input/data_v2/"

if __name__ == "__main__":
    train = pd.read_csv(INPUT_DIR + "train.csv")
    test = pd.read_csv(INPUT_DIR + "test.csv")
    Data.dump(train.drop("score", axis=1), "../input/X_train_fe000.pkl")
    Data.dump(train["score"], "../input/y_train_fe000.pkl")
    Data.dump(test, "../input/X_test_fe000.pkl")
Example #12
    def save_model(self) -> None:
        model_path = os.path.join("../output/model", f"{self.run_fold_name}.model")
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)
Example #13
    def save_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)
Example #14
    fe_id = "fe000"
    fe_name = f"{fe_id}_nn_small"

    X_train = Data.load(f"../input/pickle/X_train_{fe_id}.pkl")
    y_train = Data.load(f"../input/pickle/y_train_{fe_id}.pkl")
    X_test = Data.load(f"../input/pickle/X_test_{fe_id}.pkl")

    del_col = []
    for c in X_train.columns:
        X_train[c].fillna(-1, inplace=True)
        X_test[c].fillna(-1, inplace=True)
        try:
            # fit the scaler on train only, then apply it to test
            prep = StandardScaler()
            X_train[c] = prep.fit_transform(X_train[[c]])
            X_test[c] = prep.transform(X_test[[c]])
        except ValueError:
            # non-numeric columns cannot be scaled; drop them below
            del_col.append(c)
    print(del_col)
    print(len(del_col))
    X_train.drop(del_col, axis=1, inplace=True)
    X_test.drop(del_col, axis=1, inplace=True)
    print(X_train.shape)

    # keep only the first rows as a small set for quick NN experiments
    X_train = X_train.loc[:100]
    y_train = y_train.loc[:100]

    Data.dump(X_train, f"../input/pickle/X_train_{fe_name}.pkl")
    Data.dump(y_train, f"../input/pickle/y_train_{fe_name}.pkl")
    Data.dump(X_test, f"../input/pickle/X_test_{fe_name}.pkl")
Example #15
File: concat.py  Project: upura/atma-comp5
fe005_tr = Data.load('../input/X_train_fe005.pkl')
fe005_te = Data.load('../input/X_test_fe005.pkl')

# fe001_tr = Data.load('../input/X_train_fe001.pkl')
# fe001_te = Data.load('../input/X_test_fe001.pkl')
# top10_tr = Data.load('../input/X_train_fe004_top10.pkl')
# top10_te = Data.load('../input/X_test_fe004_top10.pkl')

# top10_tr, top10_te = standerize(top10_tr, top10_te, {'encode_col': top10_tr.columns})
# print(top10_tr.head())

# add_tr and add_te are assumed to have been loaded earlier (not shown in this excerpt)
train_fitting_ef_add = pd.concat([fe005_tr, add_tr], axis=1)
test_fitting_ef_add = pd.concat([fe005_te, add_te], axis=1)

fe_name = 'fe005_add'
Data.dump(train_fitting_ef_add, f'../input/X_train_{fe_name}.pkl')
Data.dump(test_fitting_ef_add, f'../input/X_test_{fe_name}.pkl')

# train_fitting_ef.to_csv('../input/train_fitting_ef.csv', index=False)
# test_fitting_ef.to_csv('../input/test_fitting_ef.csv', index=False)

# spec_train = []
# for i, filename in enumerate(train['spectrum_filename'].values):
#     spec_df = pd.read_csv(f'../input/spectrum_raw/{filename}', sep='\t', header=None)
#     spec_train.append(spec_df[1])

# spec_tr = pd.concat(spec_train, axis=1)
# spec_tr = spec_tr.T
# spec_tr.columns = [f'spec_{i}' for i in range(spec_tr.shape[1])]
# spec_tr = spec_tr.reset_index(drop=True)
Example #16
if __name__ == "__main__":

    train = pd.read_csv(INPUT_DIR + "train.csv")
    train = split_user_id(train)
    train = add_user_ages(train)
    train = add_user_purposes(train)
    train = add_user_vecs(train)
    train = add_user_strengths(train)
    train = add_user_works(train)
    train = add_user_skills(train)
    train = add_user_educations(train)
    pd.Series(train.columns).to_csv("../input/col_names.csv", index=False)

    Data.dump(
        train.drop(DELETE_COLS + ["score"], axis=1),
        "../input/X_train_fe002.pkl",
    )
    Data.dump(train["score"], "../input/y_train_fe002.pkl")
    del train
    gc.collect()

    test = pd.read_csv(INPUT_DIR + "test.csv")
    test = split_user_id(test)
    test = add_user_ages(test)
    test = add_user_purposes(test)
    test = add_user_vecs(test)
    test = add_user_strengths(test)
    test = add_user_works(test)
    test = add_user_skills(test)
    test = add_user_educations(test)
Example #17
    def run_train_cv(self) -> None:
        """クロスバリデーションでの学習・評価を行う

        学習・評価とともに、各foldのモデルの保存、スコアのログ出力についても行う
        """
        # mlflow
        mlflow.set_experiment(self.exp_name)
        mlflow.start_run(run_name=self.run_name)
        logger.info(f"{self.run_name} - start training cv")

        scores = []
        va_idxes = []
        preds = []

        # Adversarial validation: label train rows 0 and test rows 1, then let
        # the CV model try to tell them apart; a high score signals train/test drift
        if self.advanced and "adversarial_validation" in self.advanced:
            X_train = self.X_train
            X_test = self.X_test
            X_train["target"] = 0
            X_test["target"] = 1
            X_train = pd.concat([X_train, X_test],
                                sort=False).reset_index(drop=True)
            y_train = X_train["target"]
            X_train.drop("target", axis=1, inplace=True)
            X_test.drop("target", axis=1, inplace=True)
            self.X_train = X_train
            self.y_train = y_train

        # Train on each fold
        for i_fold in range(self.cv.n_splits):
            # Train
            logger.info(f"{self.run_name} fold {i_fold} - start training")
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f"{self.run_name} fold {i_fold} - end training - score {score}"
            )

            # Save the model
            model.save_model()

            # Keep the fold results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Merge the per-fold results
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        if self.evaluation_metric == "log_loss":
            cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
        elif self.evaluation_metric == "mean_absolute_error":
            cv_score = mean_absolute_error(self.y_train, preds)
        elif self.evaluation_metric == "rmse":
            cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
        elif self.evaluation_metric == "auc":
            cv_score = roc_auc_score(self.y_train, preds)
        elif self.evaluation_metric == "prauc":
            cv_score = average_precision_score(self.y_train, preds)

        logger.info(f"{self.run_name} - end training cv - score {cv_score}")

        # Save the out-of-fold predictions
        Data.dump(preds, f"../output/pred/{self.run_name}-train.pkl")

        # mlflow
        self.run_id = mlflow.active_run().info.run_id
        log_param("model_name", self.model_cls.__class__.__name__)
        log_param("fe_name", self.fe_name)
        log_param("train_params", self.params)
        log_param("cv_strategy", str(self.cv))
        log_param("evaluation_metric", self.evaluation_metric)
        log_metric("cv_score", cv_score)
        log_param(
            "fold_scores",
            dict(
                zip([f"fold_{i}" for i in range(len(scores))],
                    [round(s, 4) for s in scores])),
        )
        log_param("cols_definition", self.cols_definition)
        log_param("description", self.description)
        mlflow.end_run()
Example #18
import pandas as pd

from ayniy.utils import Data

if __name__ == '__main__':
    ef_tr = pd.read_csv('../input/efficient_tr.csv')
    ef_te = pd.read_csv('../input/efficient_te.csv')

    fe001_top500_tr = Data.load('../input/X_train_fe001_top500.pkl')
    fe001_top500_te = Data.load('../input/X_test_fe001_top500.pkl')

    train_tag = pd.concat([fe001_top500_tr, ef_tr], axis=1)
    test_tag = pd.concat([fe001_top500_te, ef_te], axis=1)

    fe_name = 'fe001_top500_ef'
    Data.dump(train_tag, f'../input/X_train_{fe_name}.pkl')
    Data.dump(test_tag, f'../input/X_test_{fe_name}.pkl')
Example #19
            lbl = OrdinalEncoder(dtype='int')
            train[col] = lbl.fit_transform(train[col].astype('str').fillna('-1').values.reshape(-1, 1))
            test[col] = lbl.transform(test[col].astype('str').fillna('-1').values.reshape(-1, 1))
        # frequency-encode col over combined train+test, then rank the frequencies
        temp = pd.concat([train[[col]], test[[col]]], axis=0)
        temp_mapping = temp.groupby(col).size() / len(temp)
        temp['enc'] = temp[col].map(temp_mapping)
        temp['enc'] = stats.rankdata(temp['enc'])
        temp = temp.reset_index(drop=True)
        # split the combined encoding back into train and test rows
        train[f'rank_frqenc_{col}'] = temp[['enc']].values[:train.shape[0]]
        test[f'rank_frqenc_{col}'] = temp[['enc']].values[train.shape[0]:]
        test[col] = test[col].astype('category')
        train[col] = train[col].astype('category')

    drop_cols = list(set(drop_cols))
    print(len(drop_cols))
    train = train.drop(drop_cols, axis=1)
    test = test.drop(drop_cols, axis=1)

    train = reduce_mem_usage(train)
    test = reduce_mem_usage(test)
    gc.collect()
    print(train.shape, test.shape)

    test['encounter_id'] = test_id
    test = test.sort_values('encounter_id').reset_index(drop=True)

    fe_name = 'fe_siavrez'
    Data.dump(train, f'../input/pickle/X_train_{fe_name}.pkl')
    # Data.dump(y, f'../input/pickle/y_train_{fe_name}.pkl')
    Data.dump(test.drop('encounter_id', axis=1), f'../input/pickle/X_test_{fe_name}.pkl')
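A tiny illustration of the rank-frequency encoding built above:

import pandas as pd
from scipy import stats

s = pd.Series(['a', 'a', 'b', 'c', 'a'])
freq = s.map(s.groupby(s).size() / len(s))  # a -> 0.6, b -> 0.2, c -> 0.2
stats.rankdata(freq)                        # array([4. , 4. , 1.5, 1.5, 4. ]); ties share a rank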
Example #20
    def run_train_cv(self) -> None:
        """クロスバリデーションでの学習・評価を行う

        学習・評価とともに、各foldのモデルの保存、スコアのログ出力についても行う
        """
        # mlflow
        mlflow.set_experiment(self.exp_name)
        mlflow.start_run(run_name=self.run_name)
        logger.info(f'{self.run_name} - start training cv')

        scores = []
        va_idxes = []
        preds = []

        # Adversarial validation
        if self.advanced and 'adversarial_validation' in self.advanced:
            X_train = self.X_train
            X_test = self.X_test
            X_train['target'] = 0
            X_test['target'] = 1
            X_train = pd.concat([X_train, X_test],
                                sort=False).reset_index(drop=True)
            y_train = X_train['target']
            X_train.drop('target', axis=1, inplace=True)
            X_test.drop('target', axis=1, inplace=True)
            self.X_train = X_train
            self.y_train = y_train

        # Train on each fold
        for i_fold in range(self.cv.n_splits):
            # Train
            logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f'{self.run_name} fold {i_fold} - end training - score {score}'
            )

            # Save the model
            model.save_model()

            # Keep the fold results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Merge the per-fold results
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        if self.evaluation_metric == 'log_loss':
            cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
        elif self.evaluation_metric == 'mean_absolute_error':
            cv_score = mean_absolute_error(self.y_train, preds)
        elif self.evaluation_metric == 'rmse':
            cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
        elif self.evaluation_metric == 'auc':
            cv_score = roc_auc_score(self.y_train, preds)
        elif self.evaluation_metric == 'prauc':
            cv_score = average_precision_score(self.y_train, preds)

        logger.info(f'{self.run_name} - end training cv - score {cv_score}')

        # Save the out-of-fold predictions
        Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

        # mlflow
        self.run_id = mlflow.active_run().info.run_id
        log_param('model_name', str(self.model_cls).split('.')[-1][:-2])
        log_param('fe_name', self.fe_name)
        log_param('train_params', self.params)
        log_param('cv_strategy', str(self.cv))
        log_param('evaluation_metric', self.evaluation_metric)
        log_metric('cv_score', cv_score)
        log_param(
            'fold_scores',
            dict(
                zip([f'fold_{i}' for i in range(len(scores))],
                    [round(s, 4) for s in scores])))
        log_param('cols_definition', self.cols_definition)
        log_param('description', self.description)
        mlflow.end_run()
Example #21
"""
python select_features.py --n 100
"""
import argparse

import pandas as pd

from ayniy.utils import Data

parser = argparse.ArgumentParser()
parser.add_argument('--n')
args = parser.parse_args()

fe_id = 'fe005'
run_id = 'run046'
N_FEATURES = int(args.n)
fe_name = f'fe005_top{N_FEATURES}'

X_train = Data.load(f'../input/X_train_{fe_id}.pkl')
y_train = Data.load(f'../input/y_train_{fe_id}.pkl')
X_test = Data.load(f'../input/X_test_{fe_id}.pkl')

fi = pd.read_csv(
    f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES]

X_train = X_train[fi]
X_test = X_test[fi]

Data.dump(X_train, f'../input/X_train_{fe_name}.pkl')
Data.dump(y_train, f'../input/y_train_{fe_name}.pkl')
Data.dump(X_test, f'../input/X_test_{fe_name}.pkl')
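The 'Feature' column read here comes from the importance CSV that run_predict_cv writes in Example #1, so this script keeps the top-N features of a previous run.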
Example #22
import numpy as np
import pandas as pd

from ayniy.utils import Data

train = pd.read_csv('../input/train_data.csv')

y_train = train['y']
# log1p-transform the target (inverted later with np.expm1)
y_train = np.log1p(y_train)
Data.dump(y_train, '../input/y_train_fe000.pkl')
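Because the target was log1p-transformed, downstream predictions need the inverse transform before use (a sketch; the run name is hypothetical):

pred = Data.load('../output/pred/run000-test.pkl')  # hypothetical run name
pred = np.expm1(pred)                               # undo np.log1p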