Example #1
    def objective(self, trial):
        # Extract optuna attribs from the input json
        optuna_trn_params = {}
        for key, val in self.params["trn_params"].items():
            if type(val) != list:
                optuna_trn_params[key] = val
            else:
                if type(val[0]) == float:
                    optuna_trn_params[key] = trial.suggest_uniform(
                        key, val[0], val[1])
                elif type(val[0]) == int:
                    optuna_trn_params[key] = trial.suggest_int(
                        key, val[0], val[1])
                else:
                    optuna_trn_params[key] = trial.suggest_categorical(
                        key, val)

        start = time.time()
        getLogger(get_version()).info(
            "\t [OPTUNA] {}th optimization starts".format(
                self.optimized_count))
        send_message("\t [OPTUNA] :sushi: {} th optimization starts".format(
            self.optimized_count))

        # Classify
        mtd_params = self.params["mtd_params"]
        clf = lgb.train(
            optuna_trn_params,
            self.train_dataset,
            mtd_params["num_boost_round"],
            valid_sets=[self.train_dataset, self.valid_dataset],
            feval=eval_auc,
            verbose_eval=mtd_params["verbose_eval"],
            early_stopping_rounds=mtd_params["early_stopping_rounds"])

        getLogger(get_version()).info("\t {}".format(clf.params))
        send_message("\t {}".format(clf.params))

        for train_or_valid, metrics in clf.best_score.items():
            for metric, score in metrics.items():
                getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(
                    train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(
                    train_or_valid, metric, score))

        # Post-process this fold
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(self.optimized_count, hour, minutes, sec))
        send_message(
            "\t [OPTUNA] :sushi: {}th optimiaztion finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(self.optimized_count, hour, minutes, sec))
        self.optimized_count += 1

        return clf.best_score["valid_1"]["binary_logloss"]
def save_feature_importance(feature_importance, directory_path):
    feature_importance.set_index("feature", inplace=True)
    feature_importance["median"] = feature_importance.median(axis='columns')
    feature_importance.sort_values("median", ascending=False, inplace=True)
    Path.mkdir(directory_path, exist_ok=True, parents=True)
    feature_importance.to_csv(
        Path(directory_path / "{}.csv".format(get_version())))
Example #3
    def train(self, feature_names):
        self.feature_names = feature_names

        fold = 0
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(self.feature_names, valid, "train")
        val_x = super().get_feature_df(self.feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].astype(np.int8)
        val_y = val_x["HasDetections"].astype(np.int8)
        del trn_x["HasDetections"], val_x["HasDetections"]
        self.train_dataset = lgb.Dataset(trn_x, trn_y)
        self.valid_dataset = lgb.Dataset(val_x, val_y)
        self.optimized_count = 0

        study = optuna.create_study()
        study.optimize(self.objective, n_trials=self.params["n_trials"])
        best_trn_params = study.best_params
        getLogger(get_version()).info(
            "\t >> Best params: {}".format(best_trn_params))
        send_message("\t :youzyo: Best params: {}".format(best_trn_params))

        for key, val in self.params["trn_params"].items():
            if type(val) != list:
                best_trn_params[key] = val
        self.params["trn_params"] = best_trn_params
        del self.train_dataset, self.valid_dataset
        gc.collect()

        self.best_lgbm_classifier = LGBMClassifier(self.params,
                                                   self.dataset_name)
        return self.best_lgbm_classifier.train(feature_names)
Example #4
    def predict(self, feature_names):
        """
        Input:
            feature_names: dictionary of features' names
        Output:
            predict_df: DataFrame(["MachineIdentifier", "HasDetections"])
        """
        model_directory_path = Path(__file__).absolute(
        ).parents[2] / "data" / "model" / str(get_version())
        preds = None
        FOLDS = 5
        predict_df = None
        for fold in range(FOLDS):
            model_path = model_directory_path / "valid{}.model".format(fold)
            model = torch.load(str(model_path))
            valid = "valid{}".format(fold)
            test_df = super().get_feature_df(feature_names, valid, "test")
            if predict_df is None:
                predict_df = test_df["MachineIdentifier"]
            test_df = test_df.set_index("MachineIdentifier")
            normalized_test_df = (test_df - test_df.min()) / (test_df.max() -
                                                              test_df.min())

            if preds is None:
                preds = predict_with_model(self.params, normalized_test_df,
                                           model) / FOLDS
            else:
                preds += predict_with_model(self.params, normalized_test_df,
                                            model) / FOLDS

        predict_df = pd.DataFrame(predict_df)
        predict_df["HasDetections"] = preds
        return predict_df
    def predict(self, feature_names):
        """
        Input:
            feature_names: dictionary of features' names
        Output:
            predict_df: DataFrame(["MachineIdentifier", "HasDetections"])
        """
        model_directory_path = Path(__file__).absolute(
        ).parents[2] / "data" / "model" / str(get_version())
        preds = None
        FOLDS = 5
        predict_df = None
        for fold in range(FOLDS):
            model_path = model_directory_path / "valid{}.model".format(fold)
            clf = lgb.Booster(model_file=str(model_path))
            valid = "valid{}".format(fold)
            test_df = super().get_feature_df(feature_names, valid, "test")
            if predict_df is None:
                predict_df = test_df["MachineIdentifier"]
            test_df = test_df.set_index("MachineIdentifier")
            if preds is None:
                preds = predict_chunk(clf, test_df) / FOLDS
            else:
                preds += predict_chunk(clf, test_df) / FOLDS

        predict_df = pd.DataFrame(predict_df)
        predict_df["HasDetections"] = preds
        return predict_df
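
# predict_chunk is defined elsewhere in the repo; a minimal sketch of what it is
# assumed to do, namely chunked Booster.predict calls to keep memory bounded
# (the chunk_size default is an assumption):
import numpy as np


def predict_chunk(clf, test_df, chunk_size=100000):
    preds = []
    for start in range(0, len(test_df), chunk_size):
        chunk = test_df.iloc[start:start + chunk_size]
        preds.append(clf.predict(chunk, num_iteration=clf.best_iteration))
    return np.concatenate(preds)
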
    def __save_outputs(cls, outputs, is_train):
        """
        is_train = True  -> Save ground truth and validity of train dataset
        is_train = False -> Save predictions of test dataset
        """
        if is_train:
            columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
            save_path = cls.ROOT_PATH / "data" / "oof"
        else:
            columns_order = ["MachineIdentifier", "HasDetections"]
            save_path = cls.ROOT_PATH / "data" / "submit"
        Path.mkdir(save_path, exist_ok=True, parents=True)

        save_df = None
        for output in outputs:
            if save_df is None:
                save_df = output
            else:
                save_df = pd.concat([save_df, output])
        submission_df = pd.read_csv(cls.ROOT_PATH / "input" /
                                    "sample_submission.csv")
        del submission_df["HasDetections"]

        if is_train is False:
            predict_length = len(save_df)
            sub_length = len(submission_df)
            if predict_length != sub_length:
                getLogger(get_version()).info(
                    "CAUSION: Length of predict_df ({}) is NOT equal to that of submisson_df ({})"
                    .format(predict_length, sub_length))
                send_message(
                    ":ghost: CAUSION: Length of predict_df ({}) is NOT equal to that of submisson_df ({})"
                    .format(predict_length, sub_length))
            save_df = pd.merge(save_df,
                               submission_df,
                               on="MachineIdentifier",
                               how='right')
            save_df.fillna(0, inplace=True)

        save_df = save_df.sort_values("MachineIdentifier").reset_index(
            drop=True).loc[:, columns_order]
        filename = save_path / "{}.csv".format(get_version())
        save_df.to_csv(filename, index=False, float_format="%.6f")
        getLogger(get_version()).info("Output {}.".format(filename))
        send_message("Output {}".format(filename))

        return save_df
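
# A tiny sketch of why the merge above uses how='right' plus fillna(0): every
# MachineIdentifier in sample_submission.csv must appear in the submit file, even if
# no prediction was produced for it (the data below is made up).
import pandas as pd

_preds = pd.DataFrame({"MachineIdentifier": ["a", "b"], "HasDetections": [0.7, 0.2]})
_sub = pd.DataFrame({"MachineIdentifier": ["a", "b", "c"]})
_merged = pd.merge(_preds, _sub, on="MachineIdentifier", how="right")
_merged.fillna(0, inplace=True)  # the unseen row "c" gets HasDetections == 0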
def model_exist():
    model_path = Path(__file__).absolute().parents[1] / "data" / "model" / str(
        get_version())
    for fold in range(5):
        if Path(model_path /
                "valid{}.model".format(fold)).exists() is not True:
            return False
    return True
def output_cv(validity, stamp):
    validity = validity.reset_index()
    columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
    validity = validity.sort_values("MachineIdentifier").reset_index(
        drop=True).loc[:, columns_order]
    cv_auc = (fast_auc(validity["HasDetections"],
                       np.array(validity["Predict"])))
    getLogger(get_version()).info("\t >> CV Score (AUC):{}".format(cv_auc))
    send_message("\t {} CV Score (AUC):{}".format(stamp, cv_auc))
    return validity
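
# fast_auc is imported from elsewhere in the repo; a minimal sketch of what it is
# assumed to compute (the usual rank-based ROC AUC for binary labels), shown here
# only for reference:
import numpy as np


def fast_auc(y_true, y_prob):
    y_true = np.asarray(y_true)[np.argsort(y_prob)]
    nfalse = 0.0
    auc = 0.0
    for y_i in y_true:
        nfalse += 1 - y_i
        auc += y_i * nfalse
    return auc / (nfalse * (len(y_true) - nfalse))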
    def get_feature_df(self, feature_names, valid_dir, part):
        """
        Ex)
        dataset_name    : min, ...
        feature_names   : __preprocess() return
        valid_dir       : "valid0", "valid1", ...
        part            : "train", "validate", "test"
        """
        feature_df = None
        feature_set_path = Path(__file__).absolute(
        ).parents[2] / "data" / "features" / self.dataset_name / valid_dir
        if dask_mode():
            print("Using dask.dataframe.read_csv()")

        for group, feature_list in feature_names.items():

            getLogger(get_version()).info(
                "\t \t \t Reading {}_{}.csv...".format(part, group))
            send_message("\t \t \t Reading {}_{}.csv...".format(part, group))
            if dask_mode():
                df = dd.read_csv(feature_set_path /
                                 "{}_{}.csv".format(part, group),
                                 usecols=["MachineIdentifier"] + feature_list)
                df = df.compute()
            else:
                df = pd.read_csv(feature_set_path /
                                 "{}_{}.csv".format(part, group),
                                 usecols=["MachineIdentifier"] + feature_list)

            if feature_df is None:
                feature_df = df
            else:
                feature_df = feature_df.merge(right=df,
                                              how="inner",
                                              on="MachineIdentifier")

        if part in ["train", "validate"]:
            if dask_mode():
                HasDetections = dd.read_csv(
                    Path(__file__).absolute().parents[2] / "input" /
                    "train.csv",
                    usecols=["MachineIdentifier", "HasDetections"])
                HasDetections = HasDetections.compute()
            else:
                HasDetections = pd.read_csv(
                    Path(__file__).absolute().parents[2] / "input" /
                    "train.csv",
                    usecols=["MachineIdentifier", "HasDetections"])
            feature_df = feature_df.merge(right=HasDetections,
                                          on="MachineIdentifier",
                                          how="inner")
        return feature_df
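
    # Hypothetical call sketch for get_feature_df (group and column names are made
    # up): feature_names maps a feature-group name to the columns to read from the
    # corresponding <part>_<group>.csv file.
    #
    #     feature_names = {
    #         "basic": ["AVProductsInstalled", "IsProtected"],
    #         "census": ["Census_OSVersion"],
    #     }
    #     trn_x = super().get_feature_df(feature_names, "valid0", "train")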
    def __print_log(cls, name, clf_type, dataset_path):
        version = get_version()

        text = "Classifier Type: {}".format(clf_type)
        getLogger(version).info(text)
        send_message(text)

        text = "Input Path: {}".format(dataset_path)
        getLogger(version).info(text)
        send_message(text)

        text = "--------------------------------"
        getLogger(version).info(text)
        send_message(text)
    def train(self, feature_names):
        self.feature_names = feature_names
        self.optimized_count = 0

        study = optuna.create_study()
        study.optimize(self.objective, n_trials=self.params["n_trials"])
        best_trn_params = study.best_params
        getLogger(get_version()).info("\t >> Best params: {}".format(best_trn_params))
        send_message("\t :youzyo: Best params: {}".format(best_trn_params))

        for key, val in self.params["trn_params"].items():
            if type(val) != list:
                best_trn_params[key] = val
        self.params["trn_params"] = best_trn_params

        self.best_lgbm_classifier = LGBMClassifier(self.params, self.dataset_name)
        return self.best_lgbm_classifier.train(feature_names)
    def process(cls, args):
        """
        main method
        """

        # Split the config file ({version}.json) into each variable
        config_file = cls.__load_config(args.version)
        with config_file.open() as f:
            params_dict = json.load(f)
        validities = []
        predicts = []
        trained_fully_before_processing = False
        for name, params in params_dict.items():
            feature_groups = params["Preprocess"]
            clf_type = params["Classifier"]
            clf_params = params["ClassifierParams"]
            dataset_path = cls.ROOT_PATH / params["DatasetPath"]
            ProcessorFactory.__print_log(name, clf_type, dataset_path)
            processor = Processor(feature_groups,
                                  ClassifierType.parseType(clf_type),
                                  clf_params, dataset_path)
            validity, predict = processor.process()
            if validity is None:
                trained_fully_before_processing = True
            else:
                validities.append(validity)
            predicts.append(predict)

        # Output total CV Score
        if trained_fully_before_processing is False and only_prediction(
        ) is False:
            valid_df = cls.__save_outputs(validities, is_train=True)
            total_cv = fast_auc(valid_df["HasDetections"],
                                np.array(valid_df["Predict"]))
            getLogger(get_version()).info(
                "\t >> Total CV Score (AUC): {}".format(total_cv))
            send_message(
                "\t :youzyo: Total CV Score (AUC): {}".format(total_cv))

        # Output submit file
        if need_prediction() or only_prediction():
            cls.__save_outputs(predicts, is_train=False)
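
    # A sketch of the {version}.json layout that process() expects; the keys come
    # from the parsing code above, while the concrete values are illustrative only:
    #
    #     {
    #         "lgbm_baseline": {
    #             "Preprocess": {"basic": ["AVProductsInstalled"]},
    #             "Classifier": "LGBMClassifier",
    #             "ClassifierParams": {"trn_params": {}, "mtd_params": {}},
    #             "DatasetPath": "data/features/min"
    #         }
    #     }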
    def train(self, feature_names):
        """
        Flow:
            1. Initialize parameters
            2. Process for each fold
                2.1 Generate dataset
                2.2 Classify
                2.3 Calculate feature importances
            3. Output CV score and feature importances
            4. Predict training data (validate all data)
        Input:
            feature_names: dictionary of features' names
        Output:
            validity: DataFrame(["MachineIdentifier", "HasDetections", "Predict"])
        """

        # Initialize parameters
        mtd_params = self.params["mtd_params"]
        validity = None
        model_path = Path(__file__).absolute(
        ).parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        feature_importance = pd.DataFrame()
        START_FOLD = 0
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None
        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            # Measure start time of the classification of this fold
            start = time.time()
            getLogger(get_version()).info("\t >> {} folds start".format(fold))
            send_message("\t :flashlight: {} folds start".format(fold))

            # Generate dataset
            getLogger(get_version()).info("\t \t Generating datasets...")
            send_message("\t \t Generating datasets...")
            valid = "valid{}".format(str(fold))
            trn_x = super().get_feature_df(feature_names, valid, "train")
            val_x = super().get_feature_df(feature_names, valid, "validate")
            trn_x.set_index("MachineIdentifier", inplace=True)
            val_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].astype(np.int8)
            val_y = val_x["HasDetections"].astype(np.int8)
            train_dataset = lgb.Dataset(trn_x, trn_y)
            valid_dataset = lgb.Dataset(val_x, val_y)
            getLogger(get_version()).info("\t \t Datasets were generated.")
            send_message("\t \t Datasets were generated.")

            # Initialize variables for scoring
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([trn_y, val_y])
                validity["Predict"] = 0

            # Delete needless features
            del trn_x["HasDetections"], val_x["HasDetections"]

            # Classify
            callbacks = [
                log_evaluation(get_training_logger(get_version()), fold)
            ]
            clf = lgb.train(
                self.params["trn_params"],
                train_dataset,
                mtd_params["num_boost_round"],
                valid_sets=[train_dataset, valid_dataset],
                feval=eval_auc,
                verbose_eval=mtd_params["verbose_eval"],
                early_stopping_rounds=mtd_params["early_stopping_rounds"],
                callbacks=callbacks)
            validity.loc[validity.index.isin(val_x.index),
                         "Predict"] = clf.predict(
                             val_x, num_iteration=clf.best_iteration)

            for train_or_valid, metrics in clf.best_score.items():
                for metric, score in metrics.items():
                    getLogger(get_version()).info(
                        "\t\t >> Best {} {}: {}".format(
                            train_or_valid, metric, score))
                    send_message("\t\t :star-struck: Best {} {}: {}".format(
                        train_or_valid, metric, score))

            # Calculate feature importance per fold
            if fold == 0:
                feature_importance["feature"] = trn_x.columns
            feature_importance["fold{}".format(fold)] = clf.feature_importance(
                importance_type="gain")

            # Measure finish time of the classification of this fold
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message(
                "\t :flashlight: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))

            # Post-process this fold
            del train_dataset, valid_dataset
            gc.collect()
            clf.save_model(str(model_path / "valid{}.model".format(fold)))

        # Output CV score
        validity = output_cv(validity, ":flashlight:")

        # Save importance
        directory_path = Path(__file__).absolute().parents[2] / "importance"
        save_feature_importance(feature_importance, directory_path)

        # Post-process the training
        del feature_importance
        gc.collect()

        return validity
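
# log_evaluation above is a project-local helper, not lightgbm's built-in callback;
# a minimal sketch of a compatible callback under that assumption (the period
# argument is hypothetical). LightGBM passes a CallbackEnv whose
# evaluation_result_list holds (dataset_name, metric_name, value, is_higher_better).
def log_evaluation(logger, fold, period=100):
    def _callback(env):
        if period > 0 and env.iteration % period == 0:
            results = ", ".join(
                "{} {}: {:.6f}".format(name, metric, value)
                for name, metric, value, _ in env.evaluation_result_list)
            logger.debug("fold {}\titer {}\t{}".format(fold, env.iteration, results))
    return _callback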
    def objective(self, trial):
        # Extract optuna attribs from the input json
        optuna_trn_params = {}
        for key, val in self.params["trn_params"].items():
            if type(val) != list:
                optuna_trn_params[key] = val
            else:
                if type(val[0]) == float:
                    optuna_trn_params[key] = trial.suggest_uniform(key, val[0], val[1])
                elif type(val[0]) == int:
                    optuna_trn_params[key] = trial.suggest_int(key, val[0], val[1])
                else:
                    optuna_trn_params[key] = trial.suggest_categorical(key, val)

        # Initialize parameters
        mtd_params = self.params["mtd_params"]
        validity = None
        model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        START_FOLD = 0
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None

        start2 = time.time()
        getLogger(get_version()).info("\t [OPTUNA] {}th optimization starts".format(self.optimized_count))
        send_message("\t [OPTUNA] :sushi: {} th optimization starts".format(self.optimized_count))
        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            start = time.time()
            getLogger(get_version()).info("\t [OPTUNA] >> {} folds start".format(fold))
            send_message("\t [OPTUNA] :sushi: {} folds start".format(fold))

            # Generate dataset
            valid = "valid{}".format(str(fold))
            trn_x = super().get_feature_df(self.feature_names, valid, "train")
            val_x = super().get_feature_df(self.feature_names, valid, "validate")
            trn_x.set_index("MachineIdentifier", inplace=True)
            val_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].astype(np.int8)
            val_y = val_x["HasDetections"].astype(np.int8)
            train_dataset = lgb.Dataset(trn_x, trn_y)
            valid_dataset = lgb.Dataset(val_x, val_y)

            # Initialize variables for scoring
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([trn_y, val_y])
                validity["Predict"] = 0

            # Delete needless features
            del trn_x["HasDetections"], val_x["HasDetections"]

            # Classify
            clf = lgb.train(optuna_trn_params,
                            train_dataset,
                            mtd_params["num_boost_round"],
                            valid_sets=[train_dataset, valid_dataset],
                            feval=eval_auc,
                            verbose_eval=mtd_params["verbose_eval"],
                            early_stopping_rounds=mtd_params["early_stopping_rounds"])
            validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict(val_x, num_iteration=clf.best_iteration)

            if fold == START_FOLD:
                getLogger(get_version()).info("\t {}".format(clf.params))
                send_message("\t {}".format(clf.params))

            for train_or_valid, metrics in clf.best_score.items():
                for metric, score in metrics.items():
                    getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                    send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))

            # Post-process this fold
            del train_dataset, valid_dataset
            gc.collect()
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t [OPTUNA] >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message("\t [OPTUNA] :sushi: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(fold, hour, minutes, sec))

        elapsed_time = int(time.time() - start2)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(self.optimized_count, hour, minutes, sec))
        send_message("\t [OPTUNA] :sushi: {}th optimiaztion finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(self.optimized_count, hour, minutes, sec))
        self.optimized_count += 1

        # Output CV score
        validity = validity.reset_index()
        columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
        validity = validity.sort_values("MachineIdentifier").reset_index(drop=True).loc[:, columns_order]
        cv_auc = (fast_auc(validity["HasDetections"], np.array(validity["Predict"])))

        return 1 - cv_auc
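
# Optuna minimizes the objective by default, which is why 1 - AUC is returned above.
# An equivalent alternative supported by optuna would be to return cv_auc directly
# and create the study with study = optuna.create_study(direction="maximize").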
Example #15
    def train(self, feature_names):

        # Initialize parameters
        validity = None
        model_path = Path(__file__).absolute(
        ).parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        START_FOLD = 0
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None

        get_training_logger(get_version()).debug("fold \t iteration \
            \t train BCELoss \t valid BCELoss \
            \t train AUC \t valid AUC")

        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            # Measure start time of the classification of this fold
            start = time.time()
            getLogger(get_version()).info("\t >> {} folds start".format(fold))
            send_message("\t :fire: {} folds start".format(fold))
            valid = "valid{}".format(str(fold))

            # Generate train data
            getLogger(get_version()).info("\t \t Generating datasets...")
            send_message("\t \t Generating datasets...")

            trn_x = super().get_feature_df(feature_names, valid, "train")
            trn_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].values.astype(np.float32)

            val_x = super().get_feature_df(feature_names, valid, "validate")
            val_x.set_index("MachineIdentifier", inplace=True)
            val_y = val_x["HasDetections"].values.astype(np.float32)

            # Initialize variables for scoring
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([
                    trn_x["HasDetections"].astype(np.float32),
                    val_x["HasDetections"].astype(np.float32)
                ])
                validity["Predict"] = 0

            del trn_x["HasDetections"], val_x["HasDetections"]
            trn_x.fillna(0, inplace=True)
            val_x.fillna(0, inplace=True)
            normalized_trn_x = (trn_x - trn_x.min()) / (trn_x.max() -
                                                        trn_x.min())
            normalized_trn_x.fillna(normalized_trn_x.mean(), inplace=True)
            normalized_val_x = (val_x - val_x.min()) / (val_x.max() -
                                                        val_x.min())
            normalized_val_x.fillna(normalized_val_x.mean(), inplace=True)

            train_loader, valid_loader, dataset_sizes = create_tensor_dataloader(
                self.params, normalized_trn_x, trn_y, normalized_val_x, val_y)
            data_loaders = {"train": train_loader, "valid": valid_loader}

            getLogger(get_version()).info("\t \t Datasets were generated.")
            send_message("\t \t Datasets were generated.")

            # Define the Network
            num_epochs = self.params["num_epochs"]
            network_class = getattr(
                import_module("classifier.pytorch_network." +
                              self.params["network"]), self.params["network"])
            network = network_class(len(trn_x.columns),
                                    self.params["network_params"])
            criterion = nn.BCELoss()  # Don't change the criterion function!
            optimizer_class = getattr(import_module('torch.optim'),
                                      self.params["optimizer"])
            optimizer = optimizer_class(network.parameters(),
                                        **self.params["optimizer_params"])
            scheduler_class = getattr(
                import_module('torch.optim.lr_scheduler'),
                self.params["scheduler"])
            scheduler = scheduler_class(optimizer,
                                        **self.params["scheduler_params"])
            model = train_model(
                fold,
                data_loaders,
                dataset_sizes,
                network,
                criterion,
                optimizer,
                scheduler,
                num_epochs=num_epochs,
                early_stopping_rounds=self.params["early_stopping_rounds"],
                verbose=self.params["verbose"])

            # Classify
            validity.loc[validity.index.isin(val_x.index),
                         "Predict"] = predict_with_model(
                             self.params, normalized_val_x, model)

            # Measure finish time of the classification of this fold
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message(
                "\t :fire: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))

            # Post-process this fold
            gc.collect()
            torch.save(model, str(model_path / "valid{}.model".format(fold)))

        # Output CV score
        validity = validity.reset_index()
        columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
        validity = validity.sort_values("MachineIdentifier").reset_index(
            drop=True).loc[:, columns_order]
        cv_auc = (fast_auc(validity["HasDetections"],
                           np.array(validity["Predict"])))
        getLogger(get_version()).info("\t >> CV Score (AUC):{}".format(cv_auc))
        send_message("\t :fire: CV Score (AUC):{}".format(cv_auc))

        # Post-process the training
        gc.collect()

        return validity
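
# create_tensor_dataloader is defined elsewhere in the repo; a minimal sketch of what
# it is assumed to return (TensorDataset-backed loaders plus the dataset sizes). The
# "batch_size" key is a hypothetical parameter name.
import torch
from torch.utils.data import DataLoader, TensorDataset


def create_tensor_dataloader(params, trn_x, trn_y, val_x, val_y):
    train_set = TensorDataset(torch.tensor(trn_x.values, dtype=torch.float32),
                              torch.tensor(trn_y, dtype=torch.float32))
    valid_set = TensorDataset(torch.tensor(val_x.values, dtype=torch.float32),
                              torch.tensor(val_y, dtype=torch.float32))
    train_loader = DataLoader(train_set, batch_size=params["batch_size"], shuffle=True)
    valid_loader = DataLoader(valid_set, batch_size=params["batch_size"], shuffle=False)
    dataset_sizes = {"train": len(train_set), "valid": len(valid_set)}
    return train_loader, valid_loader, dataset_sizes
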
import gc
import warnings
from datetime import datetime
from logging import getLogger

from save_log import create_main_logger, create_train_logger, get_version, send_message
from processor_factory import ProcessorFactory
from exceptions import *
warnings.filterwarnings('ignore')


def main(args):
    send_message(
        ":thinking_face: ============= {} ============= :thinking_face:".
        format(str(datetime.now())))
    ProcessorFactory.process(args)


if __name__ == "__main__":
    gc.enable()
    version = get_version()
    create_main_logger(version)
    create_train_logger(version)
    try:
        main(get_option())
    except DuplicateVersionException:
        send_message(":stop: Duplicate Version Exception Occurred.")
        getLogger(version).exception("Duplicate Version Exception Occurred.")
    except IrregularArgumentException:
        send_message(":stop: Irregular Argument for Feature Extraction.")
        getLogger(version).exception(
            "Irregular Argument for Feature Extraction.")
    except IrregularCalcBackException:
        send_message(":stop: Irregular Dataframe back.")
        getLogger(version).exception("Irregular Dataframe back.")
    except Exception:
        # Assumed fallback handler (original body not shown in this excerpt),
        # following the pattern of the handlers above.
        send_message(":stop: Unexpected Exception Occurred.")
        getLogger(version).exception("Unexpected Exception Occurred.")
Example #17
def train_model(fold, data_loaders, dataset_sizes, model,
                criterion, optimizer, scheduler,
                num_epochs, early_stopping_rounds=10, verbose=-1):
    since = time.time()
    best_score = {
        "train BCELoss": 1.0,
        "train AUC": 0.5,
        "valid BCELoss": 1.0,
        "valid AUC": 0.5}
    best_model_wts = copy.deepcopy(model.state_dict())
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    not_improve_round = 0
    for epoch in range(num_epochs):
        bce_dictionary = {"train": 0.0, "valid": 0.0}
        auc_dictionary = {"train": 0.0, "valid": 0.0}
        for phase in ["train", "valid"]:
            if phase == "train":
                scheduler.step()
                model.train()
            else:
                model.eval()

            running_bce = 0.0
            running_auc = 0.0

            # Iteration
            for inputs, labels in data_loaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # forward
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # backward + optimize
                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                # statistics
                running_bce += loss.item() * inputs.size(0)
                if torch.cuda.is_available():
                    labels = labels.cpu()
                    outputs = outputs.cpu()
                y_true = labels.numpy()
                y_pred = list(chain.from_iterable(outputs.detach().numpy()))
                running_auc += fast_auc(y_true, y_pred) * inputs.size(0)
            bce_dictionary[phase] = running_bce / dataset_sizes[phase]
            auc_dictionary[phase] = running_auc / dataset_sizes[phase]

        # Update model
        if best_score["train BCELoss"] > bce_dictionary["train"]:
            best_score["train BCELoss"] = bce_dictionary["train"]
            best_score["valid BCELoss"] = bce_dictionary["valid"]
            best_score["train AUC"] = auc_dictionary["train"]
            best_score["valid AUC"] = auc_dictionary["valid"]
            best_model_wts = copy.deepcopy(model.state_dict())
            not_improve_round = 0
        else:
            not_improve_round += 1

        if not_improve_round >= early_stopping_rounds:
            getLogger(get_version()).info(
                "\t \t Epoch {}/{}: Early stopping".format(epoch, num_epochs))
            send_message(
                "\t \t :upura: Epoch {}/{}: Early stopping".format(epoch, num_epochs))
            break

        if epoch % verbose == 0 and verbose != -1:
            getLogger(get_version()).info("{}\t{}\t{}\t{}\t{}\t{}".format(
                fold, epoch, bce_dictionary["train"], bce_dictionary["valid"],
                auc_dictionary["train"], auc_dictionary["valid"]))
            send_message("{}\t{}\t{}\t{}\t{}\t{}".format(
                         fold, epoch, bce_dictionary["train"], bce_dictionary["valid"],
                         auc_dictionary["train"], auc_dictionary["valid"]))

        get_training_logger(get_version()).debug("{}\t{}\t{}\t{}\t{}\t{}".format(
            fold, epoch, bce_dictionary["train"], bce_dictionary["valid"],
            auc_dictionary["train"], auc_dictionary["valid"]))

    time_elapsed = time.time() - since
    getLogger(get_version()).info(
        "\t \t Training complete in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))
    send_message(
        "\t \t Training complete in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))

    getLogger(get_version()).info(
        "\t \t [Best train scores] BCELoss: {:.4f}, AUC: {:.4f}".format(
            best_score["train BCELoss"], best_score["train AUC"]))
    getLogger(get_version()).info(
        "\t \t [Valid scores when train is the best] BCELoss: {:.4f}, AUC: {:.4f}".format(
            best_score["valid BCELoss"], best_score["valid AUC"]))

    send_message("\t \t :star-struck: Best train BCELoss: {:.4f}".format(best_score["train BCELoss"]))
    send_message("\t \t :star-struck: Best train AUC: {:.4f}".format(best_score["train AUC"]))
    send_message("\t \t :star-struck: Best valid BCELoss: {:.4f}".format(best_score["valid BCELoss"]))
    send_message("\t \t :star-struck: Best valid AUC: {:.4f}".format(best_score["valid AUC"]))

    model.load_state_dict(best_model_wts)
    return model
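
# predict_with_model is defined elsewhere in the repo; a minimal sketch of what it is
# assumed to do: batched inference with the trained network, returning a flat numpy
# array of probabilities. The "batch_size" key is a hypothetical parameter name.
import numpy as np
import torch


def predict_with_model(params, normalized_df, model):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()
    preds = []
    with torch.no_grad():
        for start in range(0, len(normalized_df), params["batch_size"]):
            chunk = normalized_df.iloc[start:start + params["batch_size"]]
            inputs = torch.tensor(chunk.values, dtype=torch.float32).to(device)
            preds.append(model(inputs).cpu().numpy().ravel())
    return np.concatenate(preds)
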
    def train(self, feature_names):
        """
        Input:
            feature_names: dictionary of features' names
        Output:
            validity: DataFrame(["MachineIdentifier", "HasDetections", "Predict"])
        """
        # Initialize parameters
        validity = None
        model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        feature_importance = pd.DataFrame()
        START_FOLD = 0
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None

        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            log_path = Path(__file__).absolute().parents[2] / "log" / "train" / str(get_version()) / str("fold{}".format(fold))
            Path.mkdir(log_path, exist_ok=True, parents=True)

            # Measure start time of the classification of this fold
            start = time.time()
            getLogger(get_version()).info("\t >> {} folds start".format(fold))
            send_message("\t :cat: {} folds start".format(fold))

            # Generate dataset
            getLogger(get_version()).info("\t \t Generating datasets...")
            send_message("\t \t Generating datasets...")
            valid = "valid{}".format(str(fold))
            trn_x = super().get_feature_df(feature_names, valid, "train")
            val_x = super().get_feature_df(feature_names, valid, "validate")
            trn_x.set_index("MachineIdentifier", inplace=True)
            val_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].astype(np.int8)
            val_y = val_x["HasDetections"].astype(np.int8)
            getLogger(get_version()).info("\t \t Datasets were generated.")
            send_message("\t \t Datasets were generated.")

            # Initialize variables for scoring
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([trn_y, val_y])
                validity["Predict"] = 0

            # Delete needless features
            del trn_x["HasDetections"], val_x["HasDetections"]

            # Classify
            clf = CatBoostClassifier(iterations=self.params["iterations"],
                                     verbose=self.params["verbose"],
                                     early_stopping_rounds=self.params["early_stopping_rounds"],
                                     random_seed=self.params["random_seed"],
                                     max_depth=self.params["max_depth"],
                                     loss_function=self.params["loss_function"],
                                     custom_metric=self.params["custom_metric"],
                                     eval_metric=self.params["eval_metric"],
                                     rsm=self.params["rsm"],
                                     train_dir=str(log_path))
            clf.fit(trn_x.values, trn_y.values,
                    eval_set=(val_x.values, val_y.values))

            for train_or_valid, metrics in clf.best_score_.items():
                for metric, score in metrics.items():
                    getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                    send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))
            validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict_proba(val_x.values)[:, 1]

            # Calculate feature importance per fold
            if fold == 0:
                feature_importance["feature"] = trn_x.columns
            feature_importance["fold{}".format(fold)] = clf.get_feature_importance()

            # Measure finish time of the classification of this fold
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message("\t :cat: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(fold, hour, minutes, sec))

            # Post-process this fold
            clf.save_model(str(model_path / "valid{}.model".format(fold)))

        # Output CV score
        validity = output_cv(validity, ":cat:")

        # Save importance
        directory_path = Path(__file__).absolute().parents[2] / "importance"
        save_feature_importance(feature_importance, directory_path)

        # Post-process the training
        del feature_importance
        gc.collect()

        return validity