Beispiel #1
0
    def __init__(self, train_df, target_column, index_column, cat_features,
                 eval_metric, metrics_scorer, metrics_decimals, num_folds,
                 stratified, kfolds_shuffle, int_threshold, seed_val,
                 project_location, output_dirname):
        """
        Store the input data and the cross-validation settings on the instance, set up the output directory,
        run the input-data verification and seed numpy's global random generator.
        :param train_df: train data set (pandas DF)
        :param target_column: name of the target column
        :param index_column: index column (annotated as dict in the type comment -- TODO confirm, may be a name)
        :param cat_features: list of categorical features
        :param eval_metric: name of the evaluation metric
        :param metrics_scorer: scorer object used as the metric (presumably from sklearn.metrics -- verify)
        :param metrics_decimals: integer precision setting for the metrics
        :param num_folds: number of folds for CV
        :param stratified: boolean CV setting
        :param kfolds_shuffle: boolean CV setting
        :param int_threshold: integer threshold (its use is not visible in this method)
        :param seed_val: value passed to np.random.seed
        :param project_location: base path, joined with output_dirname to form the output directory
        :param output_dirname: name of the output directory inside project_location
        """
        # Cross-validation configuration
        self.eval_metric = eval_metric  # type: str
        self.metrics_scorer = metrics_scorer  # type: metrics
        self.metrics_decimals = metrics_decimals  # type: int
        self.num_folds = num_folds  # type: int
        self.stratified = stratified  # type: bool
        self.kfolds_shuffle = kfolds_shuffle  # type: bool
        self.seed_val = seed_val  # type: int

        # Data set and the description of its columns
        self.train_df = train_df  # type: pd.DataFrame
        self.target_column = target_column  # type: str
        self.index_column = index_column  # type: dict
        self.cat_features = cat_features  # type: list
        self.int_threshold = int_threshold  # type: int

        # Normalized output location, handed to the directory helper before the inputs are verified
        raw_output_path = os.path.join(project_location, output_dirname)
        self.path_output_dir = os.path.normpath(raw_output_path)
        create_output_dir(self.path_output_dir)

        self._verify_input_data_is_correct()

        # Seeding happens last, after the verification step
        np.random.seed(seed_val)
Beispiel #2
0
    def __init__(self,
                 predictor,
                 seed_val,
                 project_location,
                 output_dirname,
                 filename='optim_hp'):
        """
        Base class for optimization of a model's hyperparameters. Its methods are meant to be reused by derived
        classes (for instance, BayesHyperParamsOptimization): adjusting data types of the hyperparameters,
        auto-completing missing parameters, saving / reading parameters to / from the disk.
        :param predictor: instance of Predictor class.
        :param seed_val: value used to seed the numpy random generator
        :param project_location: base path, joined with output_dirname to form the output directory
        :param output_dirname: name of directory to save results of hyper parameters optimization
        :param filename: name of hyperparameter optimizer (is used when saving results of optimization)
        """
        # Optimizer configuration
        self.predictor = predictor  # type: Predictor
        self.filename = filename  # type: str
        self.seed_val = seed_val  # type: int

        # Optimization results; all start empty
        self.best_score = None  # type: float
        self.best_params = None  # type: dict
        self.hpo_cv_df = None  # type: pd.DataFrame

        # Normalized output location, handed to the directory helper
        raw_output_path = os.path.join(project_location, output_dirname)
        self.path_output_dir = os.path.normpath(raw_output_path)
        create_output_dir(self.path_output_dir)

        # Seed the numpy random generator
        np.random.seed(seed_val)
Beispiel #3
0
    def run(self):
        """
        Read the JSON content of every input target, merge it into a single dict with merge_two_dicts and
        write the merged dict as JSON (indent=4) to <project_location>/results_ensembling/<output_filename>.
        """
        merged_oof_files = {}
        for target in self.input():
            with open(target.path, 'r') as source:
                loaded = json.load(source)
            # Fold the freshly loaded dict into the running result
            merged_oof_files = merge_two_dicts(merged_oof_files, loaded)

        # TODO: to refactor this part
        ensembling_dir = os.path.join(self.project_location, 'results_ensembling')
        create_output_dir(ensembling_dir)

        destination = os.path.join(ensembling_dir, self.output_filename)
        _logger.info('Saving %s' % destination)
        serialized = json.dumps(merged_oof_files, indent=4)
        with open(destination, 'w') as sink:
            sink.write(serialized)
Beispiel #4
0
    def __init__(self, oof_input_files, blend_bagged_results, train_df, test_df, target_column, index_column,
                 metrics_scorer, metrics_decimals=6, target_decimals=6, project_location='', output_dirname=''):
        """
        Base class for blending of model predictions. The blender is trained on out-of-fold (OOF) predictions
        of the 1st (or 2nd) level models and applied to test submissions, with the aim of maximizing the
        evaluation metrics.
        :param oof_input_files: dict with locations and names of train and test OOF data sets (to be used in blending)
        :param blend_bagged_results: if True, the blender uses the raw OOF predictions obtained for each seed by
                                     the selected models (and not the mean prediction over all seeds per model)
        :param train_df: pandas DF with train data set
        :param test_df: pandas DF with test data set
        :param target_column: target column (to be predicted)
        :param index_column: unique index column
        :param metrics_scorer: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
        :param metrics_decimals: round precision (decimals) of the metrics (e.g. used in printouts)
        :param target_decimals: round precision (decimals) of the target column
        :param project_location: path to the project
        :param output_dirname: name of directory where to save results of blending procedure
        """

        # Column names and metric settings
        self.index_column = index_column
        self.target_column = target_column
        self.metrics_scorer = metrics_scorer
        self.target_decimals = target_decimals
        self.metrics_decimals = metrics_decimals

        # OOF data for train and test is loaded through the ensembler (before the output directory is set up)
        ensembler = Ensembler()
        self.ensembler = ensembler
        oof_pair = ensembler.load_oof_target_and_test_data(
            oof_input_files, blend_bagged_results, train_df, test_df,
            target_column, index_column, target_decimals, project_location)
        self.train_oof, self.test_oof = oof_pair

        # Placeholders for the blending results
        self.cv_score = None  # type: float
        self.cv_std = None  # type: float
        self.cv_results = None  # type: pd.DataFrame
        self.oof_preds = None  # type: pd.DataFrame
        self.sub_preds = None  # type: pd.DataFrame

        # Normalized full path to the solution directory
        raw_output_path = os.path.join(project_location, output_dirname)
        self.path_output_dir = os.path.normpath(raw_output_path)
        create_output_dir(self.path_output_dir)