Example #1
    def __init__(self, bst_path, model_tag):
        """
        Initialize the operator.
        Args:
            bst_path: the path where the model was saved via model.save()
        """
        self.model = Booster(model_file=bst_path)
        self.model_tag = model_tag
Example #2
    @staticmethod
    def _load(properties):
        """Load a LGBMExplainableModel from the given properties.

        :param properties: A serialized dictionary representation of the LGBMExplainableModel.
        :type properties: dict
        :return: The deserialized LGBMExplainableModel.
        :rtype: azureml.explain.model.mimic.models.LGBMExplainableModel
        """
        # create the LGBMExplainableModel without any properties using the __new__ function, similar to pickle
        lightgbm = LGBMExplainableModel.__new__(LGBMExplainableModel)
        # Get _n_features
        _n_features = properties.pop(_N_FEATURES)
        # If classification case get _n_classes
        if json.loads(properties[LightGBMSerializationConstants.MULTICLASS]):
            _n_classes = properties.pop(_N_CLASSES)
        # load all of the properties
        for key, value in properties.items():
            # Regenerate the properties on the fly
            if key in LightGBMSerializationConstants.nonify_properties:
                if key == LightGBMSerializationConstants.LOGGER:
                    parent = logging.getLogger(__name__)
                    lightgbm_identity = json.loads(properties[LightGBMSerializationConstants.IDENTITY])
                    lightgbm.__dict__[key] = parent.getChild(lightgbm_identity)
                elif key == LightGBMSerializationConstants.TREE_EXPLAINER:
                    lightgbm.__dict__[key] = None
                else:
                    raise Exception("Unknown nonify key on deserialize in LightGBMExplainableModel: {}".format(key))
            elif key in LightGBMSerializationConstants.save_properties:
                # Load the booster from file and re-create the LGBMClassifier or LGBMRegressor
                # This is not recommended but can be necessary to get around pickle not being secure
                # See here for more info:
                # https://github.com/Microsoft/LightGBM/issues/1942
                # https://github.com/Microsoft/LightGBM/issues/1217
                if json.loads(properties[LightGBMSerializationConstants.MULTICLASS]):
                    new_lgbm = LGBMClassifier()
                    lgbm_booster = Booster(params={LightGBMSerializationConstants.MODEL_STR: value})
                    new_lgbm._Booster = lgbm_booster
                    new_lgbm._n_classes = _n_classes
                else:
                    new_lgbm = LGBMRegressor()
                    lgbm_booster = Booster(params={LightGBMSerializationConstants.MODEL_STR: value})
                    new_lgbm._Booster = lgbm_booster
                new_lgbm._n_features = _n_features
                lightgbm.__dict__[key] = new_lgbm
            elif key in LightGBMSerializationConstants.enum_properties:
                # NOTE: If more enums added in future, will need to handle this differently
                lightgbm.__dict__[key] = ShapValuesOutput(json.loads(value))
            else:
                lightgbm.__dict__[key] = json.loads(value)
        return lightgbm
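A note on the technique above: the comments reference loading the booster from a model string to avoid pickle. A minimal, self-contained sketch of that round trip with plain LightGBM (synthetic data; names here are illustrative, not part of the explainer code):

import lightgbm as lgb
import numpy as np

# Train a tiny booster so there is something to serialize.
X = np.random.rand(100, 4)
y = np.random.rand(100)
booster = lgb.train({"objective": "regression", "verbose": -1},
                    lgb.Dataset(X, label=y), num_boost_round=5)

# Serialize to a plain text string instead of pickling the object.
model_str = booster.model_to_string()

# Recent LightGBM versions accept model_str directly; older versions only
# accepted it inside params, which is why Example #22 below inspects the
# Booster signature before choosing a constructor call.
restored = lgb.Booster(model_str=model_str)
assert np.allclose(booster.predict(X), restored.predict(X))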
Example #3
def test_quant_data_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None):
    """
    input:(file_names,model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
        model = Booster(model_file='tmp_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            with open(fin_path, 'rb') as fin_data_file:
                stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
                data_process_logger.info('testing file: %s' % fin_path)
                input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
                mean_rank_rate = test_datas(input_datas, model)
                if mean_rank_rate >= 0.4:
                    data_analysis_logger.info('the file number is %s, obs = %s' % (i, len(input_datas)))
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception as e:
            data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
Example #4
    def multi_classifier(self):
        """An instance of pre-trained multi-temporal cloud classifier. Loaded only the first time it is required."""
        if self._multi_classifier is None:
            path = os.path.join(self.MODELS_FOLDER, self.MULTI_CLASSIFIER_NAME)
            self._multi_classifier = Booster(model_file=path)

        return self._multi_classifier
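The pattern above (cache on first access) is worth noting: the Booster is read from disk only when a prediction is actually needed. A minimal sketch of the same idea as a standalone class (folder and file names are illustrative assumptions):

import os
from lightgbm import Booster


class LazyBooster:
    MODELS_FOLDER = "models"        # assumed folder
    MODEL_NAME = "classifier.txt"   # assumed file name

    def __init__(self):
        self._booster = None

    @property
    def booster(self):
        # Load from disk on first access only, then reuse the cached object.
        if self._booster is None:
            path = os.path.join(self.MODELS_FOLDER, self.MODEL_NAME)
            self._booster = Booster(model_file=path)
        return self._booster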
Example #5
def load_lgbm_model(fname):
    """ Load a LightGBM model that was saved as a file with
        the HyperLGBMClassifier.save method.
        
        The model is split across two files:

            * The first file contains the model saved with the Booster class;
            this file has no extension.

            * The second file contains the parameters used to create the model;
            this file has the extension '.p'.
            
        Parameters
        ----------
        fname : path 
                The file name without extension.
        """
    from lightgbm import Booster
    params = pickle.load(open(fname + '.p', "rb"))
    n_features = params['meta']['n_features']
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperLGBMClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    y = [i for i in range(n_classes)]
    model.set_le(y)
    model.set_n_features_(n_features)
    model._Booster = Booster(model_file=fname)
    return model
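The save side of this two-file layout is not shown. Under the conventions the docstring describes, it would plausibly look like the following sketch (assuming model exposes its underlying booster via the scikit-learn API's booster_ attribute, and params has the {'meta': ..., 'param_map': ...} shape read above):

import pickle

def save_lgbm_model(model, params, fname):
    # File 1: the booster in LightGBM's native text format, no extension,
    # matching what load_lgbm_model passes to Booster(model_file=fname).
    model.booster_.save_model(fname)
    # File 2: the construction parameters, pickled under '<fname>.p'.
    with open(fname + '.p', 'wb') as f:
        pickle.dump(params, f)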
Example #6
def compute_importances(importances: pd.DataFrame, columns: List[str],
                        model: lgb.Booster, fold: int) -> pd.DataFrame:
    imp_df = pd.DataFrame()
    imp_df['feature'] = columns
    imp_df['gain'] = model.feature_importance('gain')
    imp_df['fold'] = fold + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    return importances
Example #7
    def classifier(self):
        """
        Provides a classifier object. It also loads it if it hasn't been loaded yet. This way the classifier is loaded
        only when it is actually required.
        """
        if self._classifier is None:
            self._classifier = PixelClassifier(Booster(model_file=self.model_filename))

        return self._classifier
Example #8
def _get_importance(model: lgb.Booster, features: List[str]) -> pd.DataFrame:
    df = pd.DataFrame()

    df["feature"] = features
    df["importance"] = model.feature_importance(
        importance_type="gain", iteration=model.best_iteration
    )

    return df
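A hypothetical call site for _get_importance, with a synthetic booster trained in-memory (feature names are illustrative):

import lightgbm as lgb
import numpy as np

X = np.random.rand(200, 3)
y = np.random.rand(200)
features = ["f0", "f1", "f2"]
model = lgb.train({"objective": "regression", "verbose": -1},
                  lgb.Dataset(X, label=y, feature_name=features),
                  num_boost_round=10)

# best_iteration is -1 without early stopping, which LightGBM treats as
# "use all trees" when computing importance.
df = _get_importance(model, features)
print(df.sort_values("importance", ascending=False))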
Example #9
def predict(X_test: pd.DataFrame, y_test, gbm: lgb.Booster):
    # predict
    pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    y_pred = []

    for x in pred:
        y_pred.append(np.argmax(x))

    # Print the precision and recall, among other metrics
    print(
        metrics.classification_report(y_test, y_pred, target_names=Categories))
Example #10
def predict(gbm: lgb.Booster, test_data: pd.DataFrame, full_data: pd.DataFrame, feature_names: List[str]):
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = last_friday.strftime('%Y-%m-%d')
    print(date_string)
    live_data = full_data.loc[date_string].copy()
    live_data.dropna(subset=feature_names, inplace=True)
    live_data[PREDICTION_NAME] = gbm.predict(live_data[feature_names])
    test_data[PREDICTION_NAME] = gbm.predict(test_data[feature_names])
    return dict(
        predicted_live_data=live_data,
        predicted_test_data=test_data
    )
Example #11
    def from_model(
        cls,
        booster: lightgbm.Booster,
        *,
        path: os.PathLike,
        preprocessor: Optional["Preprocessor"] = None,
    ) -> "LightGBMCheckpoint":
        """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores a LightGBM
        model.

        Args:
            booster: The LightGBM model to store in the checkpoint.
            path: The directory where the checkpoint will be stored.
            preprocessor: A fitted preprocessor to be applied before inference.

        Returns:
            A :py:class:`LightGBMCheckpoint` containing the specified ``booster``.

        Examples:
            >>> from ray.train.lightgbm import LightGBMCheckpoint
            >>> import lightgbm
            >>>
            >>> booster = lightgbm.Booster()  # doctest: +SKIP
            >>> checkpoint = LightGBMCheckpoint.from_model(booster, path=".")  # doctest: +SKIP # noqa: #501

            You can use a :py:class:`LightGBMCheckpoint` to create an
            :py:class:`~ray.train.lightgbm.LightGBMPredictor` and perform inference.

            >>> from ray.train.lightgbm import LightGBMPredictor
            >>>
            >>> predictor = LightGBMPredictor.from_checkpoint(checkpoint)  # doctest: +SKIP # noqa: #501
        """
        booster.save_model(os.path.join(path, MODEL_KEY))

        if preprocessor:
            save_preprocessor_to_dir(preprocessor, path)

        checkpoint = cls.from_directory(path)

        return checkpoint
Example #12
class LightgbmOperator(object):
    def __init__(self, bst_path, model_tag):
        """
        初始化
        Args:
            bst_path: 通过model.save()保存的地址
        """
        self.model = Booster(model_file=bst_path)
        self.model_tag = model_tag

    def predict(self, input_datas):
        # if not isinstance(input_datas,list) and not isinstance(input_datas,np.array):
        return self.model.predict(input_datas)
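A sketch of how the operator might be used end to end: train, save in LightGBM's native format, then reload through the class (data and tag are illustrative):

import lightgbm as lgb
import numpy as np

X = np.random.rand(100, 5)
y = np.random.rand(100)
booster = lgb.train({"objective": "regression", "verbose": -1},
                    lgb.Dataset(X, label=y), num_boost_round=5)
booster.save_model("model.txt")  # this file is the bst_path the operator expects

op = LightgbmOperator("model.txt", model_tag="demo")
print(op.model_tag, op.predict(X[:3]))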
Example #13
def to_air_checkpoint(
    path: str,
    booster: lightgbm.Booster,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to AIR checkpoint for serve or inference.

    Args:
        path: The directory path where model and preprocessor steps are stored to.
        booster: A pretrained lightgbm model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.
    Returns:
        A Ray AIR checkpoint.
    """
    booster.save_model(os.path.join(path, MODEL_KEY))

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = Checkpoint.from_directory(path)

    return checkpoint
Example #14
    def predict_single_fold(self, model: lgb.Booster,
                            dataset: TabularDataset) -> np.ndarray:
        """Predict target values for dataset.

        Args:
            model: Lightgbm object.
            dataset: test dataset.

        Return:
            predicted target values.

        """
        pred = self.task.losses['lgb'].bw_func(model.predict(dataset.data))

        return pred
Example #15
    def __init__(self, model_config_dict: dict, threads: int = 1):
        """Initialise the tree model variables used in the application of RainForests
        Calibration. LightGBM Boosters are used for tree model predictors.

        Args:
            model_config_dict:
                Dictionary containing Rainforests model configuration variables.
            threads:
                Number of threads to use during prediction with tree-model objects.

        Dictionary is of format::

            {
                "-50.0" : {
                    "lightgbm_model" : "<path_to_lightgbm_model_object>"
                },
                "-25.0" : {
                    "lightgbm_model" : "<path_to_lightgbm_model_object>"
                },
                ...,
                "50.0" : {
                    "lightgbm_model" : "<path_to_lightgbm_model_object>"
                }
            }

        The keys specify the error threshold value, while the associated values
        are the path to the corresponding tree-model objects for that threshold.
        """
        from lightgbm import Booster

        # Dictionary keys represent error thresholds; however, they may be strings
        # as they are sourced from json files. In order to use these in processing,
        # and to sort them in a sensible fashion, we cast the key values to float32.
        sorted_model_config_dict = OrderedDict(
            sorted({np.float32(k): v
                    for k, v in model_config_dict.items()}.items()))

        self.error_thresholds = np.array([*sorted_model_config_dict.keys()])

        lightgbm_model_filenames = [
            Path(threshold_dict.get("lightgbm_model")).expanduser()
            for threshold_dict in sorted_model_config_dict.values()
        ]
        self.tree_models = [
            Booster(model_file=str(file)).reset_parameter(
                {"num_threads": threads}) for file in lightgbm_model_filenames
        ]
Example #16
def get_surrogate_booster_pyspark(filtered_df, analyzer, max_depth, num_leaves,
                                  min_child_samples):
    """Get surrogate booster for pyspark dataframe.

    Creates the surrogate model trained on errors and returns the booster.

    :param filtered_df: The filtered dataframe.
    :type filtered_df: pyspark.sql.DataFrame
    :param analyzer: The error analyzer containing the categorical
        features and categories for the full dataset.
    :type analyzer: BaseAnalyzer
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :param min_child_samples: The minimal number of data required to
        create one leaf.
    :type min_child_samples: int
    :return: The extracted booster from the surrogate model and the
        scored dataset.
    :rtype: (Booster, pyspark.sql.DataFrame)
    """
    # compute the pred_y column
    scored_data = analyzer.model.transform(filtered_df.to_spark())
    diff_data = scored_data.withColumn(
        DIFF,
        F.when(F.col(analyzer.true_y) != F.col(PREDICTION), 1).otherwise(0))
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff_data = diff_data.drop(PREDICTION, RAW_PREDICTION, PROBABILITY)
    else:
        diff_data = diff_data.drop(PREDICTION)
    model = create_surrogate_model_pyspark(analyzer, diff_data, max_depth,
                                           num_leaves, min_child_samples)
    # TODO: update lightgbm in pyspark to get around file requirement
    model_path = "./models/lgbmclassifier.model"
    model.saveNativeModel(model_path)
    model_file = glob.glob(model_path + '/*.txt')[0]
    with open(model_file) as f:
        contents = f.read()
    booster_args = {'objective': analyzer.model_task}

    lgbm_booster = Booster(params=booster_args, model_str=contents)
    return lgbm_booster, diff_data.to_koalas()
Example #17
def predict(
    cv_num: int, sp: Split, model: lgb.Booster, model_number: Optional[int] = None
) -> pd.DataFrame:
    config = Config()
    d_start: int = config.CV_START_DAYS[cv_num]
    d_end: int = config.CV_START_DAYS[cv_num] + 28
    test_pred = sp.test.copy()
    test_pred[config.TARGET + "_true"] = test_pred[config.TARGET]

    test_pred.loc[test_pred.d >= d_start, config.TARGET] = np.nan
    for d in tqdm(range(d_start, d_end)):
        test_pred = make_rolling_for_test(test_pred, d, config.features)
        test_pred.loc[test_pred.d == d, config.TARGET] = model.predict(
            test_pred.loc[test_pred.d == d, config.features]
        )
        test_pred.loc[test_pred.d == d, "sales_is_zero"] = (
            test_pred.loc[test_pred.d == d, "sales"] == 0
        ).astype(np.int8)

    return test_pred
Example #18
def parallel_test_quant_data_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None,
                                     process_count=2):
    """
    input:(file_names,model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
    else:
        model.save_model('tmp_model.txt')
    global g_model
    g_model = Booster(model_file='tmp_model.txt')
    proc_pool = multiprocessing.Pool(process_count)
    multi_result = []
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        data_res = proc_pool.apply_async(test_single_file, args=(fin_path,))
        multi_result.append(data_res)
    proc_pool.close()
    proc_pool.join()
    # Merge the results
    for i in range(len(multi_result)):
        tmp_mean_rank_rate, file_n = multi_result[i].get()
        mean_rank_rates.append(tmp_mean_rank_rate)
        file_number_list.append(file_n)
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s' % (
            len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates
Example #19
def pipeline_test_lambdarank_wrapper(input_file_numbers,
                                     model,
                                     normalize=True,
                                     predict_iteration=None):
    """
    Test results for the given input files.
    Args:
        input_file_numbers:
        model:
        normalize:
        predict_iteration:

    Returns:

    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_lambdarank_model.txt',
                         num_iteration=predict_iteration)
        model = Booster(model_file='tmp_lambdarank_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (
                data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            mean_rank_rate = test_single_lambdarank_file(fin_path, model)
            if mean_rank_rate:
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception as e:
            data_process_logger.info(
                'test file failed: file path=%s, details=%s' % (fin_path, e))
Example #20
def save_lgb(model: lgb.Booster, path):
    model_str = model.model_to_string()
    with open(path, 'w') as f:
        f.write(model_str)
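The matching loader is short, since the saved text is exactly what the model_str constructor argument expects (a sketch mirroring save_lgb above):

import lightgbm as lgb

def load_lgb(path) -> lgb.Booster:
    with open(path, 'r') as f:
        model_str = f.read()
    # Rebuild the booster from its string representation.
    return lgb.Booster(model_str=model_str)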
Example #21
def predict(
    m_xgb: xgboost.XGBClassifier,
    m_lgbm: lightgbm.Booster,
    test: pd.DataFrame,
    test_previous: pd.DataFrame,
    user_summary: "UserSummary",
    question_features: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Predict the probability that the user will answer the current question correctly.

    Parameters
    ----------
    m_xgb: An xgboost classifier (its prediction call is currently commented out).
    m_lgbm: The trained LightGBM booster used to generate predictions.
    test: The test data for which to generate predictions.
    test_previous: The previous group of test data observations, used to update
        user summary statistics.
    user_summary: A UserSummary object containing user features, that can be updated
        with incoming data.
    question_features: Question features to join on content_id.

    Returns
    -------
    A tuple of (prediction dataframe, timer dataframe). The timer dataframe is produced
    to help identify bottlenecks in the prediction pipeline that may cause a timeout
    on Kaggle.
    """
    timer = {}
    if test_previous is not None:
        tic = datetime.utcnow()
        newdata = process_test_observations(test, test_previous,
                                            question_features)
        toc = datetime.utcnow()
        timer["process_test_observations"] = (toc - tic).total_seconds()

        tic = datetime.utcnow()
        user_summary.update(newdata)
        toc = datetime.utcnow()
        timer["update_user_summary"] = (toc - tic).total_seconds()

    test = test.loc[test["content_type_id"] == 0].drop(
        columns="content_type_id")
    tic = datetime.utcnow()
    test = pd.merge(
        test,
        question_features,
        how="left",
        left_on="content_id",
        right_index=True,
        copy=False,
    )
    toc = datetime.utcnow()
    timer["merge_question_features"] = (toc - tic).total_seconds()

    tic = datetime.utcnow()
    required_columns = [
        k for k in constants.USER_SUMMARY_SCHEMA.keys() if k != "user_id"
    ]
    for col in required_columns:
        test[col] = [
            user_summary.get_feature(user_id, col)
            for user_id in test["user_id"]
        ]
    calculate_user_features(test, inplace=True)
    toc = datetime.utcnow()
    timer["merge_user_features"] = (toc - tic).total_seconds()

    tic = datetime.utcnow()
    # test["answered_correctly"] = m_xgb.predict_proba(test[constants.TRAIN_COLS])[:, 1]
    test["answered_correctly"] = m_lgbm.predict(test[constants.TRAIN_COLS])
    toc = datetime.utcnow()
    timer["prediction"] = (toc - tic).total_seconds()

    return test, pd.DataFrame(timer, index=[0])
Example #22
    @staticmethod
    def _load(properties):
        """Load a LGBMExplainableModel from the given properties.

        :param properties: A serialized dictionary representation of the LGBMExplainableModel.
        :type properties: dict
        :return: The deserialized LGBMExplainableModel.
        :rtype: interpret_community.mimic.models.LGBMExplainableModel
        """
        # create the LGBMExplainableModel without any properties using the __new__ function, similar to pickle
        lgbm_model = LGBMExplainableModel.__new__(LGBMExplainableModel)
        # Get _n_features
        _n_features = properties.pop(_N_FEATURES)
        # If classification case get _n_classes
        if json.loads(properties[LightGBMSerializationConstants.MULTICLASS]):
            _n_classes = properties.pop(_N_CLASSES)
        fitted_ = None
        if _FITTED in properties:
            fitted_ = json.loads(properties[_FITTED])
        elif version.parse('3.3.1') <= version.parse(lightgbm.__version__):
            # If deserializing an older model in a newer version, set this to True to prevent errors on calls
            fitted_ = True
        # load all of the properties
        for key, value in properties.items():
            # Regenerate the properties on the fly
            if key in LightGBMSerializationConstants.nonify_properties:
                if key == LightGBMSerializationConstants.LOGGER:
                    parent = logging.getLogger(__name__)
                    lightgbm_identity = json.loads(
                        properties[LightGBMSerializationConstants.IDENTITY])
                    lgbm_model.__dict__[key] = parent.getChild(
                        lightgbm_identity)
                elif key == LightGBMSerializationConstants.TREE_EXPLAINER:
                    lgbm_model.__dict__[key] = None
                else:
                    raise Exception(
                        "Unknown nonify key on deserialize in LightGBMExplainableModel: {}"
                        .format(key))
            elif key in LightGBMSerializationConstants.save_properties:
                # Load the booster from file and re-create the LGBMClassifier or LGBMRegressor
                # This is not recommended but can be necessary to get around pickle not being secure
                # See here for more info:
                # https://github.com/Microsoft/LightGBM/issues/1942
                # https://github.com/Microsoft/LightGBM/issues/1217
                booster_args = {
                    LightGBMSerializationConstants.MODEL_STR: value
                }
                is_multiclass = json.loads(
                    properties[LightGBMSerializationConstants.MULTICLASS])
                if is_multiclass:
                    objective = LightGBMSerializationConstants.MULTICLASS
                else:
                    objective = LightGBMSerializationConstants.REGRESSION
                if LightGBMSerializationConstants.MODEL_STR in inspect.getfullargspec(
                        Booster).args:
                    extras = {
                        LightGBMSerializationConstants.OBJECTIVE: objective
                    }
                    lgbm_booster = Booster(**booster_args, params=extras)
                else:
                    # For backwards compatibility with older versions of lightgbm
                    booster_args[
                        LightGBMSerializationConstants.OBJECTIVE] = objective
                    lgbm_booster = Booster(params=booster_args)
                if is_multiclass:
                    new_lgbm = LGBMClassifier()
                    new_lgbm._Booster = lgbm_booster
                    new_lgbm._n_classes = _n_classes
                else:
                    new_lgbm = LGBMRegressor()
                    new_lgbm._Booster = lgbm_booster
                # Specify fitted_ for newer versions of lightgbm on deserialize
                if fitted_ is not None:
                    new_lgbm.fitted_ = fitted_
                new_lgbm._n_features = _n_features
                lgbm_model.__dict__[key] = new_lgbm
            elif key in LightGBMSerializationConstants.enum_properties:
                # NOTE: If more enums added in future, will need to handle this differently
                lgbm_model.__dict__[key] = ShapValuesOutput(json.loads(value))
            else:
                lgbm_model.__dict__[key] = json.loads(value)
        return lgbm_model
Example #23
    def model_evaluate(self,
                       dt: pd.DataFrame,
                       prob: float = 0.5,
                       model: lgb.Booster = None):
        """
        Evaluate model on given data frame.

        Produce probability plots, AUC, average PR, F1, Precision, Recall and confusion matrix.

        Args:
            dt: data frame with labels and scores to evaluate
            prob: threshold to count probabilities as ones
            model: model to evaluate
        """
        if not model:
            model = self.lgb_model

        dt_eval = dt
        dt_eval["preds"] = model.predict(dt_eval[model.feature_name()])
        dt_eval["preds"].head()

        sns.distplot(dt_eval["preds"], axlabel='Full distribution')
        plt.show()
        sns.distplot(dt_eval.loc[dt_eval['label'] == 1, "preds"],
                     axlabel='Ones distribution')
        plt.show()
        sns.distplot(dt_eval.loc[dt_eval['label'] == 0, "preds"],
                     axlabel='Zeros distribution')
        plt.show()
        sns.distplot(dt_eval.loc[dt_eval['label'] == 1, "preds"],
                     axlabel='Ones distribution',
                     kde=False)
        sns.distplot(dt_eval.loc[dt_eval['label'] == 0, "preds"],
                     axlabel='Zeros distribution',
                     kde=False)
        plt.show()

        preds = [0 if x < prob else 1 for x in dt_eval["preds"]]
        cm = confusion_matrix(dt_eval['label'].values, preds)
        df_cm = pd.DataFrame(cm)
        sns.heatmap(df_cm, annot=True)
        plt.show()

        a_score = accuracy_score(dt_eval['label'].values,
                                 preds,
                                 normalize=True)
        print("Accuracy score: {}\n".format(a_score))

        class_report = classification_report(dt_eval['label'].values,
                                             preds,
                                             target_names=["Zeros", "Ones"])
        print(class_report)

        total = sum(dt_eval['label'].values)
        predicted = sum(preds)
        print("Total positive labels: {}. Positive labels predicted: {}\n".
              format(total, predicted))

        average_precision = average_precision_score(dt_eval['label'],
                                                    dt_eval['preds'])
        print('Average precision-recall score: {0:0.2f}'.format(
            average_precision))

        precision, recall, _ = precision_recall_curve(dt_eval['label'],
                                                      dt_eval['preds'],
                                                      pos_label=1)

        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
            average_precision))
        plt.show()
Example #24
import pandas as pd
import numpy as np
import json
from lightgbm import Booster
from flask import Flask, jsonify, request, current_app

app = Flask(__name__)
app.config['MODEL'] = Booster(model_file="model.txt")


@app.route("/predict", methods=["GET", "POST"])
def predict():
    data = {"success": False}

    df = request.json
    if df is not None:

        df = pd.read_json(df)
        df["primary_use"] = pd.Categorical(df["primary_use"])
        df["meter"] = pd.Categorical(df["meter"])
        df["hour"] = pd.Categorical(df["hour"])
        df["weekday"] = pd.Categorical(df["weekday"])

        model = current_app.config['MODEL']
        data = {
            "success": True,
            "prediction": np.expm1(model.predict(df)).tolist()
        }

    return jsonify(data)
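A hypothetical client for this endpoint, assuming the app runs locally on port 5000 and the frame carries whatever feature columns model.txt was trained on:

import pandas as pd
import requests

df = pd.DataFrame({
    "primary_use": ["Education"],
    "meter": [0],
    "hour": [14],
    "weekday": [2],
    # ... plus the remaining feature columns the model expects
})

# The server calls pd.read_json on the posted payload, so send the
# frame serialized as a JSON string.
resp = requests.post("http://localhost:5000/predict", json=df.to_json())
print(resp.json())  # e.g. {"success": true, "prediction": [...]}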
Example #25
def mean_match_function_kdtree_cat(
    mmc,
    model: Booster,
    bachelor_features,
    candidate_values,
    random_state,
    hashed_seeds,
    candidate_preds=None,
):
    """
    This mean matching function selects categorical features by performing nearest
    neighbors on the output class probabilities. This tends to be more accurate, but
    takes more time, especially for variables with a large number of classes.

    This function is slower for categorical datatypes, but results in better imputations.

        .. code-block:: text

            Mean match procedure for different datatypes:
                Categorical:
                    If mmc = 0, the class with the highest probability is chosen.
                    If mmc > 0, get N nearest neighbors from class probabilities.
                        Select 1 at random.
                Numeric:
                    If mmc = 0, the predicted value is used
                    If mmc > 0, obtain the mmc closest candidate
                        predictions and collect the associated
                        real candidate values. Choose 1 randomly.

    Parameters
    ----------
    mmc: int
        The number of mean matching candidates (derived from mean_match_candidates parameter)
    model: lgb.Booster
        The model that was trained.
    candidate_preds: pd.DataFrame or np.ndarray
        The predictions associated with the candidate values.
        If mmc == 0, this will be None.
    bachelor_features: pd.DataFrame or np.ndarray
        The features corresponding to the missing values of the response variable used to train
        the model.
    candidate_values:  pd.Series or np.ndarray
        The real (not predicted) values of the candidates from the original dataset.
        Will be 1D
        If the feature is pandas categorical, this will be the category codes.
    random_state: np.random.RandomState
        The random state from the process calling this function is passed.
    hashed_seeds: None, np.ndarray (int32)
        Used to make imputations deterministic at the record level. If this array
        is passed, random_state is ignored in favor of these seeds. These seeds are
        derived as a hash of the random_seed_array passed to the imputation functions.
        The distribution of these seeds is uniform enough.

    Returns
    -------
    The imputation values.
    Must be np.ndarray of shape (n,), where n is the length of dimension 1 of bachelor_features.
    If the feature is categorical, return its category code (integer corresponding to its category).

    """

    objective = model.params["objective"]
    assert objective in _REGRESSIVE_OBJECTIVES + _CATEGORICAL_OBJECTIVES, (
        "lightgbm objective not recognized - please check for aliases or " +
        "define a custom mean matching function to handle this objective.")

    # Need these no matter what.
    bachelor_preds = model.predict(bachelor_features)

    if mmc == 0:

        if objective in _REGRESSIVE_OBJECTIVES:

            imp_values = bachelor_preds

        elif objective == "binary":

            imp_values = np.floor(bachelor_preds + 0.5)

        elif objective in ["multiclass", "multiclassova"]:

            imp_values = np.argmax(bachelor_preds, axis=1)

    else:

        if objective in _REGRESSIVE_OBJECTIVES:

            imp_values = _mean_match_reg(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

        elif objective == "binary":

            bachelor_preds = logodds(bachelor_preds)

            imp_values = _mean_match_reg(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

        elif objective in ["multiclass", "multiclassova"]:

            # inner_predict returns a flat array, need to reshape for KDTree
            bachelor_preds = logodds(bachelor_preds)

            imp_values = _mean_match_multiclass_accurate(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

    return imp_values
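The numeric branch of the procedure described in the docstring reduces to a nearest-neighbour lookup on predictions. A standalone sketch of that idea using scipy's cKDTree (an illustration of the technique, not miceforest's internal implementation):

import numpy as np
from scipy.spatial import cKDTree

def mean_match_sketch(mmc, bachelor_preds, candidate_preds, candidate_values,
                      random_state):
    # Find the mmc candidates whose predictions are closest to each bachelor
    # prediction, then draw one of the associated real values at random.
    tree = cKDTree(candidate_preds.reshape(-1, 1))
    _, neighbor_idx = tree.query(bachelor_preds.reshape(-1, 1), k=mmc)
    chosen = random_state.randint(mmc, size=len(bachelor_preds))
    return candidate_values[neighbor_idx[np.arange(len(bachelor_preds)), chosen]]

rs = np.random.RandomState(0)
candidate_preds = rs.rand(100)
candidate_values = rs.rand(100)
bachelor_preds = rs.rand(10)
print(mean_match_sketch(5, bachelor_preds, candidate_preds, candidate_values, rs))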
Example #26
    def load(self):
        with open(self.path_to_data, 'rb') as f:
            self.weekday_mean_data, self.hour_average = pickle.load(f)
        self.booster = Booster(model_file=self.path_to_weights)
Example #27
def booster_fixture():
    package_path = os.path.dirname(s2cloudless.__file__)
    model_path = os.path.join(package_path, 'models', MODEL_FILENAME)
    return Booster(model_file=model_path)
Example #28
    def predict(booster: lgb.Booster,
                dtest: pd.DataFrame,
                dist: str,
                pred_type: str,
                n_samples: int = 1000,
                quantiles: list = [0.1, 0.5, 0.9],
                seed: int = 123):
        '''A customized lightgbmlss prediction function.

        booster: lgb.Booster
            Trained LightGBMLSS-Model
        dtest: pd.DataFrame
            Test Data
        dist: str
            Specifies the distributional assumption.
        pred_type: str
            Specifies what is to be predicted:
                "response" draws n_samples from the predicted response distribution.
                "quantile" calculates the quantiles from the predicted response distribution.
                "parameters" returns the predicted distributional parameters.
                "expectiles" returns the predicted expectiles.
        n_samples: int
            If pred_type="response" specifies how many samples are drawn from the predicted response distribution.
        quantiles: list
            If pred_type="quantiles" calculates the quantiles from the predicted response distribution.
        seed: int
            If pred_type="response" specifies the seed for drawing samples from the predicted response distribution.

        '''

        dict_param = dist.param_dict()
        predt = booster.predict(dtest, raw_score=True)

        # Set init_score as starting point for each distributional parameter.
        init_score_pred = (np.ones(shape=(dtest.shape[0],
                                          1))) * dist.start_values

        dist_params_predts = []

        # The prediction result doesn't include the init_score specified in creating the train data.
        # Hence, it needs to be added manually with the corresponding transform for each distributional parameter.
        for i, (dist_param, response_fun) in enumerate(dict_param.items()):
            dist_params_predts.append(
                response_fun(predt[:, i] + init_score_pred[:, i]))

        dist_params_df = pd.DataFrame(dist_params_predts).T
        dist_params_df.columns = dict_param.keys()

        if pred_type == "parameters":
            return dist_params_df

        elif pred_type == "expectiles":
            return dist_params_df

        elif pred_type == "response":
            pred_resp_df = dist.pred_dist_rvs(pred_params=dist_params_df,
                                              n_samples=n_samples,
                                              seed=seed)

            pred_resp_df.columns = [
                str("y_pred_sample_") + str(i)
                for i in range(pred_resp_df.shape[1])
            ]
            return pred_resp_df

        elif pred_type == "quantiles":
            pred_quant_df = dist.pred_dist_quantile(quantiles=quantiles,
                                                    pred_params=dist_params_df)

            pred_quant_df.columns = [
                str("quant_") + str(quantiles[i])
                for i in range(len(quantiles))
            ]
            return pred_quant_df
Example #29
def get_num_trees(booster: lgbm.Booster) -> int:
    return booster.current_iteration()
Example #30
    def __init__(self, model_config_dict: dict, threads: int):
        """Initialise the tree model variables used in the application of RainForests
        Calibration.

        Args:
            model_config_dict:
                Dictionary containing Rainforests model configuration variables.
            threads:
                Number of threads to use during prediction with tree-model objects.

        Dictionary is of format::

            {
                "-50.0" : {
                    "lightgbm_model" : "<path_to_lightgbm_model_object>",
                    "treelite_model" : "<path_to_treelite_model_object>"
                },
                "-25.0" : {
                    "lightgbm_model" : "<path_to_lightgbm_model_object>",
                    "treelite_model" : "<path_to_treelite_model_object>"
                },
                ...,
                "50.0" : {
                    "lightgbm_model" : "<path_to_lightgbm_model_object>",
                    "treelite_model" : "<path_to_treelite_model_object>"
                }
            }

        The keys specify the error threshold value, while the associated values
        are the path to the corresponding tree-model objects for that threshold.

        Treelite predictors are used if treelite_runtime is an installed dependency
        and an associated path has been provided for all thresholds, otherwise lightgbm
        Boosters are used as the default tree model type.
        """
        from lightgbm import Booster

        try:
            from treelite_runtime import Predictor
        except ModuleNotFoundError:
            warnings.warn(
                "Module treelite_runtime unavailable. Defaulting to using lightgbm Boosters."
            )
            self.treelite_enabled = False
        else:
            self.treelite_enabled = True

        # Dictionary keys represent error thresholds; however, they may be strings
        # as they are sourced from json files. In order to use these in processing,
        # and to sort them in a sensible fashion, we cast the key values to float32.
        sorted_model_config_dict = OrderedDict(
            sorted({np.float32(k): v for k, v in model_config_dict.items()}.items())
        )

        self.error_thresholds = np.array([*sorted_model_config_dict.keys()])

        lightgbm_model_filenames = [
            threshold_dict.get("lightgbm_model")
            for threshold_dict in sorted_model_config_dict.values()
        ]
        treelite_model_filenames = [
            threshold_dict.get("treelite_model")
            for threshold_dict in sorted_model_config_dict.values()
        ]
        if (None not in treelite_model_filenames) and self.treelite_enabled:
            self.tree_models = [
                Predictor(libpath=file, verbose=False, nthread=threads)
                for file in treelite_model_filenames
            ]
        else:
            if None in lightgbm_model_filenames:
                raise ValueError(
                    "Path to lightgbm model missing for one or more error thresholds "
                    "in model_config_dict."
                )
            self.tree_models = [
                Booster(model_file=file).reset_parameter({"num_threads": threads})
                for file in lightgbm_model_filenames
            ]