'SPEC_SUM_INSURED', 'SPEC_ITEM_PREM', 'UNSPEC_HRP_PREM', 'BEDROOMS',
                                                   'ROOF_CONSTRUCTION', 'WALL_CONSTRUCTION', 'LISTED', 'MAX_DAYS_UNOCC',
                                                   'OWNERSHIP_TYPE', 'PAYING_GUESTS', 'PROP_TYPE', 'YEARBUILT',
                                                   'MTA_FAP', 'MTA_APRP', 'LAST_ANN_PREM_GROSS', 'QUOTE_DATE_day',
                                                   'QUOTE_DATE_month', 'QUOTE_DATE_year', 'QUOTE_DATE_weekday',
                                                   'QUOTE_DATE_week', 'COVER_START_day', 'COVER_START_month',
                                                   'COVER_START_year', 'COVER_START_weekday', 'COVER_START_week',
                                                   'P1_DOB_day', 'P1_DOB_month', 'P1_DOB_year', 'P1_DOB_weekday',
                                                   'P1_DOB_week', 'MTA_DATE_day', 'MTA_DATE_month', 'MTA_DATE_year',
                                                   'MTA_DATE_weekday', 'MTA_DATE_week', 'TOTAL_SUM']
                                   ,
                                   raise_if_shape_differs=True,
                                   regex_match=False)

# Numeric imputer: replace missing values with each column's mean and append
# "is null" indicator columns so downstream models can exploit missingness.
imputer = NumImputer(
    strategy='mean',
    fix_value=0,
    add_is_null=True,
    allow_unseen_null=True,
    columns_to_use='all',
    drop_used_columns=True,
    drop_unused_columns=True,
    regex_match=False,
)

# Dummy-encode the categorical columns (claim flags, statuses, add-on flags…).
# Rare modalities are grouped according to max_cum_proba / min_nb_observations,
# and the encoded result is returned as a pandas DataFrame.
numerical_encoder = NumericalEncoder(
    columns_to_use=[
        'CLAIM3YEARS', 'P1_EMP_STATUS', 'P1_PT_EMP_STATUS', 'BUS_USE',
        'CLERICAL', 'AD_BUILDINGS', 'AD_CONTENTS', 'CONTENTS_COVER',
        'BUILDINGS_COVER', 'P1_MAR_STATUS', 'P1_POLICY_REFUSED', 'P1_SEX',
        'APPR_ALARM', 'APPR_LOCKS', 'FLOODING', 'NEIGH_WATCH', 'OCC_STATUS',
        'SAFE_INSTALLED', 'SEC_DISC_REQ', 'SUBSIDENCE', 'PAYMENT_METHOD',
        'LEGAL_ADDON_PRE_REN', 'LEGAL_ADDON_POST_REN',
        'HOME_EM_ADDON_PRE_REN', 'HOME_EM_ADDON_POST_REN',
        'GARDEN_ADDON_PRE_REN', 'GARDEN_ADDON_POST_REN',
        'KEYCARE_ADDON_PRE_REN', 'KEYCARE_ADDON_POST_REN',
        'HP1_ADDON_PRE_REN', 'HP1_ADDON_POST_REN',
        'HP2_ADDON_PRE_REN', 'HP2_ADDON_POST_REN',
        'HP3_ADDON_PRE_REN', 'HP3_ADDON_POST_REN', 'MTA_FLAG',
    ],
    encoding_type='dummy',
    desired_output_type='DataFrame',
    max_na_percentage=0.05,
    max_cum_proba=0.95,
    min_modalities_number=20,
    max_modalities_number=100,
    min_nb_observations=10,
    drop_used_columns=True,
    drop_unused_columns=True,
    regex_match=False,
)
# Exemple #2 (score: 0)
    def fit_metric_model(self):
        """Fit a surrogate "metric model" mapping job parameters to a test metric.

        Loads all finished jobs' results and parameters, merges them on
        ``job_id``, transforms the raw metric according to
        ``self.metric_transformation`` (``None`` / ``"rank"`` / ``"normal"`` /
        ``"default"``), averages across scorers when ``self.avg_metrics`` is
        set, then fits an encoder+imputer pipeline plus a
        ``RandomForestRegressor`` (and its variance estimator) on the job
        parameters.

        Returns
        -------
        self
            Always returns ``self``; exits early when too few models are done
            or when the model appears already up to date.
        """
        logger.info("start computing metric model...")

        ### Load the results
        df_results = self.result_reader.load_all_results(aggregate=True)

        # Not enough finished models yet to fit a meaningful surrogate.
        self._nb_models_done = len(df_results)
        if self._nb_models_done <= self.min_nb_of_models:
            return self

        # NOTE(review): ``self._nb_models_done`` was just assigned
        # ``len(df_results)`` above, so the equality below is always true here;
        # once ``params_training_columns`` is set this method never refits.
        # Presumably the intent was to compare against the *previous* count —
        # verify against the caller's update cycle.
        if (self._nb_models_done is not None
                and len(df_results) == self._nb_models_done
                and self.params_training_columns is not None):
            return self

        ### Load the params
        df_params = self.result_reader.load_all_params()

        # One row per job: parameters joined with their (aggregated) results.
        df_merged_result = pd.merge(df_params,
                                    df_results,
                                    how="inner",
                                    on="job_id")

        # All parameter columns except the join key become training features.
        training_cols = diff(list(df_params.columns), ["job_id"])

        # X dataframe for parameters
        dfX_params = df_merged_result.loc[:, training_cols]

        ### Retrieve the target metric

        if self.avg_metrics:
            scorers = self.job_config.scoring
        else:
            scorers = [self.job_config.main_scorer
                       ]  # I'll use only the main_scorer

        N = dfX_params.shape[0]
        all_y_params = []
        for scorer in scorers:
            y_params = df_merged_result["test_%s" %
                                        scorer]  # Retrieve the raw metric
            # replace NaN by scorer's observed minimum score ; if y_params contains
            # only NaN -> won't work
            y_params = y_params.fillna(y_params.min()).values

            if self.metric_transformation is None:
                pass

            elif self.metric_transformation == "rank":
                ### Transform into non-parametric rank ....
                y_params = kde_transfo_quantile(y_params)

                # => This behaves like a uniform law

            elif self.metric_transformation == "normal":
                ### Transform into non-parametric normal ...
                y_params = norm.ppf(kde_transfo_quantile(y_params))

                # => This behaves like a normal law

            elif self.metric_transformation == "default":
                ### Transform using default transformation (log like function)
                try:
                    f = get_metric_default_transformation(scorer)
                except ValueError:
                    # Unknown metric: fall back to the non-parametric normal
                    # transformation instead of failing the whole fit.
                    logger.info(
                        "I don't know how to transform this metric %s, I'll use default normal transformation"
                        % str(scorer))
                    f = None

                if f is None:
                    y_params = norm.ppf(kde_transfo_quantile(y_params))
                else:
                    y_params = f(y_params)

                if self.avg_metrics:
                    # If I'm averaging I'd rather have something centered
                    y_params = (y_params -
                                np.mean(y_params)) / np.std(y_params)

            else:
                raise ValueError("I don't know this metric_transformation %s" %
                                 self.metric_transformation)

            all_y_params.append(y_params.reshape((N, 1)))

        # Average the (transformed) metrics column-wise when several scorers
        # were used; otherwise just flatten the single column.
        if len(all_y_params) > 1:
            y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
        else:
            y_params = all_y_params[0].reshape((N, ))

        #        elif self.metric_transformation
        #
        #
        #        else:
        #            # On peut aussi utiliser la transformation par default ?
        #            scorer = self.job_config.main_scorer
        #            y_params = df_merged_result["test_%s" % scorer].values
        #

        # create model: encode categorical params, then impute missing values.
        transformer_model = GraphPipeline(models={
            "encoder": NumericalEncoder(),
            "imputer": NumImputer()
        },
                                          edges=[("encoder", "imputer")])

        xx_params = transformer_model.fit_transform(dfX_params)

        # Forest predicts the (transformed) metric from the job parameters.
        random_forest = RandomForestRegressor(n_estimators=100,
                                              min_samples_leaf=5)

        random_forest.fit(xx_params, y_params)

        # Variance estimate around the forest's prediction — presumably used
        # elsewhere for exploration/exploitation trade-offs; confirm with callers.
        random_forest_variance = RandomForestVariance(random_forest)
        random_forest_variance.fit(xx_params, y_params)

        # Persist the fitted artifacts and bookkeeping on the instance.
        self.params_training_columns = training_cols
        self.transformer_model = transformer_model
        self.random_forest = random_forest
        self.random_forest_variance = random_forest_variance

        self._nb_models_done = len(df_results)

        logger.info("metric model fitted")

        return self
# Exemple #3 (score: 0)
    "pass":
    PassThrough(),
    "blender":
    LogisticRegression()
},
                        edges=[("rf", "blender"), ("lgbm", "blender"),
                               ("logit", "blender"), ("pass", "blender")])

# In[]
from aikit.transformers import NumImputer, CountVectorizerWrapper, NumericalEncoder

# Stacking pipeline: encode + impute the features once, feed three
# out-of-fold base learners (RF, LightGBM, logistic regression) into a
# logistic-regression blender.
stacker = GraphPipeline(
    models={
        "enc": NumericalEncoder(),
        "imp": NumImputer(),
        "rf": OutSamplerTransformer(RandomForestClassifier(), cv=cv),
        "lgbm": OutSamplerTransformer(LGBMClassifier(), cv=cv),
        "logit": OutSamplerTransformer(LogisticRegression(), cv=cv),
        "blender": LogisticRegression(),
    },
    edges=[
        ("enc", "imp"),
        ("imp", "rf", "blender"),
        ("imp", "lgbm", "blender"),
        ("imp", "logit", "blender"),
    ],
)

stacker = GraphPipeline(models={
    "enc":