'SPEC_SUM_INSURED', 'SPEC_ITEM_PREM', 'UNSPEC_HRP_PREM', 'BEDROOMS', 'ROOF_CONSTRUCTION', 'WALL_CONSTRUCTION', 'LISTED', 'MAX_DAYS_UNOCC', 'OWNERSHIP_TYPE', 'PAYING_GUESTS', 'PROP_TYPE', 'YEARBUILT', 'MTA_FAP', 'MTA_APRP', 'LAST_ANN_PREM_GROSS', 'QUOTE_DATE_day', 'QUOTE_DATE_month', 'QUOTE_DATE_year', 'QUOTE_DATE_weekday', 'QUOTE_DATE_week', 'COVER_START_day', 'COVER_START_month', 'COVER_START_year', 'COVER_START_weekday', 'COVER_START_week', 'P1_DOB_day', 'P1_DOB_month', 'P1_DOB_year', 'P1_DOB_weekday', 'P1_DOB_week', 'MTA_DATE_day', 'MTA_DATE_month', 'MTA_DATE_year', 'MTA_DATE_weekday', 'MTA_DATE_week', 'TOTAL_SUM'] , raise_if_shape_differs=True, regex_match=False) imputer = NumImputer(add_is_null=True, allow_unseen_null=True, columns_to_use='all', drop_unused_columns=True, drop_used_columns=True, fix_value=0, regex_match=False, strategy='mean') numerical_encoder = NumericalEncoder( columns_to_use=['CLAIM3YEARS', 'P1_EMP_STATUS', 'P1_PT_EMP_STATUS', 'BUS_USE', 'CLERICAL', 'AD_BUILDINGS', 'AD_CONTENTS', 'CONTENTS_COVER', 'BUILDINGS_COVER', 'P1_MAR_STATUS', 'P1_POLICY_REFUSED', 'P1_SEX', 'APPR_ALARM', 'APPR_LOCKS', 'FLOODING', 'NEIGH_WATCH', 'OCC_STATUS', 'SAFE_INSTALLED', 'SEC_DISC_REQ', 'SUBSIDENCE', 'PAYMENT_METHOD', 'LEGAL_ADDON_PRE_REN', 'LEGAL_ADDON_POST_REN', 'HOME_EM_ADDON_PRE_REN', 'HOME_EM_ADDON_POST_REN', 'GARDEN_ADDON_PRE_REN', 'GARDEN_ADDON_POST_REN', 'KEYCARE_ADDON_PRE_REN', 'KEYCARE_ADDON_POST_REN', 'HP1_ADDON_PRE_REN', 'HP1_ADDON_POST_REN', 'HP2_ADDON_PRE_REN', 'HP2_ADDON_POST_REN', 'HP3_ADDON_PRE_REN', 'HP3_ADDON_POST_REN', 'MTA_FLAG'], desired_output_type='DataFrame', drop_unused_columns=True, drop_used_columns=True, encoding_type='dummy', max_cum_proba=0.95, max_modalities_number=100, max_na_percentage=0.05, min_modalities_number=20, min_nb_observations=10, regex_match=False)
def fit_metric_model(self):
    """Fit the 'metric model' used to guide the auto-ml search.

    Loads all finished jobs (results + params), transforms the chosen
    metric(s), and fits a RandomForest (plus its variance estimator) that
    predicts the metric from the job hyper-parameters.

    Returns
    -------
    self
        Fitted in place; sets ``params_training_columns``,
        ``transformer_model``, ``random_forest``,
        ``random_forest_variance`` and ``_nb_models_done``.

    Side effects: reads from ``self.result_reader``; logs progress.
    """
    logger.info("start computing metric model...")

    ### Load the results
    df_results = self.result_reader.load_all_results(aggregate=True)
    nb_models_done = len(df_results)

    # Not enough finished models yet to fit anything useful.
    if nb_models_done <= self.min_nb_of_models:
        self._nb_models_done = nb_models_done
        return self

    # BUG FIX: compare against the *previous* count BEFORE overwriting
    # self._nb_models_done. The original assigned first, which made this
    # guard always true once fitted, so the model was never refit when
    # new results arrived.
    if (
        self._nb_models_done is not None
        and nb_models_done == self._nb_models_done
        and self.params_training_columns is not None
    ):
        # Nothing new since the last fit -> keep the existing model.
        return self

    ### Load the params
    df_params = self.result_reader.load_all_params()

    df_merged_result = pd.merge(df_params, df_results, how="inner", on="job_id")

    # Every params column except the join key is a training feature.
    training_cols = diff(list(df_params.columns), ["job_id"])

    # X dataframe for parameters
    dfX_params = df_merged_result.loc[:, training_cols]

    ### Retrieve the target metric(s)
    if self.avg_metrics:
        scorers = self.job_config.scoring
    else:
        scorers = [self.job_config.main_scorer]  # only the main scorer

    N = dfX_params.shape[0]
    all_y_params = []
    for scorer in scorers:
        # Raw metric for this scorer.
        y_params = df_merged_result["test_%s" % scorer]

        # Replace NaN by the scorer's observed minimum score.
        # NOTE(review): if y_params contains only NaN this won't work.
        y_params = y_params.fillna(y_params.min()).values

        if self.metric_transformation is None:
            pass

        elif self.metric_transformation == "rank":
            ### Transform into a non-parametric rank:
            # => behaves like a uniform law
            y_params = kde_transfo_quantile(y_params)

        elif self.metric_transformation == "normal":
            ### Transform into a non-parametric normal:
            # => behaves like a normal law
            y_params = norm.ppf(kde_transfo_quantile(y_params))

        elif self.metric_transformation == "default":
            ### Default transformation for this metric (log-like function)
            try:
                f = get_metric_default_transformation(scorer)
            except ValueError:
                logger.info(
                    "I don't know how to transform this metric %s, I'll use default normal transformation"
                    % str(scorer)
                )
                f = None

            if f is None:
                # Fall back to the non-parametric normal transformation.
                y_params = norm.ppf(kde_transfo_quantile(y_params))
            else:
                y_params = f(y_params)

            if self.avg_metrics:
                # When averaging several metrics, center/scale each one
                # so they contribute comparably.
                y_params = (y_params - np.mean(y_params)) / np.std(y_params)

        else:
            raise ValueError(
                "I don't know this metric_transformation %s" % self.metric_transformation
            )

        all_y_params.append(y_params.reshape((N, 1)))

    if len(all_y_params) > 1:
        y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
    else:
        y_params = all_y_params[0].reshape((N,))

    # Create the model: encode + impute the hyper-parameters, then fit a
    # forest and its variance estimator on the transformed metric.
    transformer_model = GraphPipeline(
        models={"encoder": NumericalEncoder(), "imputer": NumImputer()},
        edges=[("encoder", "imputer")],
    )

    xx_params = transformer_model.fit_transform(dfX_params)

    random_forest = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)
    random_forest.fit(xx_params, y_params)

    random_forest_variance = RandomForestVariance(random_forest)
    random_forest_variance.fit(xx_params, y_params)

    self.params_training_columns = training_cols
    self.transformer_model = transformer_model
    self.random_forest = random_forest
    self.random_forest_variance = random_forest_variance
    self._nb_models_done = len(df_results)

    logger.info("metric model fitted")

    return self
"pass": PassThrough(), "blender": LogisticRegression() }, edges=[("rf", "blender"), ("lgbm", "blender"), ("logit", "blender"), ("pass", "blender")]) # In[] from aikit.transformers import NumImputer, CountVectorizerWrapper, NumericalEncoder stacker = GraphPipeline(models={ "enc": NumericalEncoder(), "imp": NumImputer(), "rf": OutSamplerTransformer(RandomForestClassifier(), cv=cv), "lgbm": OutSamplerTransformer(LGBMClassifier(), cv=cv), "logit": OutSamplerTransformer(LogisticRegression(), cv=cv), "blender": LogisticRegression() }, edges=[("enc", "imp"), ("imp", "rf", "blender"), ("imp", "lgbm", "blender"), ("imp", "logit", "blender")]) stacker = GraphPipeline(models={ "enc":