Esempio n. 1
0
def test_NumericalEncoder_default_and_null_values():
    """With max_na_percentage=0 a __null__ modality is created; unseen values
    fall back to __default__ and missing values to __null__ at transform time."""
    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    df.index = np.arange(len(df))

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df.loc[0:10, "cat_col_1"] = None

    # All modalities are kept, __null__ category is created
    encoder = NumericalEncoder(
        encoding_type="num",
        min_modalities_number=2,
        max_cum_proba=0.8,
        max_na_percentage=0,
    )
    encoder.fit_transform(df)

    mapping = encoder.model.variable_modality_mapping["cat_col_1"]
    assert "__default__" in mapping
    assert "__null__" in mapping

    # A value never seen during fit must be encoded as __default__ ...
    df["cat_col_1"] = "zzz"
    transformed = encoder.transform(df)
    assert transformed["cat_col_1"].unique()[0] == mapping["__default__"]

    # ... and a missing value as __null__.
    df["cat_col_1"] = None
    transformed = encoder.transform(df)
    assert transformed["cat_col_1"].unique()[0] == mapping["__null__"]
Esempio n. 2
0
def test_NumericalEncoder_nothing_to_do():
    """Purely numerical columns must pass through the encoder unchanged."""
    df = get_sample_df(100)[["float_col", "int_col"]]

    transformed = NumericalEncoder().fit_transform(df)

    # Values and dtypes are both preserved byte-for-byte.
    assert (df.values == transformed.values).all().all()
    assert (df.dtypes == transformed.dtypes).all()
Esempio n. 3
0
def test_NumericalEncoder_int_as_cat():
    """An integer column cast to 'category' dtype must be dummy-encoded."""
    df = get_sample_df(100)[["float_col", "int_col"]]
    df["int_cat"] = np.random.choice((0, 1, 2), 100)
    df["int_cat"] = df["int_cat"].astype("category")

    transformed = NumericalEncoder().fit_transform(df)

    # The raw column is replaced by one dummy column per modality.
    assert "int_cat" not in transformed.columns
    assert transformed.shape[1] == df["int_cat"].nunique() + 2
    mask = df["int_cat"] == 1
    assert df.loc[mask, "int_cat"].shape[0] == mask.sum()
Esempio n. 4
0
def test_NumericalEncoder_int_as_cat():
    """An integer column with 'category' dtype gets one-hot encoded.

    NOTE(review): this redefines ``test_NumericalEncoder_int_as_cat`` — if
    both definitions live in one module, this one shadows the earlier one.
    """
    df = get_sample_df(100)[['float_col', 'int_col']]
    df['int_cat'] = np.random.choice((0, 1, 2), 100)
    df['int_cat'] = df['int_cat'].astype('category')

    encoded = NumericalEncoder().fit_transform(df)

    assert 'int_cat' not in encoded.columns
    expected_width = df['int_cat'].nunique() + 2
    assert encoded.shape[1] == expected_width
    selector = df['int_cat'] == 1
    assert df.loc[selector, 'int_cat'].shape[0] == selector.sum()
Esempio n. 5
0
def test_NumericalEncoder_with_boolean():
    """A boolean column is dummified into c__True / c__False int32 columns."""
    dfX = pd.DataFrame({"c": [True, False] * 200})

    encoded = NumericalEncoder().fit_transform(dfX)

    for dummy in ("c__True", "c__False"):
        assert dummy in encoded.columns
        assert encoded[dummy].dtype == np.int32

    # Each dummy fires exactly where the boolean matches its modality.
    assert ((encoded["c__True"] == 1) == dfX["c"]).all()
    assert ((encoded["c__False"] == 1) == (~dfX["c"])).all()
Esempio n. 6
0
def test_NumericalEncoder_dummy_output_dtype():
    """Dummy-encoded columns must use the default int32 output dtype."""
    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    df.index = np.arange(len(df))

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    encoder = NumericalEncoder(encoding_type="dummy")
    encoder.fit(df)
    result = encoder.transform(df)

    # check default encoding type = int32
    dummy_mask = result.columns.str.startswith("cat_col_")
    assert (result.dtypes[dummy_mask] == "int32").all()
Esempio n. 7
0
def test_NumericalEncoder_with_cat_dtypes():
    """The encoding must be identical whether the input column dtype is
    'object' or 'category'."""
    np.random.seed(123)
    X = get_sample_df(100)
    X["cat_col_1"] = X["text_col"].apply(lambda s: s[0:3])

    encoder = NumericalEncoder(columns_to_use=["cat_col_1"])
    encoded_from_object = encoder.fit_transform(X)

    X_as_category = X.copy()
    X_as_category["cat_col_1"] = X_as_category["cat_col_1"].astype("category")
    encoded_from_category = encoder.fit_transform(X_as_category)

    assert encoded_from_category.shape == encoded_from_object.shape
    assert (encoded_from_category == encoded_from_object).all().all()
    assert (encoded_from_category.dtypes == encoded_from_object.dtypes).all()
Esempio n. 8
0
def test_NumericalEncoder_num_output_dtype():
    """Num-encoded categorical columns must come out with dtype int32."""
    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    ind = np.arange(len(df))
    df.index = ind

    # NOTE(review): shuffling 'ind' after assigning it as the index may mutate
    # the index in place depending on whether pandas copied the array — kept
    # verbatim to preserve behavior; confirm it is intentional.
    np.random.shuffle(ind)
    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    encoder = NumericalEncoder(encoding_type="num")
    encoder.fit(df)
    result = encoder.transform(df)

    for column in ("cat_col_1", "cat_col_2"):
        assert result.dtypes[column] == "int32"
Esempio n. 9
0
def test_NumericalEncoder_with_cat_dtypes():
    """Encoding must not depend on whether the column dtype is object or category.

    Fix: the original set ``np.random.seed(123)`` AFTER drawing the sample data
    (and an unused target ``y``), so the test data was non-deterministic. The
    seed now comes first and the dead local is removed.
    """
    np.random.seed(123)
    X = get_sample_df(100)
    X["cat_col_1"] = X["text_col"].apply(lambda s: s[0:3])

    encoder = NumericalEncoder()
    X_no_cat_dtype_encoded = encoder.fit_transform(X)

    X_cat_dtype = X.copy()
    X_cat_dtype['cat_col_1'] = X_cat_dtype['cat_col_1'].astype('category')
    X_with_cat_dtype_encoded = encoder.fit_transform(X_cat_dtype)

    # Same values and same dtypes regardless of the input dtype.
    assert (X_with_cat_dtype_encoded == X_no_cat_dtype_encoded).all().all()
    assert (X_with_cat_dtype_encoded.dtypes == X_no_cat_dtype_encoded.dtypes
            ).all()
Esempio n. 10
0
def test_NumericalEncoder_num():
    """Check the 'num' encoding mode: shape/index preservation, feature names,
    and the handling of unseen modalities and missing values."""

    ######################
    ### Numerical Mode ###
    ######################

    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    ind = np.arange(len(df))
    df.index = ind

    # NOTE(review): 'ind' is shuffled after being assigned as the index; this
    # may mutate df.index in place depending on whether pandas copied the
    # array — confirm the shuffle is intentional.
    np.random.shuffle(ind)
    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    encoder = NumericalEncoder(encoding_type="num")
    encoder.fit(df)
    res = encoder.transform(df)

    # 'num' mode encodes values in place: same shape and same index.
    assert res.shape == df.shape
    assert (res.index == df.index).all()

    assert encoder.get_feature_names() == encoder.model._feature_names
    assert encoder.get_feature_names() == list(res.columns)

    df2 = df.copy()
    df2.loc[0, "cat_col_1"] = "something-new"
    df2.loc[1, "cat_col_2"] = None  # Something None

    # Unseen modality and missing value are both encoded as -1 at transform.
    res2 = encoder.transform(df2)
    assert res2.loc[0, "cat_col_1"] == -1
    assert res2.loc[1, "cat_col_2"] == -1

    df_with_none = df.copy()
    df_with_none["cat_col_3"] = df_with_none["cat_col_1"]
    df_with_none.loc[list(range(25)), "cat_col_3"] = None

    # When NaN is present at fit time, it gets its own code (0 here).
    encoder2 = NumericalEncoder(encoding_type="num")
    res2 = encoder2.fit_transform(df_with_none)

    assert (df_with_none["cat_col_3"].isnull() == (
        res2["cat_col_3"] == 0)).all()
Esempio n. 11
0
def test_NumericalEncoder_encode_int():
    """Listing an int column in columns_to_use must produce exactly the same
    encoding as casting it to 'category' and using the default behavior."""
    df = get_sample_df(100)[["float_col"]]
    df["int_col"] = np.random.choice((0, 1, 2), 100)

    encoded_explicit = NumericalEncoder(
        columns_to_use=["int_col"]).fit_transform(df)

    df_as_category = df.copy()
    df_as_category["int_col"] = df_as_category["int_col"].astype("category")
    encoded_category = NumericalEncoder().fit_transform(df_as_category)

    assert (encoded_explicit.values == encoded_category.values).all().all()
    assert (encoded_explicit.dtypes == encoded_category.dtypes).all()
    # One untouched float column plus one dummy per int modality.
    assert encoded_explicit.shape[1] == 1 + df["int_col"].nunique()
Esempio n. 12
0
def test_NumericalEncoder_columns_to_encode_object():
    """columns_to_use='object' must encode the object column, while the
    default configuration leaves it in the output unchanged here."""
    np.random.seed(123)
    Xnum = np.random.randn(1000, 10)

    dfX = pd.DataFrame(Xnum, columns=["col_%d" % i for i in range(10)])
    dfX["object_column"] = ["string_%2.4f" % x for x in dfX["col_0"]]

    # with --object--: no object dtype may survive the encoding
    encoded = NumericalEncoder(columns_to_use="object").fit_transform(dfX)
    assert not (encoded.dtypes == "object").any()

    # with default behavior: the object column comes through unchanged
    encoded = NumericalEncoder().fit_transform(dfX)
    assert "object_column" in encoded
    assert (encoded["object_column"] == dfX["object_column"]).all()
Esempio n. 13
0
def test_NumericalEncoder_num_fit_parameters():
    """Check how the fit parameters (min/max modalities number, max_cum_proba,
    max_na_percentage, min_nb_observations) drive the modality mapping built
    in 'num' encoding mode."""

    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    df.index = np.arange(len(df))

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[4:7])
    df["cat_col_3"] = df["text_col"].apply(lambda s: s[8:11])
    df.loc[0:10, "cat_col_3"] = None

    # All modalities are kept, __null__ category is created
    encoder = NumericalEncoder(
        encoding_type="num",
        min_modalities_number=10,
        max_modalities_number=100,
        max_na_percentage=0,
        min_nb_observations=1,
        max_cum_proba=1,
    )
    res = encoder.fit_transform(df)
    assert len(encoder.model.variable_modality_mapping["cat_col_1"]) == 7
    # cat_col_3 carries one more entry than cat_col_1 (presumably __null__).
    assert len(encoder.model.variable_modality_mapping["cat_col_3"]) == 8

    # We filter on max_cum_proba, __null__ category is created
    encoder = NumericalEncoder(
        encoding_type="num",
        min_modalities_number=1,
        max_modalities_number=100,
        max_na_percentage=0,
        min_nb_observations=1,
        max_cum_proba=0.6,
    )
    res = encoder.fit_transform(df)
    map1 = encoder.model.variable_modality_mapping["cat_col_1"]
    assert len(map1) == 5
    assert np.all(
        [v in map1 for v in ["eee", "bbb", "ddd", "jjj", "__default__"]])
    map3 = encoder.model.variable_modality_mapping["cat_col_3"]
    assert len(map3) == 6
    assert np.all([
        v in map3 for v in ["bbb", "ddd", "ccc", "aaa", "jjj", "__default__"]
    ])

    # No __null__ category (max_na_percentage=0.2 tolerates the NaN share)
    encoder = NumericalEncoder(
        encoding_type="num",
        min_modalities_number=1,
        max_modalities_number=100,
        max_na_percentage=0.2,
        min_nb_observations=1,
        max_cum_proba=1,
    )
    res = encoder.fit_transform(df)
    assert len(encoder.model.variable_modality_mapping["cat_col_3"]) == 7

    # Max modalities: mapping size is 4 for every column — presumably
    # max_modalities_number (3) plus the __default__ entry.
    encoder = NumericalEncoder(
        encoding_type="num",
        min_modalities_number=1,
        max_modalities_number=3,
        max_na_percentage=0.2,
        min_nb_observations=1,
        max_cum_proba=1,
    )
    res = encoder.fit_transform(df)
    assert len(encoder.model.variable_modality_mapping["cat_col_1"]) == 4
    assert len(encoder.model.variable_modality_mapping["cat_col_2"]) == 4
    assert len(encoder.model.variable_modality_mapping["cat_col_3"]) == 4

    assert res["cat_col_1"].nunique() == 4
    assert res["cat_col_2"].nunique() == 4
    assert res["cat_col_3"].nunique() == 4
Esempio n. 14
0
def test_NumericalEncoder_dummy():
    """Check the 'dummy' (one-hot) encoding mode: output width, dummy column
    naming, unseen / missing modalities, the __null__ column created for NaN
    seen at fit time, and the __default__ grouping of rare modalities."""

    ####################
    ### One Hot Mode ###
    ####################

    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    ind = np.arange(len(df))
    df.index = ind

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6])

    encoder = NumericalEncoder(encoding_type="dummy")
    encoder.fit(df)
    res = encoder.transform(df)

    # One dummy column per (variable, modality) pair.
    assert encoder.model._dummy_size == len(encoder.model._dummy_feature_names)
    assert encoder.model._dummy_size == sum(
        len(v) for k, v in encoder.model.variable_modality_mapping.items())

    # Width = 3 untouched columns + one dummy per modality of each cat column.
    assert res.shape[0] == df.shape[0]
    assert res.shape[1] == len(df["cat_col_1"].value_counts()) + len(
        df["cat_col_2"].value_counts()) + 3
    assert (res.index == df.index).all()

    # Dummy names follow '<column>__<modality>', ordered by frequency.
    col = ["float_col", "int_col", "text_col"]
    col1 = [
        "cat_col_1__%s" % c for c in list(df["cat_col_1"].value_counts().index)
    ]
    col2 = [
        "cat_col_2__%s" % c for c in list(df["cat_col_2"].value_counts().index)
    ]

    assert col1 == encoder.columns_mapping["cat_col_1"]
    assert col2 == encoder.columns_mapping["cat_col_2"]

    assert encoder.get_feature_names() == col + col1 + col2

    # Dummies are clean 0/1 indicators with no missing values.
    assert (res.loc[:, col1 + col2]).isnull().sum().sum() == 0
    assert (res.loc[:, col1 + col2]).max().max() == 1
    assert (res.loc[:, col1 + col2]).min().min() == 0

    assert ((df["cat_col_1"] == "aaa") == (res["cat_col_1__aaa"] == 1)).all()

    # Unseen modality / missing value: no dummy fires for that variable,
    # while the other variable still gets exactly one active dummy.
    df2 = df.copy()
    df2.loc[0, "cat_col_1"] = "something-new"
    df2.loc[1, "cat_col_2"] = None  # Something None

    res2 = encoder.transform(df2)

    assert res2.loc[0, col1].sum() == 0  # no dummy activated
    assert res2.loc[
        0, "cat_col_2__" +
        df2.loc[0, "cat_col_2"]] == 1  # activated in the right position
    assert res2.loc[0, col2].sum() == 1  # only one dummy activate

    assert res2.loc[1, col2].sum() == 0  # no dummy activated
    assert res2.loc[
        1, "cat_col_1__" +
        df2.loc[1, "cat_col_1"]] == 1  # activated in the right position
    assert res2.loc[1, col1].sum() == 1

    # NaN present at fit time gets its own __null__ dummy, listed first.
    df_with_none = df.copy()
    df_with_none["cat_col_3"] = df_with_none["cat_col_1"]
    df_with_none.loc[0:25, "cat_col_3"] = None

    encoder2 = NumericalEncoder(encoding_type="dummy")
    res2 = encoder2.fit_transform(df_with_none)

    col3b = [c for c in res2.columns if c.startswith("cat_col_3")]
    assert col3b[0] == "cat_col_3____null__"
    assert list(res2.columns) == col + col1 + col2 + col3b
    assert list(res2.columns) == encoder2.get_feature_names()

    assert (res2.loc[:, col1 + col2 + col3b]).isnull().sum().sum() == 0
    assert (res2.loc[:, col1 + col2 + col3b]).max().max() == 1
    assert (res2.loc[:, col1 + col2 + col3b]).min().min() == 0

    assert (df_with_none["cat_col_3"].isnull() == (
        res2["cat_col_3____null__"] == 1)).all()

    # Rare modalities (below min_nb_observations) are merged into __default__.
    df3 = df.copy()
    df3["cat_col_many"] = [
        "m_%d" % x
        for x in np.ceil(np.minimum(np.exp(np.random.rand(100) *
                                           5), 50)).astype(np.int32)
    ]

    encoder3 = NumericalEncoder(encoding_type="dummy")
    res3 = encoder3.fit_transform(df3)

    colm = [c for c in res3.columns if c.startswith("cat_col_many")]
    vc = df3["cat_col_many"].value_counts()
    colmb = [
        "cat_col_many__" + c
        for c in list(vc.index[vc >= encoder3.min_nb_observations]) +
        ["__default__"]
    ]

    assert colm == colmb
Esempio n. 15
0
                                                   'MTA_DATE_weekday', 'MTA_DATE_week', 'TOTAL_SUM']
                                   ,
                                   raise_if_shape_differs=True,
                                   regex_match=False)

# Mean-impute every column in use and add an is-null indicator for each
# (add_is_null=True); unseen nulls at transform time are allowed.
imputer = NumImputer(add_is_null=True, allow_unseen_null=True, columns_to_use='all',
                     drop_unused_columns=True, drop_used_columns=True, fix_value=0,
                     regex_match=False, strategy='mean')

# Dummy-encode the listed policy columns; rare modalities are grouped per the
# max_cum_proba / min_nb_observations / min_modalities_number thresholds.
numerical_encoder = NumericalEncoder(
    columns_to_use=['CLAIM3YEARS', 'P1_EMP_STATUS', 'P1_PT_EMP_STATUS', 'BUS_USE', 'CLERICAL', 'AD_BUILDINGS',
                    'AD_CONTENTS', 'CONTENTS_COVER', 'BUILDINGS_COVER', 'P1_MAR_STATUS', 'P1_POLICY_REFUSED', 'P1_SEX',
                    'APPR_ALARM', 'APPR_LOCKS', 'FLOODING', 'NEIGH_WATCH', 'OCC_STATUS', 'SAFE_INSTALLED',
                    'SEC_DISC_REQ', 'SUBSIDENCE', 'PAYMENT_METHOD', 'LEGAL_ADDON_PRE_REN', 'LEGAL_ADDON_POST_REN',
                    'HOME_EM_ADDON_PRE_REN', 'HOME_EM_ADDON_POST_REN', 'GARDEN_ADDON_PRE_REN', 'GARDEN_ADDON_POST_REN',
                    'KEYCARE_ADDON_PRE_REN', 'KEYCARE_ADDON_POST_REN', 'HP1_ADDON_PRE_REN', 'HP1_ADDON_POST_REN',
                    'HP2_ADDON_PRE_REN', 'HP2_ADDON_POST_REN', 'HP3_ADDON_PRE_REN', 'HP3_ADDON_POST_REN', 'MTA_FLAG'],
    desired_output_type='DataFrame', drop_unused_columns=True,
    drop_used_columns=True, encoding_type='dummy',
    max_cum_proba=0.95, max_modalities_number=100,
    max_na_percentage=0.05, min_modalities_number=20,
    min_nb_observations=10, regex_match=False)

# Presumably cleans up redundant binary dummy columns — TODO confirm against
# the BinaryColumnsCleaner implementation.
binary_columns_cleaner = BinaryColumnsCleaner()

# this one does nothing but is used to use the pipeline without the classifier (for shap):
pass_through = PassThrough()
classifier = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                            importance_type='split', learning_rate=0.1, max_depth=-1,
                            min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                            n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
Esempio n. 16
0
def test_NumericalEncoder_drop_used_unused_columns(drop_used_columns,
                                                   drop_unused_columns,
                                                   columns_to_use):
    """Parametrized check of drop_used_columns / drop_unused_columns /
    columns_to_use combinations, plus a pickle round-trip of the encoder."""
    # This test will verify the behavior of the encoder regarding the fact to drop or keep the use/unused columns

    df = pd.DataFrame({
        "obj1": ["a", "b", "c", "d"] * 25,
        "obj2": ["AA", "BB"] * 50,
        "num1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 10,
        "num2": [100, 101, 102, 103, 104] * 20,
        "num3": [0.01, 0.02, 0.03, 0.04, 0.05] * 20,
    })

    # NOTE(review): .loc slicing is inclusive, so row 20 ends up in both df1
    # and df2 — confirm the overlap is intentional.
    df1 = df.loc[0:20, ]
    df2 = df.loc[20:]

    # for drop_used_columns, drop_unused_columns, columns_to_use in list(itertools.product((True,False),(True,False),("all","object",["num1","num2","num3"]))):

    # Expected dummy names per column, ordered by modality frequency.
    resulting_columns = {
        col: ["%s__%s" % (col, str(v)) for v in df[col].value_counts().index]
        for col in df.columns
    }

    # Resolve which columns the encoder will actually use.
    if columns_to_use == "all":
        cols = list(df.columns)
    elif columns_to_use == "object":
        cols = list(df.columns[df.dtypes == "object"])
    else:
        cols = columns_to_use

    # A: original used columns kept only when drop_used_columns is False.
    if drop_used_columns:
        columns_A = []
    else:
        columns_A = cols

    # B: the generated dummy columns for every used column.
    columns_B = []
    for c in cols:
        columns_B += resulting_columns[c]

    # C: untouched columns kept only when drop_unused_columns is False.
    if drop_unused_columns:
        columns_C = []
    else:
        columns_C = [c for c in df.columns if c not in cols]

    final_columns = columns_A + columns_C + columns_B

    encoder = NumericalEncoder(columns_to_use=columns_to_use,
                               drop_used_columns=drop_used_columns,
                               drop_unused_columns=drop_unused_columns)

    df1_transformed = encoder.fit_transform(df1)
    df2_transformed = encoder.transform(df2)

    # Fit on df1, transform df2: row counts, types and indexes are preserved.
    assert df1_transformed.shape[0] == df1.shape[0]
    assert df2_transformed.shape[0] == df2.shape[0]
    assert type(df1_transformed) == type(df1)
    assert type(df2_transformed) == type(df2)
    assert (df1_transformed.index == df1.index).all()
    assert (df2_transformed.index == df2.index).all()

    assert df1_transformed.shape[1] == df2_transformed.shape[1]
    assert list(df1_transformed.columns) == list(df2_transformed.columns)

    # Same column set as predicted (order is checked only as a set here).
    assert len(df1_transformed.columns) == len(final_columns)
    assert set(df1_transformed) == set(final_columns)

    #    assert list(df1_transformed.columns) == final_columns

    # The fitted encoder must survive a pickle round-trip unchanged.
    encoder = NumericalEncoder()
    encoder.fit(df)

    pickled_encoder = pickle.dumps(encoder)
    unpickled_encoder = pickle.loads(pickled_encoder)

    assert type(unpickled_encoder) == type(encoder)
    X1 = encoder.transform(df)
    X2 = unpickled_encoder.transform(df)

    assert X1.shape == X2.shape
    assert (X1 == X2).all().all()
Esempio n. 17
0
    "logit":
    OutSamplerTransformer(LogisticRegression(), cv=cv),
    "pass":
    PassThrough(),
    "blender":
    LogisticRegression()
},
                        edges=[("rf", "blender"), ("lgbm", "blender"),
                               ("logit", "blender"), ("pass", "blender")])

# In[]
from aikit.transformers import NumImputer, CountVectorizerWrapper, NumericalEncoder

# Stacking ensemble: "enc" -> "imp" prepares the data once, then feeds three
# base models (rf / lgbm / logit, each wrapped in OutSamplerTransformer with
# the shared cv — presumably producing out-of-sample predictions); their
# outputs are combined by the final "blender" logistic regression.
stacker = GraphPipeline(models={
    "enc":
    NumericalEncoder(),
    "imp":
    NumImputer(),
    "rf":
    OutSamplerTransformer(RandomForestClassifier(), cv=cv),
    "lgbm":
    OutSamplerTransformer(LGBMClassifier(), cv=cv),
    "logit":
    OutSamplerTransformer(LogisticRegression(), cv=cv),
    "blender":
    LogisticRegression()
},
                        edges=[("enc", "imp"), ("imp", "rf", "blender"),
                               ("imp", "lgbm", "blender"),
                               ("imp", "logit", "blender")])
Esempio n. 18
0
    def fit_metric_model(self):
        """Fit the surrogate model predicting a job's metric from its params.

        Loads all finished results and parameters, builds an encoded
        parameter matrix, transforms the raw metric(s) according to
        ``self.metric_transformation``, then fits a RandomForest regressor
        plus a variance estimator on it.

        Returns
        -------
        self
        """
        logger.info("start computing metric model...")

        ### Load the results
        df_results = self.result_reader.load_all_results(aggregate=True)

        self._nb_models_done = len(df_results)
        # Not enough finished models yet to fit anything meaningful.
        if self._nb_models_done <= self.min_nb_of_models:
            return self

        # NOTE(review): _nb_models_done was just set to len(df_results), so the
        # middle clause is always true here; this early-return effectively
        # fires whenever params_training_columns is already set — confirm this
        # is the intended "nothing new since last fit" check.
        if (self._nb_models_done is not None
                and len(df_results) == self._nb_models_done
                and self.params_training_columns is not None):
            return self

        ### Load the params
        df_params = self.result_reader.load_all_params()

        # Keep only jobs that have both params and results.
        df_merged_result = pd.merge(df_params,
                                    df_results,
                                    how="inner",
                                    on="job_id")

        training_cols = diff(list(df_params.columns), ["job_id"])

        # X dataframe for parameters
        dfX_params = df_merged_result.loc[:, training_cols]

        ### Retrieve the target metric

        if self.avg_metrics:
            scorers = self.job_config.scoring
        else:
            scorers = [self.job_config.main_scorer
                       ]  # I'll use only the main_scorer

        N = dfX_params.shape[0]
        all_y_params = []
        for scorer in scorers:
            y_params = df_merged_result["test_%s" %
                                        scorer]  # Retrieve the raw metric
            # replace NaN by scorer's observed minimum score ; if y_params contains
            # only NaN -> won't work
            y_params = y_params.fillna(y_params.min()).values

            if self.metric_transformation is None:
                pass

            elif self.metric_transformation == "rank":
                ### Transform in non-parametric rank ....
                y_params = kde_transfo_quantile(y_params)

                # => This behaves like a uniform law

            elif self.metric_transformation == "normal":
                ### Transform into non-parametric normal ...
                y_params = norm.ppf(kde_transfo_quantile(y_params))

                # => This behaves like a normal law

            elif self.metric_transformation == "default":
                ### Transform using default transformation (log like function)
                try:
                    f = get_metric_default_transformation(scorer)
                except ValueError:
                    logger.info(
                        "I don't know how to transform this metric %s, I'll use default normal transformation"
                        % str(scorer))
                    f = None

                # Fall back to the "normal" transformation when no default
                # transformation is known for this scorer.
                if f is None:
                    y_params = norm.ppf(kde_transfo_quantile(y_params))
                else:
                    y_params = f(y_params)

                if self.avg_metrics:
                    # If I'm averaging I'd rather have something centered
                    y_params = (y_params -
                                np.mean(y_params)) / np.std(y_params)

            else:
                raise ValueError("I don't know this metric_transformation %s" %
                                 self.metric_transformation)

            all_y_params.append(y_params.reshape((N, 1)))

        # Average the transformed metrics when several scorers were used.
        if len(all_y_params) > 1:
            y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
        else:
            y_params = all_y_params[0].reshape((N, ))

        #        elif self.metric_transformation
        #
        #
        #        else:
        #            # We could also use the default transformation here?
        #            scorer = self.job_config.main_scorer
        #            y_params = df_merged_result["test_%s" % scorer].values
        #

        # create model: encode categorical params, then impute missing values
        transformer_model = GraphPipeline(models={
            "encoder": NumericalEncoder(),
            "imputer": NumImputer()
        },
                                          edges=[("encoder", "imputer")])

        xx_params = transformer_model.fit_transform(dfX_params)

        random_forest = RandomForestRegressor(n_estimators=100,
                                              min_samples_leaf=5)

        random_forest.fit(xx_params, y_params)

        # Additional variance estimator fitted on the same data.
        random_forest_variance = RandomForestVariance(random_forest)
        random_forest_variance.fit(xx_params, y_params)

        self.params_training_columns = training_cols
        self.transformer_model = transformer_model
        self.random_forest = random_forest
        self.random_forest_variance = random_forest_variance

        self._nb_models_done = len(df_results)

        logger.info("metric model fitted")

        return self
Esempio n. 19
0
def load_sgdata():
    """Load the SG dataset from ``cleandata.csv`` and return encoded splits.

    Fixes over the original:
    - Test splits are now *transformed* with the encoders fitted on the train
      split (the original called ``fit_transform`` on the test data too, which
      can produce encodings inconsistent with the train split).
    - The 70/30 split now uses ``iloc`` so the boundary row is not duplicated
      in both train and test (``.loc`` slicing is inclusive on both ends).
    - Removed the unused ``infos`` local and a debug ``print``.

    Returns
    -------
    tuple
        (Xencoded_train, Yencoded_train, Xencoded_test, Yencoded_test)
    """
    sg_df = pd.read_csv("cleandata.csv")
    # Drop any auto-generated 'Unnamed: *' index columns round-tripped by CSV.
    sg_df = sg_df.loc[:, ~sg_df.columns.str.contains('^Unnamed')]

    # Shuffle DF and compute train/test split
    #df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    idx = int(len(sg_df) * (1 - 0.3))
    df_train = sg_df.iloc[:idx].copy()
    df_test = sg_df.iloc[idx:].copy()

    # Y train & test datasets
    y_train = df_train["product_solution"].to_frame()
    y_test = df_test["product_solution"].to_frame()

    # X: drop the target and the identifier column from the feature frames.
    for frame in (df_train, df_test):
        del frame["product_solution"]
        del frame["name"]

    # setting parameters for encoding x-values
    Xencoder = NumericalEncoder(columns_to_use=['industry', 'key_process'],
                                desired_output_type='DataFrame',
                                drop_unused_columns=False,
                                drop_used_columns=True,
                                encoding_type='num',
                                max_cum_proba=0.95,
                                max_modalities_number=100,
                                max_na_percentage=0.05,
                                min_modalities_number=20,
                                min_nb_observations=10,
                                regex_match=False)

    # Fit on train only; reuse the fitted mapping on the test split.
    Xencoded_train = Xencoder.fit_transform(df_train)
    Xencoded_test = Xencoder.transform(df_test)

    # setting parameters for encoding y-values
    Yencoder = NumericalEncoder(columns_to_use=['product_solution'],
                                desired_output_type='DataFrame',
                                drop_unused_columns=False,
                                drop_used_columns=True,
                                encoding_type='num',
                                max_cum_proba=0.95,
                                max_modalities_number=100,
                                max_na_percentage=0.05,
                                min_modalities_number=20,
                                min_nb_observations=10,
                                regex_match=False)

    # Same for the target encoder: fit on train, transform test.
    Yencoded_train = Yencoder.fit_transform(y_train)
    Yencoded_test = Yencoder.transform(y_test)

    return Xencoded_train, Yencoded_train, Xencoded_test, Yencoded_test