def fit(X, y, output_dir, **kwargs):
    """Train this DataRobot transform task.

    DataRobot runs this hook when the task is being trained inside a blueprint.
    Even transform tasks need to be trained to learn/store information from
    training data. The hook must create an artifact (here: a pickled, pre-fit
    M-Estimate target encoder) inside ``output_dir``; transform() later loads
    that artifact to encode new data.

    Parameters
    ----------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        Path to the output folder; the artifact (``mest.pkl``) is saved here.

    Returns
    -------
    None
        fit() doesn't return anything, but must write the artifact into
        output_dir so the trained encoder can be reused during scoring
        inside transform().
    """
    # Encode all input columns with a (randomized) M-Estimate target encoder.
    encoder_mest = ce.MEstimateEncoder(cols=X.columns, randomized=True, m=0.50)
    encoder_mest.fit(X, y)

    # BUG FIX: the original only saved the artifact when output_dir already
    # existed, otherwise it silently produced nothing and transform() would
    # fail later. Create the folder instead so fit() always emits the artifact.
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)
    with open(output_dir_path / "mest.pkl", "wb") as fp:
        pickle.dump(encoder_mest, fp)
def cal_moe(df_tr, col):
    """Per-category mean of the M-estimate ("moe") encoding of *col*.

    Fits an MEstimateEncoder on ``col`` against the binary target column
    ``'isDefault'`` and returns ``(groupby-mean Series, encoded column name)``.

    BUG FIX: the original fit/transform calls referenced a module-level
    ``feature_col`` list, making the function depend on hidden global state.
    Since the encoder is configured with ``cols=[col]``, passing just that
    column is sufficient and removes the dependency.
    """
    target = df_tr.loc[:, 'isDefault']
    enc = ce.MEstimateEncoder(cols=[col]).fit(df_tr.loc[:, [col]], target)
    encoded_name = f'moe_{col}'
    tmp = pd.DataFrame({
        col: df_tr.loc[:, col],
        # y is passed to transform so training rows get leave-style encoding
        encoded_name: enc.transform(df_tr.loc[:, [col]], target)[col],
    })
    return tmp.groupby([col])[encoded_name].mean(), encoded_name
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: short name of the category_encoders encoder to use
    :param columns_name: list of feature (column) names to encode
    """
    # Dispatch table replaces the original if/elif ladder; keys are the
    # historical short names and must stay spelled exactly as-is
    # (including "OridinalEncoder") for backward compatibility.
    encoder_classes = {
        "BackwardDe": ce.BackwardDifferenceEncoder,  # backward difference coding
        "BaseN": ce.BaseNEncoder,
        "Binary": ce.BinaryEncoder,
        "Catboost": ce.CatBoostEncoder,
        "Hash": ce.HashingEncoder,
        "Helmert": ce.HelmertEncoder,
        "JamesStein": ce.JamesSteinEncoder,
        "LOO": ce.LeaveOneOutEncoder,        # leave-one-out encoding
        "ME": ce.MEstimateEncoder,           # M-estimate encoder
        "OneHot": ce.OneHotEncoder,
        "OridinalEncoder": ce.OrdinalEncoder,  # ordinal encoding
        "Sum": ce.SumEncoder,
        "Polynomial": ce.PolynomialEncoder,
        "Target": ce.TargetEncoder,
        "WOE": ce.WOEEncoder,                # weight-of-evidence encoder
    }
    if encoder_type not in encoder_classes:
        raise ValueError("请选择正确的编码方式")
    self.encoder = encoder_classes[encoder_type](cols=columns_name)
def test_reference_m0(self):
    """With m=0 the encoding is the plain per-category target mean; an
    unseen category falls back to the global prior."""
    train_x = ['A', 'A', 'B', 'B']
    train_y = [1, 1, 0, 1]
    holdout_x = ['A', 'B', 'C']

    enc = encoders.MEstimateEncoder(m=0, handle_unknown='value', handle_missing='value')
    enc.fit(train_x, train_y)
    scored = enc.transform(holdout_x)

    # mean(A)=1, mean(B)=0.5; 'C' was never seen -> prior probability 3/4
    expected = [[1], [0.5], [3. / 4.]]
    self.assertEqual(scored.values.tolist(), expected)
def get_encoder(self) -> BaseEstimator:
    """Build a fresh M-estimate target encoder over the configured target columns."""
    encoder = ce.MEstimateEncoder(cols=self.target_columns)
    return encoder
'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff'] # datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large... # We painstakingly initialize each encoder here because that gives us the freedom to initialize the # encoders with any setting we want. encoders = [ #category_encoders.BackwardDifferenceEncoder(), category_encoders.BaseNEncoder(), category_encoders.BinaryEncoder(), category_encoders.HashingEncoder(), # category_encoders.HelmertEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.LeaveOneOutEncoder(), category_encoders.MEstimateEncoder(), category_encoders.OneHotEncoder(), category_encoders.OrdinalEncoder(), # category_encoders.PolynomialEncoder(), # category_encoders.SumEncoder(), category_encoders.TargetEncoder(), category_encoders.WOEEncoder()] encoders = [ #category_encoders.BackwardDifferenceEncoder(), category_encoders.BaseNEncoder(handle_missing='value'), category_encoders.BaseNEncoder(handle_missing='indicator'), category_encoders.BinaryEncoder(handle_missing='value'), category_encoders.BinaryEncoder(handle_missing='indicator'), # category_encoders.HashingEncoder(handle_missing='value'), # category_encoders.HashingEncoder(handle_missing='indicator'), # category_encoders.HelmertEncoder(),
def encode_all(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
    """Encode categorical columns of train/validation/test frames in place.

    Parameters
    ----------
    df, dfv, dfk : pd.DataFrame
        Train, validation and test frames. Encoders are fit on ``df`` only
        (target column ``'set_clicked'``) and then applied to all three.
    encoder_to_use : dict
        Maps column name -> encoder name (or 'ColumnDropper' to drop the column).
    handle_missing : str
        Passed through to every category_encoders encoder.

    Returns
    -------
    (df, dfv, dfk, encoders_used)
        Transformed frames plus a dict of the fitted encoder per column.

    BUG FIX: the original 'MEstimateEncoder' branch omitted
    ``dfk = encoder.transform(dfk)``, so the test frame silently kept the raw
    column. The per-branch copy-paste is replaced by one factory table.
    """
    # One factory per supported encoder; each preserves the original
    # constructor arguments exactly. HashingEncoder stays disabled as before.
    factories = {
        'BackwardDifferenceEncoder': lambda col: ce.BackwardDifferenceEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing),
        'BaseNEncoder': lambda col: ce.BaseNEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, base=3),
        'BinaryEncoder': lambda col: ce.BinaryEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing),
        'CatBoostEncoder': lambda col: ce.CatBoostEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, sigma=None, a=2),
        'HelmertEncoder': lambda col: ce.HelmertEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing),
        'JamesSteinEncoder': lambda col: ce.JamesSteinEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, model='binary'),
        'LeaveOneOutEncoder': lambda col: ce.LeaveOneOutEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, sigma=None),
        'MEstimateEncoder': lambda col: ce.MEstimateEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing,
            randomized=True, sigma=None, m=2),
        'OneHotEncoder': lambda col: ce.OneHotEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, use_cat_names=True),
        'OrdinalEncoder': lambda col: ce.OrdinalEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing),
        'SumEncoder': lambda col: ce.SumEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing),
        'PolynomialEncoder': lambda col: ce.PolynomialEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing),
        'TargetEncoder': lambda col: ce.TargetEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing,
            min_samples_leaf=10, smoothing=5),
        'WOEEncoder': lambda col: ce.WOEEncoder(
            cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing,
            randomized=True, sigma=None),
    }

    encoders_used = {}
    for col in encoder_to_use:
        choice = encoder_to_use[col]
        if choice == 'ColumnDropper':
            # Drop the column from all three frames instead of encoding it.
            df = df.drop(columns=col)
            dfv = dfv.drop(columns=col)
            dfk = dfk.drop(columns=col)
            encoders_used[col] = 'ColumnDropper'
            continue
        factory = factories.get(choice)
        if factory is None:
            # Unknown / disabled encoder name: leave the column untouched,
            # matching the original fall-through behaviour.
            continue
        encoder = factory(col)
        # Fit on the (progressively transformed) training frame only.
        encoder.fit(X=df, y=df['set_clicked'])
        df = encoder.transform(df)
        dfv = encoder.transform(dfv)
        dfk = encoder.transform(dfk)
        encoders_used[col] = encoder
    print("Completed encoder - ", datetime.datetime.now())
    return df, dfv, dfk, encoders_used
# ARFF benchmark datasets with categorical features.
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff',
            'bridges.version2.arff', 'car.arff', 'colic.arff', 'credit.a.arff', 'credit.g.arff',
            'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff', 'hepatitis.arff',
            'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff',
            'nursery.arff', 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff',
            'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff',
            'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# We painstakingly initialize each encoder here because that gives us the
# freedom to initialize the encoders with any setting we want.
encoders = [category_encoders.BackwardDifferenceEncoder(),
            category_encoders.BaseNEncoder(),
            category_encoders.BinaryEncoder(),
            category_encoders.HashingEncoder(),
            category_encoders.HelmertEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.LeaveOneOutEncoder(),
            category_encoders.MEstimateEncoder(),
            category_encoders.OneHotEncoder(),
            category_encoders.OrdinalEncoder(),
            category_encoders.PolynomialEncoder(),
            category_encoders.SumEncoder(),
            category_encoders.TargetEncoder(),
            category_encoders.WOEEncoder()]

# Initialization: start from a clean result file.
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
dct = df[i].value_counts() if next(iter(dct))/13731 <.66: value_count[i] = dict(df[i].value_counts()) if len(dct) > 10: hashes.append(i) else: te.append(i) else: drop_col.append(i) df.drop(drop_col, axis=1, inplace=True) cat_col = [x for x in cat_col if x not in drop_col] X = df.copy() y = df.Score X.drop('Score', axis=1, inplace=True) leoo = ce.LeaveOneOutEncoder(cols=te) me = ce.MEstimateEncoder(cols=hashes) X = leoo.fit_transform(X, y) X = me.fit_transform(X, y) X = pca(X, cat_col, 30) X.drop(cat_col, axis=1, inplace=True) rf = RandomForestRegressor() rf.fit(X,y) plt.plot(rf.feature_importances_) model_col = ['col84', 'col85', 'col917', 'col2612', 'col3109', 'col3108', 'col87', 'col118', 'col918', 'col2613', 'col3111', 'col88', 'col2614', 'pca0', 'pca1'] X = df[model_col] y = df.Score X_train, X_test, y_train, y_test = train_test_split(X[model_col], y, train_size=0.80) print(local_test(X_train, y_train, X_test, y_test, baseline_two))
def get_model(PARAMS):
    """Return a model pipeline for the provided params.

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline, or None when construction fails
    :rtype: sklearn pipeline or None
    """
    # Candidate target encoders, selected per text/geo column via PARAMS.
    te_dict = {
        'CatBoostEncoder': ce.CatBoostEncoder(),
        'HashingEncoder': ce.HashingEncoder(),
        'HelmertEncoder': ce.HelmertEncoder(),
        'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
        'OneHotEncoder': ce.OneHotEncoder(),
        'TargetEncoder': ce.TargetEncoder(),
        'WOEEncoder': ce.WOEEncoder(),
        'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
        'BaseNEncoder': ce.BaseNEncoder(),
        'BinaryEncoder': ce.BinaryEncoder(),
        'CountEncoder': ce.CountEncoder(),
        'JamesSteinEncoder': ce.JamesSteinEncoder(),
        'MEstimateEncoder': ce.MEstimateEncoder(),
        'PolynomialEncoder': ce.PolynomialEncoder(),
        'SumEncoder': ce.SumEncoder()
    }
    try:
        pipe = make_pipeline(
            # NOTE: 'extraxt_year' is the helper's actual (misspelled) kwarg;
            # renaming it here would break the project API.
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']), 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']), 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']), 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']), 'GC_addr_postcode'),
                ('txt_name', TfidfVectorizer(lowercase=True,
                                             ngram_range=(1, PARAMS['txt_name__ngram_range']),
                                             max_features=PARAMS['txt_name__max_features'],
                                             dtype=np.float32,
                                             binary=PARAMS['txt_name__binary'],
                                             use_idf=PARAMS['txt_name__use_idf']), 'name'),
                ('txt_dscr', TfidfVectorizer(lowercase=True,
                                             ngram_range=(1, PARAMS['txt_dscr__ngram_range']),
                                             max_features=PARAMS['txt_dscr__max_features'],
                                             dtype=np.float32,
                                             binary=PARAMS['txt_dscr__binary'],
                                             use_idf=PARAMS['txt_dscr__use_idf']), 'description'),
            ]),
            # Train on log1p(target) and invert predictions with expm1.
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))
        return pipe
    # BUG FIX: was `except BaseException`, which also swallows
    # KeyboardInterrupt / SystemExit; narrow to Exception.
    except Exception as e:
        LOG.error(e)
        return None
# Build the mixed test set (normal + anomalous rows) with a fresh index.
test_mix = pd.concat([test_normal, df_anomaly]).reset_index(drop=True)
########################################################################
# Sample the training frame and split off the target.
df_normal_sample = train.sample(frac=sample_size)
Y_train = df_normal_sample['class']
X_train = df_normal_sample.drop('class', axis=1)

test_mix_copy = test_mix.copy()
Y_test = test_mix_copy['class']
X_test = test_mix.drop('class', axis=1)

# Target-encode categoricals with an M-estimate encoder.
ce_mestimator = ce.MEstimateEncoder()
X_train_encode = ce_mestimator.fit_transform(X_train, Y_train)
# BUG FIX: the encoder was refit on the test set with the test target
# (fit_transform(X_test, Y_test)) — that leaks labels into the features.
# Only transform the test set with the encoder fit on training data.
X_test_encode = ce_mestimator.transform(X_test)

# BUG FIX: the scaler was refit on the test set; scale both splits with
# statistics learned from the training data only.
scaler = StandardScaler().fit(X_train_encode)
train_x = scaler.transform(X_train_encode)
test_x = scaler.transform(X_test_encode)
# Autoencoder Layer Structure and Parameters
def generate_candidates(adjusted_cols):
    """Return (name, pipeline) candidate pairs combining a category encoder
    with BsplitZClassifier.

    Intentionally omitted encoders: BackwardDifference, Helmert and Hashing
    (too slow) and WOE (binary targets only).
    """
    def encoder_pipeline(step_name, encoder):
        # NOTE: some step names are historical and deliberately kept as-is
        # (e.g. CatBoost's step is "m_encoder", Binary/Count use
        # "basen_encoder") so existing named-step lookups keep working.
        return Pipeline([(step_name, encoder), ("clf", BsplitZClassifier())])

    candidates = [
        (
            "bsplitz_method",
            Pipeline([
                ("cate", category_encoders.OrdinalEncoder(cols=adjusted_cols)),
                (
                    "ordinal_encoder",
                    BsplitZClassifier(adjusted_cols, random_state=10, num_samples=100),
                ),
            ]),
        ),
        ("target_encoder",
         encoder_pipeline("target_encoder", category_encoders.TargetEncoder(cols=adjusted_cols))),
        ("m_encoder",
         encoder_pipeline("m_encoder", category_encoders.MEstimateEncoder(cols=adjusted_cols))),
        ("cat_encoder",
         encoder_pipeline("m_encoder", category_encoders.CatBoostEncoder(cols=adjusted_cols))),
        ("basen_encoder",
         encoder_pipeline("basen_encoder", category_encoders.BaseNEncoder(cols=adjusted_cols))),
        ("binary_encoder",
         encoder_pipeline("basen_encoder", category_encoders.BinaryEncoder(cols=adjusted_cols))),
        ("count_encoder",
         encoder_pipeline("basen_encoder", category_encoders.CountEncoder(cols=adjusted_cols))),
        ("jamesstein_encoder",
         encoder_pipeline("js_encoder", category_encoders.JamesSteinEncoder(cols=adjusted_cols))),
    ]
    return candidates