def __init__(self, cat_cols=None, drop_original: bool = False, encoder=None):
    """
    Categorical transformer. This is a wrapper for categorical encoders.

    :param cat_cols: value stored as ``self.cat_cols`` (presumably the
        categorical column names; consumed outside this method).
    :param drop_original: flag stored as ``self.drop_original``.
    :param encoder: encoder instance to wrap. ``None`` (the default) gives
        this instance its own freshly created ``OrdinalEncoder``.
    """
    # The default used to be written as ``encoder=OrdinalEncoder()`` in the
    # signature. Default values are evaluated once, when the function is
    # defined, so every transformer built without an explicit encoder shared
    # (and re-fitted) one and the same encoder object.
    if encoder is None:
        encoder = OrdinalEncoder()

    self.cat_cols = cat_cols
    self.drop_original = drop_original
    self.encoder = encoder
    self.default_encoder = OrdinalEncoder()
def fit(self, X_df, y=None):
    """Fit one encoder per categorical feature of ``X_df``.

    * ``property_type`` and ``cancellation_policy`` are reduced to a short
      list of kept values (everything else becomes ``'other'``) and fitted
      with a ``TargetEncoder`` against ``y``.
    * ``room_type`` is fitted with an ``OrdinalEncoder``.
    * ``city_origin`` is fitted with a ``OneHotEncoder`` that ignores
      unknown values.

    :param X_df: frame holding the four columns named above.
    :param y: target passed to the two target encoders.
    :return: self
    """
    def _regrouped(column, keep):
        # Any value that is not explicitly kept falls into the 'other' bucket.
        return X_df[column].apply(
            lambda value: value if value in keep else 'other')

    self.prop_to_keep = [
        'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
    ]
    self.prop_transformer = TargetEncoder()
    self.prop_transformer.fit(
        _regrouped('property_type', self.prop_to_keep), y)

    self.pol_to_keep = [
        'flexible', 'strict_14_with_grace_period', 'moderate', 'moderate_new'
    ]
    self.pol_transformer = TargetEncoder()
    self.pol_transformer.fit(
        _regrouped('cancellation_policy', self.pol_to_keep), y)

    self.room_transformer = OrdinalEncoder()
    self.room_transformer.fit(X_df['room_type'])

    self.city_transformer = OneHotEncoder(handle_unknown='ignore')
    city_frame = pd.DataFrame(X_df['city_origin'])
    self.city_transformer.fit(city_frame)

    return self
def fit(self, X, y=None, **kwargs):
    """
    Fit the ordinal pre-encoder and, optionally, record invariant columns.

    :param X: training data; wrapped in a DataFrame when it is not one already
    :param y: unused here
    :param kwargs: unused here
    :return: self
    """
    # Non-DataFrame input gets pandas' default column names.
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    # Without an explicit column list, encode whatever get_obj_cols selects.
    if self.cols is None:
        self.cols = get_obj_cols(frame)

    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(frame)

    if not self.drop_invariant:
        return self

    # Reset before transforming so the variance check sees every column.
    self.drop_cols = []
    encoded = self.transform(frame)
    near_constant = []
    for name in encoded.columns.values:
        if encoded[name].var() <= 10e-5:
            near_constant.append(name)
    self.drop_cols = near_constant

    return self
def fit(self, X, y=None, **kwargs):
    """Fit the ordinal pre-encoder and build the Helmert coding per column.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    X = convert_input(X)
    self._dim = X.shape[1]

    # Fall back to the columns picked by get_obj_cols when none were given.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # One entry per encoded column: the second item of every mapping pair is
    # handed to the Helmert coding helper.
    self.mapping = [
        {
            'col': entry.get('col'),
            'mapping': self.fit_helmert_coding(
                [pair[1] for pair in entry.get('mapping')]),
        }
        for entry in self.ordinal_encoder.category_mapping
    ]

    if not self.drop_invariant:
        return self

    # Reset first so the transform below is not affected by an older list.
    self.drop_cols = []
    encoded = self.transform(X)
    candidates = get_generated_cols(X, encoded, self.cols)
    self.drop_cols = list(
        filter(lambda name: encoded[name].var() <= 10e-5, candidates))

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    Raises
    ------

    ValueError
        If ``handle_missing == 'error'`` and any column to be encoded
        contains a null value.

    """
    # first check the type
    X = util.convert_input(X)
    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # Reduce twice (per column, then across columns). The former
        # ``.any().bool()`` only works on a single-element Series and so
        # raised as soon as more than one column was encoded.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    self.mapping = self.generate_mapping()

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)

    if self.drop_invariant:
        # Reset, then collect generated columns with (near) zero variance.
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            for col in self.drop_cols:
                self.feature_names.remove(col)
        # list.remove signals a missing item with ValueError; KeyError is
        # kept only so the handler stays a superset of the previous one.
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def create_regression_pipeline(X, y):
    """Fit an impute -> encode -> linear-regression pipeline on a 70% split.

    :param X: feature matrix; columns 0-12 are treated as numerical.
    :param y: regression target.
    :return: dict with the train/test splits, the predictions for the test
        split and the fitted pipeline.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    # Column positions handled by each transformer; the categorical ones are
    # empty in this configuration.
    numerical_indexes = np.arange(13)
    non_numerical_indexes = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)

    missing_value_handler = ColumnTransformer(
        [
            ("imputer_mean", SimpleImputer(strategy="mean"), numerical_indexes),
            (
                "imputer_mode",
                SimpleImputer(strategy="most_frequent"),
                non_numerical_indexes,
            ),
        ],
        remainder="drop",
    )
    categorical_handler = ColumnTransformer(
        [
            (
                "feature_encoder_ordinal",
                OrdinalEncoder(),
                ordinal_indexes_after_handle_missing_values,
            ),
            (
                "feature_encoder_onehot",
                OneHotEncoder(),
                one_hot_indexes_after_handle_missing_values,
            ),
        ],
        remainder="passthrough",
    )
    regressor = LinearRegression(fit_intercept=True)

    pipeline = Pipeline(steps=[
        ("handle_missing_values", missing_value_handler),
        ("handle categorical features", categorical_handler),
        ("estimator", regressor),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'regression_pipeline': pipeline,
    }
def fit(self, X, y=None, **kwargs):
    """Fit the ordinal pre-encoder and the backward-difference coding.

    An ordinal encoder is fitted first so the category mapping stays
    consistent across calls; optionally the near-constant generated columns
    are recorded so they can be dropped consistently.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    X = convert_input(X)
    self._dim = X.shape[1]

    # No explicit columns: use the ones get_obj_cols reports.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    contrast_mappings = []
    for entry in self.ordinal_encoder.category_mapping:
        # Second item of each mapping pair feeds the coding helper.
        levels = [pair[1] for pair in entry.get('mapping')]
        coding = self.fit_backward_difference_coding(levels)
        contrast_mappings.append({'col': entry.get('col'), 'mapping': coding})
    self.mapping = contrast_mappings

    if not self.drop_invariant:
        return self

    # Clear the list before transforming, then keep the generated columns
    # whose variance is at most 10e-5.
    self.drop_cols = []
    encoded = self.transform(X)
    near_constant = []
    for name in get_generated_cols(X, encoded, self.cols):
        if encoded[name].var() <= 10e-5:
            near_constant.append(name)
    self.drop_cols = near_constant

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    # first check the type
    X = util.convert_input(X)
    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    for col in self.cols:
        self.digits_per_col[col] = self.calc_required_digits(X, col)

    # do a transform on the training data to get a column list
    X_t = self.transform(X, override_return_df=True)
    self._encoded_columns = X_t.columns.values

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        # Force a DataFrame here as well: the variance check indexes the
        # result by column name, which presumably breaks when the encoder is
        # configured to return a plain array (the reason the call above
        # passes override_return_df=True).
        X_temp = self.transform(X, override_return_df=True)
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]

    return self
def __init__(self, verbose=0, cols=None, drop_invariant=False):
    """
    Store the configuration and create the (unfitted) ordinal pre-encoder.

    :param verbose: verbosity value, stored and forwarded to the pre-encoder
    :param cols: columns to encode, stored and forwarded to the pre-encoder
    :param drop_invariant: flag stored as ``self.drop_invariant``
    :return:
    """
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    # Starts empty; filled elsewhere once invariant columns are known.
    self.drop_cols = []
    self.ordinal_encoder = OrdinalEncoder(cols=cols, verbose=verbose)
def ordinal_encode(self, data):
    """
    Ordinal-encode the categorical columns listed in ``self.columns``.

    A new ``OrdinalEncoder`` is fitted on ``data`` and kept on ``self.model``
    so the same conversion rules can be reused later. The encoded dataset is
    returned with its columns in the same order as in ``data``.

    :param data: dataset used for training (the original note says it is the
        ``data`` attribute of a Dataset-type object)
    :return: the encoded dataset
    """
    org_order = data.columns
    # "inpute" was a typo: it matched none of the encoder's options, so
    # unknown categories were not imputed as intended.
    # NOTE(review): "impute" is the option name of the category_encoders
    # releases contemporary with this code - confirm against the installed
    # version.
    self.model = OrdinalEncoder(cols=self.columns, handle_unknown="impute")
    oe_data = self.model.fit_transform(data)
    # ``.ix`` has been removed from pandas; the selection is by column label,
    # which is exactly what ``.loc`` does.
    oe_data = oe_data.loc[:, org_order]
    return oe_data
def fit(self, X, y=None, **kwargs):
    """Fit the ordinal pre-encoder and generate the column mapping.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    X = util.convert_input(X)
    self._dim = X.shape[1]

    # Either derive the columns from the data or normalise the given ones.
    self.cols = (
        util.get_obj_cols(X)
        if self.cols is None
        else util.convert_cols_to_list(self.cols)
    )

    encoder_options = dict(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    self.ordinal_encoder = OrdinalEncoder(**encoder_options)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    self.mapping = self.generate_mapping()

    if not self.drop_invariant:
        return self

    # Reset before transforming, then keep generated columns whose variance
    # does not exceed 10e-5.
    self.drop_cols = []
    encoded = self.transform(X)
    near_constant = []
    for name in util.get_generated_cols(X, encoded, self.cols):
        if encoded[name].var() <= 10e-5:
            near_constant.append(name)
    self.drop_cols = near_constant

    return self
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name

    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: if ``encoder_name`` is not a supported name
    """
    # Start from None so an unrecognised name reaches the explicit
    # NotImplementedError below; previously ``encoder`` was never bound in
    # that case and the check itself failed with UnboundLocalError.
    encoder = None

    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)

    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
def fit(self, X, y=None, **kwargs):
    """Fit the ordinal pre-encoder and optionally record invariant columns.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    X = convert_input(X)
    self._dim = X.shape[1]

    # Default to the columns reported by get_obj_cols.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    encoder_options = dict(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    self.ordinal_encoder = OrdinalEncoder(**encoder_options)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if not self.drop_invariant:
        return self

    # Every output column is a candidate here, not only the generated ones.
    self.drop_cols = []
    encoded = self.transform(X)
    self.drop_cols = list(
        filter(lambda name: encoded[name].var() <= 10e-5,
               encoded.columns.values))

    return self
def fit(self, X, y=None, **kwargs):
    """Fit the ordinal pre-encoder, the mapping and the output column names.

    :param X: training data; must expose ``shape`` (no input conversion is
        done here).
    :param y: unused here.
    :param kwargs: unused here.
    :return: self
    """
    self._dim = X.shape[1]

    # Without an explicit list, encode the columns get_obj_cols selects.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    encoder_options = dict(
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value',
    )
    self.ordinal_encoder = OrdinalEncoder(**encoder_options)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    self.mapping = self.generate_mapping()

    # Transform the training data once to learn the output column names.
    transformed = self.transform(X, override_return_df=True)
    self.feature_names = [*transformed.columns]

    return self
def create_clustering_pipeline(X):
    """Fit an impute -> encode -> KMeans(3) pipeline on ``X``.

    :param X: feature matrix; columns 0-3 are treated as numerical.
    :return: dict with the cluster labels of the fitted estimator and the
        fitted pipeline.
    """
    # Column positions handled by each transformer; the categorical ones are
    # empty in this configuration.
    numerical_indexes = np.arange(4)
    non_numerical_indexes = np.array([], int)
    non_numerical_indexes_after_handle_missing_values = np.array([], int)

    missing_value_handler = ColumnTransformer(
        [
            ("imputer_mean", SimpleImputer(strategy="mean"), numerical_indexes),
            (
                "imputer_mode",
                SimpleImputer(strategy="most_frequent"),
                non_numerical_indexes,
            ),
        ],
        remainder="drop",
    )
    categorical_handler = ColumnTransformer(
        [(
            "feature_encoder",
            OrdinalEncoder(),
            non_numerical_indexes_after_handle_missing_values,
        )],
        remainder="passthrough",
    )
    clusterer = KMeans(n_clusters=3, n_init=10, max_iter=300)

    pipeline = Pipeline(steps=[
        ("handle_missing_values", missing_value_handler),
        ("handle_categorical_features", categorical_handler),
        ("estimator", clusterer),
    ])

    # Only the fitting side effect is needed; the transformed output is unused.
    pipeline.fit_transform(X)

    return {
        'clusters': pipeline.named_steps["estimator"].labels_,
        'clustering_pipeline': pipeline,
    }
def cross_validate_forest(X_train, y_train, pipes, grids, kfolds, random_search=False):
    """Cross-validate RandomForestClassifier pipeline.

    The pipeline is stored under ``pipes['forest']``. With
    ``random_search=True`` a randomized hyper-parameter search is run first;
    its timing and best parameters are only printed. ``grids`` is accepted
    but not used in this function.

    :return: whatever ``ExtendedClassifier.cross_validate`` returns.
    """
    forest_params = {
        'bootstrap': True,
        'max_depth': 70,
        'max_features': 'auto',
        'min_samples_leaf': 4,
        'min_samples_split': 10,
        'n_estimators': 64,
        'random_state': RANDOM_SEED,
    }
    pipes['forest'] = make_pipeline(
        CategoricalToString(),
        SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                               mode_cols=['Embarked']),
        OrdinalEncoder(cols=['Title', 'Deck', 'Embarked'],
                       handle_unknown='impute'),
        RandomForestClassifier(**forest_params),
    )

    if random_search:
        # Depth candidates 10..110 plus "unlimited".
        depth_choices = [int(x) for x in np.linspace(10, 110, num=11)]
        depth_choices.append(None)

        random_grid = {
            'randomforestclassifier__n_estimators': [
                int(x) for x in np.linspace(start=10, stop=500, num=10)],
            'randomforestclassifier__max_features': ['auto', 'sqrt'],
            'randomforestclassifier__max_depth': depth_choices,
            'randomforestclassifier__min_samples_split': [2, 5, 10],
            'randomforestclassifier__min_samples_leaf': [1, 2, 4],
            'randomforestclassifier__bootstrap': [True, False],
        }
        pprint.pprint(random_grid)

        randsearch = RandomizedSearchCV(
            pipes['forest'], random_grid,
            n_iter=50, cv=3, verbose=0, random_state=42)

        started_at = time.time()
        randsearch.fit(X_train, y_train)
        finished_at = time.time()
        print('randsearch.fit execution time:', finished_at - started_at)
        pprint.pprint(randsearch.best_params_)

    return ExtendedClassifier.cross_validate(
        pipes['forest'], X_train, y_train,
        sklearn_cvs_kws={'cv': kfolds},
        param_strategy='init',
        logdir_path=r'logs/models/forest',
        serialize_to=r'models/forest.pickle')
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name

    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: if ``encoder_name`` is not a supported name
    """
    # Start from None so an unrecognised name is reported explicitly below;
    # previously ``encoder`` stayed unbound and the ``return`` failed with an
    # unrelated-looking UnboundLocalError.
    encoder = None

    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)

    if encoder is None:
        raise NotImplementedError(
            "Unknown encoder name: {!r}".format(encoder_name))
    return encoder
def fit(self, X, y=None, **kwargs):
    """Fit the ordinal pre-encoder and optionally record invariant columns.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    X = convert_input(X)
    self._dim = X.shape[1]

    # Default to the columns reported by get_obj_cols.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(cols=self.cols, verbose=self.verbose)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if not self.drop_invariant:
        return self

    # Clear the list first, then test every output column's variance.
    self.drop_cols = []
    encoded = self.transform(X)
    near_constant = []
    for name in encoded.columns.values:
        if encoded[name].var() <= 10e-5:
            near_constant.append(name)
    self.drop_cols = near_constant

    return self
def fit(self, X, y=None, **kwargs):
    """
    Fit the ordinal pre-encoder and optionally record invariant columns.

    :param X: training data, used as given (no input conversion here)
    :param y: unused here
    :param kwargs: unused here
    :return: self
    """
    # Default to the columns reported by get_obj_cols.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(cols=self.cols, verbose=self.verbose)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if not self.drop_invariant:
        return self

    # Clear the list first, then keep columns whose variance is <= 10e-5.
    self.drop_cols = []
    encoded = self.transform(X)
    self.drop_cols = list(
        filter(lambda name: encoded[name].var() <= 10e-5,
               encoded.columns.values))

    return self
def ordinal_encoder(self, df, configger):
    """
    Ordinal-encode the training dataset according to ``configger``.

    :param df: the train dataset.
    :param configger: the configuration object handed to ``self.get_Xy`` and
        ``set_default_vale``. Three options are read here, each with a
        fallback:

        drop_invariant: bool, falls back to False
            whether or not to drop columns with 0 variance.
        handle_unknown: str, falls back to 'value'
            options are 'error', 'return_nan' and 'value'; 'value' imputes
            the category -1.
        handle_missing: str, falls back to 'value'
            options are 'error', 'return_nan' and 'value'; 'value' treats nan
            as a category at fit time, or -2 at transform time if nan was not
            a category during fit.

        The encoder is always created with ``verbose=1`` and
        ``return_df=True``; the columns to encode come from ``self.get_Xy``.
    :return: the transform result
    """
    X, y, encode_col = self.get_Xy(df, configger)

    options = {
        "drop_invariant": set_default_vale(
            "drop_invariant", configger, False, is_bool=True),
        "handle_missing": set_default_vale(
            "handle_missing", configger, "value"),
        "handle_unknown": set_default_vale(
            "handle_unknown", configger, "value"),
    }

    encoder = OrdinalEncoder(
        verbose=1, cols=encode_col, return_df=True, **options)
    return encoder.fit_transform(X, y)
def fetch_TICTACTOE(path, valid_size=0.2, test_size=0.2, seed=None):
    """Load (downloading if needed) the UCI tic-tac-toe data and split it.

    :param path: directory that holds, or will receive, 'tic-tac-toe.data'.
    :param valid_size: validation share, expressed relative to the full
        dataset (rescaled internally for the second split).
    :param test_size: test share of the full dataset.
    :param seed: random_state passed to both stratified splits.
    :return: dict with X_train/y_train, X_valid/y_valid and X_test/y_test.
    """
    root = Path(path)
    data_path = root / 'tic-tac-toe.data'
    if not data_path.exists():
        root.mkdir(parents=True, exist_ok=True)
        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data',
            data_path)

    raw = pd.read_csv(data_path, names=np.arange(10))
    encoded = OrdinalEncoder(return_df=False).fit_transform(raw)

    # Last column is the label; it is shifted down by one after encoding.
    features = encoded[:, :-1].astype(np.float32)
    labels = (encoded[:, -1] - 1).astype(int)

    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels,
        stratify=labels, test_size=test_size, random_state=seed)

    # The remaining data is (1 - test_size) of the total, so rescale the
    # validation share accordingly.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest,
        stratify=y_rest,
        test_size=valid_size / (1 - test_size),
        random_state=seed)

    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_valid': X_val,
        'y_valid': y_val,
        'X_test': X_test,
        'y_test': y_test,
    }
def EncodeCategoricalData(train_df, test_df):
    """Ordinal-encode the categorical columns of the train and test frames.

    The encoder is fitted on the training frame only and then applied to
    both frames.

    Parameters
    ----------
    train_df: dataframe
        training dataframe object to fit and transform
    test_df: dataframe
        test dataframe object to transform

    Returns
    -------
    transformed training and test dataframe
    """
    # Per the original note, handle_unknown="value" assigns -1 to values seen
    # for the first time in the test set.
    encoder = OrdinalEncoder(
        cols=["country", "province", "region_1", "taster_name", "variety"],
        return_df=True,
        handle_unknown="value",
        handle_missing="return_nan",
    )
    encoder.fit(train_df)

    encoded_train = encoder.transform(train_df)
    encoded_test = encoder.transform(test_df)
    return encoded_train, encoded_test
test_data = fg.generate() test_data.columns = [str(col) for col in test_data.columns] test_data = test_data.fillna(0) test_data = add_datepart(test_data, 'timestamp', drop=False) prediction = cat_model.predict(test_data) + lgb_model.predict(test_data) + xgb_model.predict(test_data) return prediction if __name__ == "__main__": path = 'C:/Work/gitsrc/Kaggle/data-science-bowl-2019' test = pd.read_csv(f'{path}/test.csv') preprocessedData = pd.read_csv("intermediate.csv") title_oe = OrdinalEncoder() title_oe.fit(list(set(preprocessedData['session_title'].unique()).union(set(test['title'].unique())))) world_oe = OrdinalEncoder() world_oe.fit(list(set(preprocessedData['world'].unique()).union(set(test['world'].unique())))) preprocessedData['session_title'] = title_oe.transform(preprocessedData['session_title'].values) preprocessedData['world'] = world_oe.transform(preprocessedData['world'].values) lgb_model, xgb_model, cat_model = train(preprocessedData) test['title'] = title_oe.transform(test['title'].values) test['world'] = world_oe.transform(test['world'].values) prediction = predict(test, lgb_model, xgb_model, cat_model) sample_submission = pd.read_csv(f'{path}/sample_submission.csv')
def fit(self, X, y=None, **kwargs):
    """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds
    generally invariant columns to drop consistently.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    """
    # first check the type
    X = util.convert_input(X)
    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    ordinal_mapping = self.ordinal_encoder.category_mapping

    mappings_out = []
    for switch in ordinal_mapping:
        # ``Series.get_values()`` was removed in pandas 1.0; ``.values``
        # yields the same array on old and new pandas alike.
        # (assumes the per-column mapping is a pandas Series, as the former
        # get_values() call implied)
        values = switch.get('mapping').values
        column_mapping = self.fit_backward_difference_coding(values)
        mappings_out.append({
            'col': switch.get('col'),
            'mapping': column_mapping,
        })

    self.mapping = mappings_out

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for col in self.drop_cols:
                self.feature_names.remove(col)
        # list.remove signals a missing item with ValueError; KeyError is
        # kept only so the handler stays a superset of the previous one.
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------

    self : encoder
        Returns self.

    Raises
    ------

    ValueError
        If X and y differ in length, or if ``handle_missing == 'error'`` and
        any column to be encoded contains a null value.

    """
    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value')
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    self.mapping = self._train(X_ordinal, y)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for col in self.drop_cols:
                self.feature_names.remove(col)
        # list.remove signals a missing item with ValueError, so the former
        # ``except KeyError`` could never fire; KeyError is kept only so the
        # handler stays a superset of the previous one.
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------

    self : encoder
        Returns self.

    Raises
    ------

    ValueError
        If ``handle_missing == 'error'`` and any column to be encoded
        contains a null value.

    """
    # first check the type
    X = util.convert_input(X)
    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # Reduce twice (per column, then across columns). The former
        # ``.any().bool()`` only works on a single-element Series and so
        # raised as soon as more than one column was encoded.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value')
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    ordinal_mapping = self.ordinal_encoder.category_mapping

    mappings_out = []
    for switch in ordinal_mapping:
        values = switch.get('mapping')
        col = switch.get('col')
        column_mapping = self.fit_sum_coding(
            col, values, self.handle_missing, self.handle_unknown)
        mappings_out.append({
            'col': switch.get('col'),
            'mapping': column_mapping,
        })

    self.mapping = mappings_out

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for name in self.drop_cols:
                self.feature_names.remove(name)
        # list.remove signals a missing item with ValueError; KeyError is
        # kept only so the handler stays a superset of the previous one.
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X_df, y):
    """Build and fit the column-wise preprocessor, stored on ``self``.

    Six columns go through reshape -> ``OrdinalEncoder`` -> median
    imputation, ``anneeBudg`` goes through reshape -> median imputation, and
    ``numDoc`` / ``siret`` are dropped.

    :param X_df: frame containing the columns named below.
    :param y: target forwarded to the preprocessor's ``fit``.
    :return: self
    """
    def _as_column(X):
        # Adds a trailing axis so the selected column becomes 2-D
        # (presumably because a single-name selector yields a 1-D Series).
        return X.values[:, np.newaxis]

    def _reshaper():
        return FunctionTransformer(_as_column, validate=False)

    def _encoded_branch():
        return make_pipeline(
            _reshaper(), OrdinalEncoder(), SimpleImputer(strategy='median'))

    def _numeric_branch():
        return make_pipeline(_reshaper(), SimpleImputer(strategy='median'))

    self.preprocessor = ColumnTransformer(transformers=[
        ('col', _encoded_branch(), 'collectivite'),
        ('annee', _numeric_branch(), 'anneeBudg'),
        ('dir', _encoded_branch(), 'direction'),
        ('nature', _encoded_branch(), 'Nature de la subvention'),
        ('beneficiaire', _encoded_branch(), 'beneficiaire'),
        ('sect', _encoded_branch(), 'secteur activite'),
        ('obj', _encoded_branch(), 'objet du dossier'),
        ('drop cols', 'drop', ["numDoc", "siret"]),
    ])
    self.preprocessor.fit(X_df, y)
    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------

    self : encoder
        Returns self.

    Raises
    ------

    ValueError
        If X and y differ in length; if ``handle_missing == 'error'`` and any
        column to be encoded contains a null value; if ``self.model`` is not
        one of 'independent', 'pooled', 'beta' or 'binary'; or, for
        ``model='binary'``, if y is not a null-free target with exactly the
        values {0, 1}.

    """
    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # Reduce twice (per column, then across columns). The former
        # ``.any().bool()`` only works on a single-element Series and so
        # raised as soon as more than one column was encoded.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value')
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    if self.model == 'independent':
        self.mapping = self._train_independent(X_ordinal, y)
    elif self.model == 'pooled':
        self.mapping = self._train_pooled(X_ordinal, y)
    elif self.model == 'beta':
        self.mapping = self._train_beta(X_ordinal, y)
    elif self.model == 'binary':
        # The label must be binary with values {0,1}
        unique = y.unique()
        if len(unique) != 2:
            raise ValueError(
                "The target column y must be binary. But the target contains "
                + str(len(unique)) + " unique value(s).")
        if y.isnull().any():
            raise ValueError(
                "The target column y must not contain missing values.")
        if np.max(unique) < 1:
            raise ValueError(
                "The target column y must be binary with values {0, 1}. Value 1 was not found in the target."
            )
        if np.min(unique) > 0:
            # Message kept on one line, mirroring the "Value 1" message above.
            raise ValueError(
                "The target column y must be binary with values {0, 1}. Value 0 was not found in the target."
            )
        # Perform the training
        self.mapping = self._train_log_odds_ratio(X_ordinal, y)
    else:
        raise ValueError("model='" + str(self.model) + "' is not a recognized option")

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for name in self.drop_cols:
                self.feature_names.remove(name)
        # list.remove signals a missing item with ValueError; KeyError is
        # kept only so the handler stays a superset of the previous one.
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def __init__(self, cat_cols=None, drop_original: bool = False, encoder=None):
    """
    Store the configuration of the categorical transformer.

    :param cat_cols: value stored as ``self.cat_cols`` (presumably the
        categorical column names; consumed outside this method).
    :param drop_original: flag stored as ``self.drop_original``.
    :param encoder: encoder instance to wrap. ``None`` (the default) gives
        this instance its own freshly created ``OrdinalEncoder``.
    """
    # A default written as ``encoder=OrdinalEncoder()`` in the signature is
    # evaluated once, at definition time, so all instances created without an
    # explicit encoder would share (and re-fit) a single encoder object.
    if encoder is None:
        encoder = OrdinalEncoder()

    self.cat_cols = cat_cols
    self.drop_original = drop_original
    self.encoder = encoder
    self.default_encoder = OrdinalEncoder()
def fit(self, X):
    """Fit one ``OrdinalEncoder`` per object/category column of ``X``.

    The fitted encoders are stored in ``self.label_encoders`` under the
    column name; columns of any other dtype are left alone.

    :param X: frame whose columns are inspected by dtype name.
    :return: self
    """
    categorical_columns = [
        name for name in X.columns
        if X[name].dtype.name in ("object", "category")
    ]
    for column in categorical_columns:
        encoder = OrdinalEncoder()
        # Register before fitting, so the entry exists even if fit raises.
        self.label_encoders[column] = encoder
        encoder.fit(X[column])
    return self