print(df.isnull().sum())
print(df.info())
print(df['score'].value_counts())

# Change Categorical values for Ordinal Variables
# scores = {'Low': 0, 'Medium': 1, 'High': 2}
# df['score'] = df['score'].map(scores)
# same thing in one line of code
# df['score'] = df['score'].map({'Low': 0, 'Medium': 1, 'High': 2})
# print(df.head())

encoder = ce.OrdinalEncoder(cols=['score'],
                            return_df=True,
                            mapping=[{'col': 'score',
                                      'mapping': {'Low': 0, 'Medium': 1, 'High': 2}}])
newDF = encoder.fit_transform(df)

# Nominal Categorical Variables
# 1. Pandas get_dummies
df_dummies = pd.get_dummies(newDF,
                            columns=['instructor', 'course', 'semester'],
                            drop_first=True)
print(df_dummies.head().T)
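# --- Illustrative sketch (not from the original source): the same pattern on a tiny,
# hypothetical frame -- an explicit ordinal mapping for 'score' plus pd.get_dummies for a
# nominal column. Column names here are made up; only pandas and category_encoders are assumed.
import pandas as pd
import category_encoders as ce

toy = pd.DataFrame({'score': ['Low', 'High', 'Medium', 'Low'],
                    'course': ['math', 'bio', 'math', 'cs']})
toy_enc = ce.OrdinalEncoder(
    cols=['score'],
    mapping=[{'col': 'score', 'mapping': {'Low': 0, 'Medium': 1, 'High': 2}}],
    return_df=True,
).fit_transform(toy)                                                    # 'score' -> 0/1/2, order preserved
toy_enc = pd.get_dummies(toy_enc, columns=['course'], drop_first=True)  # nominal column -> dummy columns
print(toy_enc)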
def run_bs_experiments(): print("Loading Data") df = load_data() #columns: continuous = ['temp', 'atemp', 'hum', 'windspeed'] categorical = [ 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit' ] X = df[continuous + categorical] y = df[['cnt']] models = [ Ridge(), RandomForestRegressor(n_estimators=100), GradientBoostingRegressor(), MLPRegressor() ] #models = [RandomForestRegressor()] results = [[ 'model', 'Encoder', 'R2', 'STD', 'Training Time', 'Sparsity', 'Dimensions' ]] for model in models: print("") print("----------------------") print("Testing Algorithm: ") print(type(model)) print("----------------------") #TargetEncoder print("TargetEncoder Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=ce.TargetEncoder(return_df=False)) results.append([ type(model), 'TargetEncoder', r2, std, time, sparsity, dimensions ]) #OrdinalEncoder print("OrdinalEncoder Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=ce.OrdinalEncoder(return_df=False)) results.append([ type(model), 'OrdinalEncoder', r2, std, time, sparsity, dimensions ]) #BinaryEncoder print("BinaryEncoder Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=ce.BinaryEncoder(return_df=False)) results.append([ type(model), 'BinaryEncoder', r2, std, time, sparsity, dimensions ]) #HashingEncoder print("HashingEncoder Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=ce.HashingEncoder(return_df=False)) results.append([ type(model), 'HashingEncoder', r2, std, time, sparsity, dimensions ]) #OneHotEncoder print("OneHotEncoder Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=OneHotEncoder(handle_unknown='ignore', sparse=False)) results.append([ type(model), 'OneHotEncoder', r2, std, time, sparsity, dimensions ]) print("GIG Encoder (mean) Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=GIGEncoder()) results.append([ type(model), 'GIGEncoder (m)', r2, std, time, sparsity, dimensions ]) print("GIG Encoder (mean and variance Results:") r2, std, time, sparsity, dimensions = cv_regression( model, X, y, continuous, categorical, encoder=GIGEncoder(), moments='mv') results.append([ type(model), 'GIGEncoder (mv)', r2, std, time, sparsity, dimensions ]) file = 'bike_sharing_experiments.csv' with open(file, "w") as output: writer = csv.writer(output, lineterminator='\n') writer.writerows(results) try: upload_file(file) except: print("File Not Uploaded")
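# --- Illustrative sketch (not the source's cv_regression helper or GIGEncoder): comparing a
# few category_encoders transformers with plain scikit-learn cross-validation on a hypothetical
# frame. The ce encoders only touch object-dtype columns, so numeric columns pass through.
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

rng = np.random.default_rng(0)
X = pd.DataFrame({'temp': rng.normal(size=200),
                  'season': rng.choice(['spring', 'summer', 'fall', 'winter'], 200)})
y = X['temp'] * 2 + (X['season'] == 'summer') + rng.normal(size=200)

for name, enc in [('Ordinal', ce.OrdinalEncoder()),
                  ('Binary', ce.BinaryEncoder()),
                  ('Target', ce.TargetEncoder())]:
    scores = cross_val_score(make_pipeline(enc, Ridge()), X, y, cv=5, scoring='r2')
    print(f'{name:8s} R2 = {scores.mean():.3f} +/- {scores.std():.3f}')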
train.drop(["id", "rent"], axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)
use_cols = []

####################
## Preprocess data
####################

### location ###
# extract the ward name (the text between 都 and 区) from the address
train["districts"] = train["location"].apply(
    lambda x: re.search("(?<=都)(.*?)(?=区)", x).group())
test["districts"] = test["location"].apply(
    lambda x: re.search("(?<=都)(.*?)(?=区)", x).group())
ce_ordinal = ce.OrdinalEncoder(cols=["districts"], handle_missing="value")
train = ce_ordinal.fit_transform(train)
test = ce_ordinal.transform(test)
use_cols.append("districts")

### access ###
# walking minutes to the nearest station (the numbers between 徒歩 and 分)
train["mins_to_nearest_sta"] = train["access"].apply(
    lambda x: min(map(int, re.findall("(?<=徒歩)(.*?)(?=分)", x))))
test["mins_to_nearest_sta"] = test["access"].apply(
    lambda x: min(map(int, re.findall("(?<=徒歩)(.*?)(?=分)", x))))
use_cols.append("mins_to_nearest_sta")

### layout ###
train["num_room"] = train["layout"].apply(
    lambda x: int(re.search("[0-9]", x).group()))
test["num_room"] = test["layout"].apply(
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): orig_cols = list(X.names) if self.num_classes >= 2: lb = LabelEncoder() lb.fit(self.labels) y = lb.transform(y) min_count = np.min(np.unique(y, return_counts=True)[1]) if min_count < 9: self.params['cv_search'] = False if min_count < 3: self.params['grid_search_iterations'] = False self.params['cv_search'] = False # save pre-datatable-imputed X X_dt = X # Apply OOB imputation self.oob_imputer = OOBImpute(self._impute_num_type, self._impute_int_type, self._impute_bool_type, self._impute_cat_type, self._oob_bool, self._oob_cat) X = self.oob_imputer.fit_transform(X) # convert to pandas for sklearn X = X.to_pandas() X_orig_cols_names = list(X.columns) if self._kaggle_features: self.features = make_features() X = self.features.fit_transform(X) else: self.features = None # print("LR: pandas dtypes: %s" % (str(list(X.dtypes)))) # FEATURE GROUPS # Choose which features are numeric or categorical cat_features = [ x for x in X_orig_cols_names if CatOriginalTransformer.is_me_transformed(x) ] catlabel_features = [ x for x in X_orig_cols_names if CatTransformer.is_me_transformed(x) ] # can add explicit column name list to below force_cats force_cats = cat_features + catlabel_features # choose if numeric is treated as categorical if not self._num_as_cat: numerical_features = (X.dtypes == 'float') | ( X.dtypes == 'float32') | (X.dtypes == 'float64') else: numerical_features = X.dtypes == 'invalid' # force oob imputation for numerics self.oob_imputer = OOBImpute('oob', 'oob', 'oob', self._impute_cat_type, self._oob_bool, self._oob_cat) X = self.oob_imputer.fit_transform(X_dt) X = X.to_pandas() X = self.features.fit_transform(X) if self._kaggle_features: numerical_features = self.features.update_numerical_features( numerical_features) categorical_features = ~numerical_features # below can lead to overlap between what is numeric and what is categorical more_cats = (pd.Series([ True if x in force_cats else False for x in list(categorical_features.index) ], index=categorical_features.index)) categorical_features = (categorical_features) | (more_cats) if self._kaggle_features: categorical_features = self.features.update_categorical_features( categorical_features) if self._debug: import uuid struuid = str(uuid.uuid4()) Xy = X.copy() Xy.loc[:, 'target'] = y Xy.to_csv("munged_%s.csv" % struuid) cat_X = X.loc[:, categorical_features] num_X = X.loc[:, numerical_features] if self._debug: print("LR: Cat names: %s" % str(list(cat_X.columns))) print("LR: Num names: %s" % str(list(num_X.columns))) # TRANSFORMERS lr_params = copy.deepcopy(self.params) lr_params.pop('grid_search_by_iterations', None) lr_params.pop('cv_search', None) grid_search = False # WIP full_features_list = [] transformers = [] if self._use_numerics and any(numerical_features.values): impute_params = {} impute_params['strategy'] = lr_params.pop('strategy', 'mean') full_features_list.extend(list(num_X.columns)) transformers.append( (make_pipeline(SimpleImputer(**impute_params), StandardScaler()), numerical_features)) # http://contrib.scikit-learn.org/categorical-encoding/ if self._use_ordinal_encoding and any(categorical_features.values): ord_params = dict(handle_missing='value', handle_unknown='value') full_features_list.extend(list(cat_X.columns)) # Note: OrdinalEncoder doesn't handle unseen features, while CategoricalEncoder used too import category_encoders as ce transformers.append( (ce.OrdinalEncoder(**ord_params), categorical_features)) if 
self._use_catboost_encoding and any(categorical_features.values): cb_params = dict(handle_missing='value', handle_unknown='value') cb_params['sigma'] = lr_params.pop('sigma') full_features_list.extend(list(cat_X.columns)) import category_encoders as ce transformers.append( (ce.CatBoostEncoder(**cb_params), categorical_features)) if self._use_woe_encoding and any(categorical_features.values): woe_params = dict(handle_missing='value', handle_unknown='value') woe_params['randomized'] = lr_params.pop('randomized') woe_params['sigma'] = lr_params.pop('sigma_woe') woe_params['regularization'] = lr_params.pop('regularization') full_features_list.extend(list(cat_X.columns)) import category_encoders as ce transformers.append( (ce.WOEEncoder(**woe_params), categorical_features)) if self._use_target_encoding and any(categorical_features.values): te_params = dict(handle_missing='value', handle_unknown='value') te_params['min_samples_leaf'] = lr_params.pop('min_samples_leaf') te_params['smoothing'] = lr_params.pop('smoothing') full_features_list.extend(list(cat_X.columns)) import category_encoders as ce transformers.append( (ce.TargetEncoder(**te_params), categorical_features)) if self._use_target_encoding_other and any( categorical_features.values): full_features_list.extend(list(cat_X.columns)) len_uniques = [] cat_X_copy = cat_X.copy() for c in cat_X.columns: le = LabelEncoder() le.fit(cat_X[c]) cat_X_copy[c] = le.transform(cat_X_copy[c]) len_uniques.append(len(le.classes_)) if self._debug: uniques_series = pd.Series(len_uniques, index=list(cat_X.columns)) print("uniques_series: %s" % uniques_series) ALPHA = 75 MAX_UNIQUE = max(len_uniques) # FEATURES_COUNT = cat_X.shape[1] cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.params['random_state']) split_cv = [cv] # split_cv = [3, 3] from target_encoding import TargetEncoder transformers.append( (TargetEncoder(alpha=ALPHA, max_unique=MAX_UNIQUE, split_in=split_cv), categorical_features)) if self._use_ohe_encoding and any(categorical_features.values): transformers.append( (OneHotEncoder(handle_unknown='ignore', sparse=True), categorical_features)) assert len(transformers) > 0, "should have some features" preprocess = make_column_transformer(*transformers) # ESTIMATOR lr_defaults = dict(penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) allowed_lr_kwargs_keys = lr_defaults.keys() lr_params_copy = copy.deepcopy(lr_params) for k, v in lr_params_copy.items(): if k not in allowed_lr_kwargs_keys: lr_params.pop(k, None) del lr_params_copy can_score = self.num_classes == 2 and 'AUC' in self.params_base[ 'score_f_name'].upper() # print("LR: can_score: %s" % str(can_score)) if can_score: scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True) else: scorer = None if not ('C' in lr_params or 'l1_ratios' in lr_params): # override self.params['cv_search'] = False if not self.params['cv_search']: estimator = LogisticRegression(**lr_params) estimator_name = 'logisticregression' else: lr_params_cv = copy.deepcopy(lr_params) if 'C' in lr_params: lr_params_cv['Cs'] = self.get_param_range( self.params['C'], self.params['fit_count'], func_type='log') # print("LR: CV: Cs: %s" % str(lr_params_cv['Cs'])) if 'l1_ratios' in lr_params: lr_params_cv['l1_ratios'] = self.get_param_range( self.params['l1_ratio'], self.params['fit_count'], func_type='linear') # print("LR: 
CV: l1_ratios: %s" % str(lr_params_cv['l1_ratios'])) lr_params_cv.pop('n_jobs', None) lr_params_cv.pop('C', None) lr_params_cv.pop('l1_ratio', None) if lr_params_cv['penalty'] == 'none': lr_params_cv['penalty'] = 'l2' estimator = LogisticRegressionCV(n_jobs=self.params['n_jobs'], cv=3, refit=True, scoring=scorer, **lr_params_cv) estimator_name = 'logisticregressioncv' # PIPELINE model = make_pipeline(preprocess, estimator) # FIT if self.params['grid_search_iterations'] and can_score: # WIP FIXME for multiclass and other scorers from sklearn.model_selection import GridSearchCV max_iter_range = self.get_param_range( self.params['max_iter'], self.params['fit_count'], range_limit=self._overfit_limit_iteration_step, func_type='log') # print("LR: max_iter_range: %s" % str(max_iter_range)) param_grid = { '%s__max_iter' % estimator_name: max_iter_range, } grid_clf = GridSearchCV(model, param_grid, n_jobs=self.params['n_jobs'], cv=3, iid=True, refit=True, scoring=scorer) grid_clf.fit(X, y) model = grid_clf.best_estimator_ # print("LR: best_index=%d best_score: %g best_params: %s" % ( # grid_clf.best_index_, grid_clf.best_score_, str(grid_clf.best_params_))) elif grid_search: # WIP from sklearn.model_selection import GridSearchCV param_grid = { 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'], '%s__C' % estimator_name: [0.1, 0.5, 1.0], } grid_clf = GridSearchCV(model, param_grid, cv=10, iid=False) grid_clf.fit(X, y) model = grid_clf.best_estimator_ # self.best_params = grid_clf.best_params_ else: model.fit(X, y) # get actual LR model lr_model = model.named_steps[estimator_name] if self._debug and False: import uuid struuid = str(uuid.uuid4()) save_obj( model.named_steps['columntransformer'].fit_transform(X, y), "columns_csr_%s.pkl" % struuid) # average importances over classes importances = np.average(np.array(lr_model.coef_), axis=0) # average iterations over classes (can't take max_iter per class) iterations = np.average(lr_model.n_iter_) # print("LR: iterations: %d" % iterations) # reduce OHE features to original names ohe_features_short = [] if self._use_ohe_encoding and any(categorical_features.values): if self._use_ohe_encoding: input_features = [x + self._ohe_postfix for x in cat_X.columns] ohe_features = pd.Series( model.named_steps['columntransformer']. 
named_transformers_['onehotencoder'].get_feature_names( input_features=input_features)) def f(x): return '_'.join(x.split(self._ohe_postfix + '_')[:-1]) # identify OHE features ohe_features_short = ohe_features.apply(lambda x: f(x)) full_features_list.extend(list(ohe_features_short)) # aggregate our own features if self._kaggle_features: self.features.aggregate(full_features_list, importances) msg = "LR: num=%d cat=%d : ohe=%d : imp=%d full=%d" % ( len(num_X.columns), len(cat_X.columns), len(ohe_features_short), len(importances), len(full_features_list)) if self._debug: print(msg) assert len(importances) == len(full_features_list), msg # aggregate importances by dai feature name importances = pd.Series( np.abs(importances), index=full_features_list).groupby(level=0).mean() assert len(importances) == len( X_orig_cols_names), "%d %d %s : %s %s" % ( len(importances), len(X_orig_cols_names), msg, str(list(X.columns)), str(list(X.dtypes))) # save hyper parameter searched results for next search self.params['max_iter'] = iterations if self.params['cv_search']: self.params['C'] = np.average(lr_model.C_, axis=0) if 'l1_ratios' in lr_params and self.params['cv_search']: self.params['l1_ratio'] = np.average(lr_model.l1_ratio_, axis=0) if 'fit_count' in self.params: self.params['fit_count'] += 1 else: self.params['fit_count'] = 0 self.set_model_properties(model=(model, self.features), features=orig_cols, importances=importances.tolist(), iterations=iterations) self.features = None
def main():
    train_pitch = pd.read_csv(TRAIN_PITCH_PATH)
    train_player = pd.read_csv(TRAIN_PLAYER_PATH)
    test_pitch = pd.read_csv(TEST_PITCH_PATH)
    test_player = pd.read_csv(TEST_PLAYER_PATH)

    train_pitch["use"] = "train"
    test_pitch["use"] = "test"
    test_pitch["投球位置区域"] = 0
    pitch_data = pd.concat([train_pitch, test_pitch],
                           axis=0).drop(PITCH_REMOVAL_COLUMNS, axis=1)
    player_data = pd.concat([train_player, test_player],
                            axis=0).drop(PLAYER_REMOVAL_COLUMNS, axis=1)  # .fillna(0)
    pitchers_data = train_player[train_player["位置"] == "投手"].drop(
        PLAYER_REMOVAL_COLUMNS, axis=1)

    merged = pd.merge(
        pitch_data,
        player_data,
        how="left",
        left_on=['年度', '投手ID'],
        right_on=['年度', '選手ID'],
    ).drop(['選手ID', '球種'], axis=1).fillna(0)
    merged = merged.rename(columns={"選手名": "投手名", "チーム名": "投手チーム名"})

    use = merged.loc[:, "use"]
    label = merged.loc[:, "投球位置区域"]
    merged = merged.drop(["use", "投球位置区域", "位置", "年度", "投手名"], axis=1)

    # Encode the categorical variables with category_encoders
    categorical_columns = [
        c for c in merged.columns if merged[c].dtype == 'object'
    ]
    # 'impute' is the older category_encoders spelling; newer releases use handle_unknown='value'
    ce_oe = ce.OrdinalEncoder(cols=categorical_columns, handle_unknown='impute')
    encoded_data = ce_oe.fit_transform(merged)
    encoded_data = cf.standardize(encoded_data)

    encoded_data = pd.concat([encoded_data, use, label], axis=1)
    train = encoded_data[encoded_data["use"] == "train"].drop(
        "use", axis=1).reset_index(drop=True)
    test = encoded_data[encoded_data["use"] == "test"].drop(
        "use", axis=1).reset_index(drop=True)

    train_x = train.drop("投球位置区域", axis=1)
    train_y = train.loc[:, "投球位置区域"].astype(int)
    test_x = test.drop("投球位置区域", axis=1).reset_index(drop=True)

    # f = partial(objective, train_x, train_y)  # fix the objective function's arguments
    # study = optuna.create_study(direction='maximize')  # tune the number of features to keep with Optuna
    # study.optimize(f, n_trials=10)  # set the number of trials
    # print('params:', study.best_params)  # print the parameters that were found
    # best_feature_count = study.best_params['n_components']
    # train_x_pca, test_x_pca = get_important_features(train_x, test_x, best_feature_count)

    num_class = 13
    # best_params = get_best_params(train_x_pca, train_y, num_class)  # search for the best hyperparameters
    n_splits = 5

    for depth, num in zip(DEPTH_NUMS, range(13, 18)):
        submission = np.zeros((len(test_x), num_class))
        print("################################")
        print(f"start {depth} depth !!")
        print("################################")
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
        for i, (tr_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
            tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
            tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
            model = get_rf_model(tr_x, tr_y, depth)
            y_preda = model.predict_proba(test_x)
            submission += y_preda
        submission_df = pd.DataFrame(submission) / n_splits
        submission_df.to_csv(f"{DATA_DIR}/submission_pitching_course{num}.csv",
                             header=False)
        print("#################################")
        print(submission_df)
        print("#################################")
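# --- Illustrative sketch (hypothetical columns; assumes category_encoders >= 2.0, where the
# old handle_unknown='impute' spelling used above became 'value'): the same
# "ordinal-encode every object-dtype column" pattern, leaving numeric columns untouched.
import pandas as pd
import category_encoders as ce

frame = pd.DataFrame({'team': ['A', 'B', 'A'],
                      'hand': ['L', 'R', 'R'],
                      'speed': [140, 150, 145]})
cat_cols = [c for c in frame.columns if frame[c].dtype == 'object']
enc = ce.OrdinalEncoder(cols=cat_cols, handle_unknown='value', handle_missing='value')
print(enc.fit_transform(frame))   # 'team' and 'hand' become integer codes; 'speed' is unchanged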
def read_data(input_data_dir='../../data/', output_dir='./'): train_data = pd.read_csv(f'{input_data_dir}/sales_train_evaluation.csv') sell_prices = pd.read_csv(f'{input_data_dir}/sell_prices.csv') calendar = pd.read_csv(f'{input_data_dir}/calendar.csv') # ---- process calendar features ---- # print('* Processing calendar features') calendar.date = pd.to_datetime(calendar.date) calendar['relative_year'] = 2016 - calendar.year # convert month, day and weekday to cyclic encodings calendar['month_sin'] = np.sin(2 * np.pi * calendar.month / 12.0) calendar['month_cos'] = np.cos(2 * np.pi * calendar.month / 12.0) calendar['day_sin'] = np.sin(2 * np.pi * calendar.date.dt.day / calendar.date.dt.days_in_month) calendar['day_cos'] = np.cos(2 * np.pi * calendar.date.dt.day / calendar.date.dt.days_in_month) calendar['weekday_sin'] = np.sin(2 * np.pi * calendar.wday / 7.0) calendar['weekday_cos'] = np.cos(2 * np.pi * calendar.wday / 7.0) # use same encoded labels for both the event name columns cal_label = ['event_name_1', 'event_name_2'] cal_label_encoded_cols = ['event_name_1_enc', 'event_name_2_enc'] calendar[cal_label_encoded_cols] = calendar[cal_label] cal_label_encoder = ce.OrdinalEncoder(cols=cal_label_encoded_cols) cal_label_encoder.fit(calendar) cal_label_encoder.mapping[1]['mapping'] = cal_label_encoder.mapping[0][ 'mapping'] calendar = cal_label_encoder.transform(calendar) # subtract one from label encoded as pytorch uses 0-indexing for col in cal_label_encoded_cols: calendar[col] = calendar[col] - 1 calendar_df = calendar[[ 'wm_yr_wk', 'd', 'snap_CA', 'snap_TX', 'snap_WI', 'relative_year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin', 'weekday_cos' ] + cal_label_encoded_cols] # ---- Merge all dfs, keep calender_df features separate and just concat them for each batch ---- # train_data.id = train_data.id.str[:-11] sell_prices['id'] = sell_prices['item_id'] + '_' + sell_prices['store_id'] # add empty columns for future data train_data = pd.concat([ train_data, pd.DataFrame(columns=['d_' + str(i) for i in range(1942, 1970)]) ]) # Encode categorical features using either one-hot or label encoding (for embeddings) print('* Encoding categorical features') label = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'] label_encoded_cols = [str(i) + '_enc' for i in label] train_data[label_encoded_cols] = train_data[label] label_encoder = ce.OrdinalEncoder(cols=[str(i) + '_enc' for i in label]) label_encoder.fit(train_data) train_data = label_encoder.transform(train_data) # subtract one from label encoded as pytorch uses 0-indexing for col in label_encoded_cols: train_data[col] = train_data[col] - 1 # Reshape, change dtypes and add previous day sales print('* Add previous day sales and merge sell prices') data_df = pd.melt(train_data, id_vars=[ 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'item_id_enc', 'dept_id_enc', 'cat_id_enc', 'store_id_enc', 'state_id_enc' ], var_name='d', value_vars=['d_' + str(i) for i in range(1, 1970)], value_name='sales') # change dtypes to reduce memory usage data_df[['sales']] = data_df[['sales']].fillna(-2).astype( np.int16) # fill future sales as -2 calendar_df[['snap_CA', 'snap_TX', 'snap_WI', 'relative_year']] = calendar_df[[ 'snap_CA', 'snap_TX', 'snap_WI', 'relative_year' ]].astype(np.int8) calendar_df[cal_label_encoded_cols] = calendar_df[ cal_label_encoded_cols].astype(np.int16) data_df[label_encoded_cols] = data_df[label_encoded_cols].astype(np.int16) # merge sell prices data_df = data_df.merge(right=calendar_df[['d', 
'wm_yr_wk']], on=['d'], how='left') data_df = data_df.merge(right=sell_prices[['id', 'wm_yr_wk', 'sell_price']], on=['id', 'wm_yr_wk'], how='left') data_df.sell_price = data_df.sell_price.fillna(0.0) data_df['prev_day_sales'] = data_df.groupby(['id'])['sales'].shift(1) # remove data for d_1 data_df.dropna(axis=0, inplace=True) calendar_df = calendar_df[calendar_df.d != 'd_1'] # change dtypes data_df[['prev_day_sales']] = data_df[['prev_day_sales']].astype(np.int16) # ---- Add previous day totals of aggregated series as features ---- # # print('* Add previous day totals of aggregated series as features') # # total # data_df = data_df.merge(right= # data_df.groupby(['d'])[['prev_day_sales']].sum().astype( # np.int32).add_suffix('_all').reset_index(), # on=['d'], how='left') # # category level # data_df = data_df.merge(right=data_df.groupby(['d', 'cat_id'])[['prev_day_sales']].sum().astype( # np.int32).reset_index().pivot( # index='d', columns='cat_id', values='prev_day_sales').add_prefix('prev_d_cat_'), # on=['d'], how='left') # # state level # data_df = data_df.merge(right= # data_df.groupby(['d', 'state_id'])[['prev_day_sales']].sum().astype( # np.int32).reset_index().pivot( # index='d', columns='state_id', values='prev_day_sales').add_prefix('prev_d_state_'), # on=['d'], how='left') # # store level # data_df = data_df.merge(right= # data_df.groupby(['d', 'store_id'])[['prev_day_sales']].sum().astype( # np.int32).reset_index().pivot( # index='d', columns='store_id', values='prev_day_sales').add_prefix('prev_d_store_'), # on=['d'], how='left') # # department level # data_df = data_df.merge(right= # data_df.groupby(['d', 'dept_id'])[['prev_day_sales']].sum().astype( # np.int32).reset_index().pivot( # index='d', columns='dept_id', values='prev_day_sales').add_prefix('prev_d_dept_'), # on=['d'], how='left') # remove category columns del data_df['wm_yr_wk'] del data_df['item_id'] del data_df['dept_id'] del data_df['cat_id'] del data_df['store_id'] del data_df['state_id'] num_samples = data_df.id.nunique() num_timesteps = data_df.d.nunique() data_df = data_df.set_index(['id', 'd']) ids = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'] enc_dec_feats = ['sell_price'] + label_encoded_cols enc_only_feats = data_df.columns.difference( ['sales', 'sell_price', 'prev_day_sales'] + enc_dec_feats) sales_data_ids = train_data[ids].values Y = data_df.sales.values.reshape(num_timesteps, num_samples).T X_enc_only_feats = np.array(data_df[enc_only_feats]).reshape( num_timesteps, num_samples, -1) X_enc_dec_feats = np.array(data_df[enc_dec_feats]).reshape( num_timesteps, num_samples, -1) X_prev_day_sales = data_df.prev_day_sales.values.reshape( num_timesteps, num_samples) calendar_index = calendar_df.d X_calendar = np.array(calendar_df.iloc[:, 2:]) X_calendar_cols = list(calendar_df.columns[2:]) # # for prev_day_sales and sales (y), set value as -1 for the period the product was not actively sold # for idx, first_non_zero_idx in enumerate((X_prev_day_sales != 0).argmax(axis=0)): # X_prev_day_sales[:first_non_zero_idx, idx] = -1 # for idx, first_non_zero_idx in enumerate((Y != 0).argmax(axis=1)): # Y[idx, :first_non_zero_idx] = -1 # ---- Save processed data ---- # print('* Save processed data') data_dict = { 'sales_data_ids': sales_data_ids, 'calendar_index': calendar_index, 'X_prev_day_sales': X_prev_day_sales, 'X_enc_only_feats': X_enc_only_feats, 'X_enc_dec_feats': X_enc_dec_feats, 'enc_dec_feat_names': enc_dec_feats, 'enc_only_feat_names': enc_only_feats, 'X_calendar': X_calendar, 'X_calendar_cols': 
X_calendar_cols, 'Y': Y, 'cal_label_encoder': cal_label_encoder, 'label_encoder': label_encoder } # pickle data with open(f'{output_dir}/data.pickle', 'wb') as f: pkl.dump(data_dict, f, protocol=pkl.HIGHEST_PROTOCOL)
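# --- Illustrative sketch (hypothetical event columns): reusing one fitted mapping for a second
# column, mirroring the cal_label_encoder.mapping[1]['mapping'] = mapping[0]['mapping'] trick
# above, so identical labels get identical codes in both columns.
import pandas as pd
import category_encoders as ce

cal = pd.DataFrame({'event_name_1': ['Easter', 'SuperBowl', 'Easter'],
                    'event_name_2': ['SuperBowl', 'Easter', 'Easter']})
enc = ce.OrdinalEncoder(cols=['event_name_1', 'event_name_2'])
enc.fit(cal)
enc.mapping[1]['mapping'] = enc.mapping[0]['mapping']  # second column reuses the first column's codes
print(enc.transform(cal))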
# n_estimators = 1000, min_samples_leaf = 2
pipeline = Pipeline([('StandardScaler', _ss),
                     ('PCA', _pca),
                     ('RandomForestClassifier', _rfc)])
searchCV = RandomizedSearchCV(pipeline,
                              param_distributions=params,
                              n_iter=5,
                              cv=3,
                              scoring='accuracy',
                              verbose=10,
                              return_train_score=True,
                              n_jobs=-1)

target_encoder = ce.OrdinalEncoder()
train_target_encoded = target_encoder.fit_transform(train_target)
train_target_encoded

searchCV.fit(train_features, train_target_encoded)

#%%
print('Cross-validation accuracy', searchCV.best_score_)
print('Best hyperparameters', searchCV.best_params_)

#%%
out = test_features[['id']].copy()

#%%
train_features.shape
def fit_(self, df: pd.DataFrame, columns: list, target: str):
    self.encoder = ce.OrdinalEncoder(
        cols=columns, handle_unknown="value", handle_missing="value"
    )
    self.encoder.fit(df.loc[:, columns])
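# --- Illustrative usage sketch (hypothetical data; a transform step the snippet above does not
# show): with handle_unknown="value", a category never seen during fit is mapped to a reserved
# code at transform time instead of raising.
import pandas as pd
import category_encoders as ce

train = pd.DataFrame({'city': ['NY', 'LA', 'NY']})
test = pd.DataFrame({'city': ['LA', 'SF']})   # 'SF' was never seen during fit
enc = ce.OrdinalEncoder(cols=['city'], handle_unknown='value', handle_missing='value')
enc.fit(train)
print(enc.transform(test))   # 'SF' gets the reserved "unknown" code (-1 in recent releases)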
df['checkin_day'] = checkin.dt.day
df['checkin_month'] = checkin.dt.month
df['checkin_year'] = checkin.dt.year

checkout = pd.to_datetime(df['booking_check_out'])
df['checkout_day'] = checkout.dt.day
df['checkout_month'] = checkout.dt.month
df['checkout_year'] = checkout.dt.year

dates = df[[
    'checkin_day', 'checkin_month', 'checkin_year',
    'checkout_day', 'checkout_month', 'checkout_year'
]]
# checkout = df['booking_check_out'].dt.date

# encode IDs as ordinals
ids = ce.OrdinalEncoder(
    cols=['listing_id', 'unit_id', 'property_id', 'area_id'])
ids = ids.fit_transform(df)
ids = ids[['listing_id', 'unit_id', 'property_id', 'area_id']]

# encode property
pType = pd.get_dummies(df.property_type, prefix="type")
pDesign = pd.get_dummies(df.property_design, prefix='design')

# encode earnings
earnings = df['usd']

# concatenate all dfs
cproperty = pd.concat([pType, pDesign], axis='columns')
dummies = pd.concat([dates, ids, cproperty, earnings], axis='columns')
# print(dummies)
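# --- Illustrative sketch (hypothetical IDs): ordinal-encoding several ID columns at once and
# inspecting the fitted mapping; codes start at 1 by default, which is why other snippets in
# this collection subtract 1 to obtain 0-based codes.
import pandas as pd
import category_encoders as ce

bookings = pd.DataFrame({'listing_id': ['a1', 'b7', 'a1', 'c3'],
                         'area_id': ['north', 'south', 'north', 'east']})
enc = ce.OrdinalEncoder(cols=['listing_id', 'area_id'])
print(enc.fit_transform(bookings))
print(enc.mapping[0]['mapping'])   # per-column lookup table: category -> integer code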
    ]
]
doPermutationTests(X, y, features, 'sum')

encoder = ce.LeaveOneOutEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'leaveoneout'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'leaveoneout')

encoder = ce.TargetEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'target'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'target')

encoder = ce.OrdinalEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'ordinal'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'ordinal')

encoder = ce.WOEEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'woe'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'woe')

df = pd.DataFrame(results, columns=['encoding', 'knn', 'rfc', 'gnb'])
df.to_csv('./acc/p10_cv50.csv')
def _fit_ordinal(self, df, target):
    ordinal_encoder = ce.OrdinalEncoder()
    ordinal_encoder.fit(df[target].map(to_str))
    name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_ordinal'
            for x in ordinal_encoder.get_feature_names()]
    self.trans_ls.append(('ordinal', name, target, ordinal_encoder))
# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Recall Score", "F1 Score", "Precision Score"]
log = pd.DataFrame(columns=log_cols)


def map_for_emb(emb):
    return {'C': 1, 'S': 2, 'Q': 3}.get(emb, 10)


data = pd.read_csv('/data.csv')
data.head()

labelencoder = LabelEncoder()
data['Gender_cat'] = labelencoder.fit_transform(data['Gender'])
data = data.drop(['Gender'], axis=1)
data.rename(columns={'Gender_cat': 'Gender'}, inplace=True)
data.head()

y = data['Survived'].values
data = data.drop(['PassengerId', 'Survived'], axis=1)

enc = ce.OrdinalEncoder(cols=['Embarked'], return_df=True)
data = enc.fit_transform(data)
X = data.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

print(data['PClass'].unique())
print(data['Sibling'].unique())
print(data['Embarked'].unique())
print(data['Gender'].unique())

X
y

accuracies = []
models = []
def preprocessing(df):
    """
    Preprocesses the data.
    Input: DataFrame
    Output: X_train_df, X_test_df, X_train, X_test, y_train, y_test
    """
    # Copying DF
    dfx = df.copy()

    ## EDA
    # Dropping Columns
    dfx.drop(columns=["host_name", "last_review", "reviews_per_month"],
             inplace=True)

    # Removing -- Custom Outliers
    dfx = dfx[(dfx["price"] > 0) & (dfx["price"] < 10000)]

    # New Column -- 'log_price'
    dfx["log_price"] = np.log(dfx["price"].values)

    # Target and Features
    target = "log_price"
    features = [
        "neighbourhood_group", "neighbourhood", "latitude", "longitude",
        "room_type", "minimum_nights", "number_of_reviews",
        "calculated_host_listings_count", "availability_365"
    ]

    # X Features Matrix
    X = dfx[features]
    # y target vector
    y = dfx[target]

    # Mapping - 'room_type' (assign the result, otherwise the mapping is discarded)
    room_type_dict = {
        "Shared room": 1,
        "Private room": 2,
        "Entire home/apt": 3
    }
    X.iloc[:, 4] = X.iloc[:, 4].map(room_type_dict)
    # print(X["room_type"])

    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.2,
                                                        random_state=42)

    # Preprocess Pipeline -- OrdinalEncoder and StandardScaler
    preprocess = make_pipeline(ce.OrdinalEncoder(), StandardScaler())

    # Fit-transform the training data, transform the testing data
    X_train = preprocess.fit_transform(X_train)
    X_test = preprocess.transform(X_test)

    # Create DataFrames for the X matrices
    X_train_df = pd.DataFrame(X_train, columns=features)
    X_test_df = pd.DataFrame(X_test, columns=features)

    print(X_train_df.shape, X_test_df.shape, X_train.shape, X_test.shape,
          y_train.shape, y_test.shape)

    return X_train_df, X_test_df, X_train, X_test, y_train, y_test
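# --- Illustrative sketch (hypothetical listing rows): the same preprocess idea as above --
# ce.OrdinalEncoder followed by StandardScaler -- fit on training rows only and then applied
# to held-out rows, so test categories are encoded with the training mapping.
import pandas as pd
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_tr = pd.DataFrame({'room_type': ['Private room', 'Entire home/apt', 'Shared room'],
                     'minimum_nights': [1, 3, 2]})
X_te = pd.DataFrame({'room_type': ['Entire home/apt', 'Private room'],
                     'minimum_nights': [2, 5]})
preprocess = make_pipeline(ce.OrdinalEncoder(), StandardScaler())
print(preprocess.fit_transform(X_tr))   # fit the encoder and scaler on the training split
print(preprocess.transform(X_te))       # reuse the fitted mapping and statistics on the test split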
    y_trainval,
    test_size=0.05,
    train_size=0.10,
    stratify=y_trainval,
    random_state=42,
)

train_id = X_train["id"]
val_id = X_val["id"]
test_id = X_test["id"]
X_train = X_train.drop("id", axis=1)
X_val = X_val.drop("id", axis=1)
X_test = X_test.drop("id", axis=1)

x_processor = make_pipeline(ce.OrdinalEncoder(),
                            SimpleImputer(strategy="median"))
y_processor = make_pipeline(ce.OrdinalEncoder(),
                            SimpleImputer(strategy="median"))

cols = X_train.columns
len(cols)


def prepare_inputs(X_train, X_val, X_test):
    X_train_enc = pd.DataFrame(x_processor.fit_transform(X_train), columns=cols)
    X_val_enc = pd.DataFrame(x_processor.transform(X_val), columns=cols)
    X_test_enc = pd.DataFrame(x_processor.transform(X_test), columns=cols)
    return X_train_enc, X_val_enc, X_test_enc
train['K'] = train['FloorPlan'].map(lambda x: 1 if 'K' in str(x) else 0)
train['S'] = train['FloorPlan'].map(lambda x: 1 if 'S' in str(x) else 0)
train['R'] = train['FloorPlan'].map(lambda x: 1 if 'R' in str(x) else 0)
train['Maisonette'] = train['FloorPlan'].map(lambda x: 1 if 'メゾネット' in str(x) else 0)
train['OpenFloor'] = train['FloorPlan'].map(lambda x: 1 if 'オープンフロア' in str(x) else 0)
train['Studio'] = train['FloorPlan'].map(lambda x: 1 if 'スタジオ' in str(x) else 0)

Label_Enc_list = [
    'Type', 'NearestStation', 'FloorPlan', 'CityPlanning', 'Structure',
    'Direction', 'Classification', 'Municipality', 'Region', 'Remarks',
    'Renovation'
]

# Convert strings to ordinal codes
ce_oe = ce.OrdinalEncoder(cols=Label_Enc_list, handle_unknown='impute')
train = ce_oe.fit_transform(train)

# Shift the codes from 1-based to 0-based
for i in Label_Enc_list:
    train[i] = train[i] - 1

# Convert to int
for i in Label_Enc_list:
    train[i] = train[i].astype("int")

#------------------------
# Build and train the prediction model
#------------------------
# Assign the target and explanatory variables
X = train[[
    'TimeToNearestStation', 'FloorAreaRatio', 'CityPlanning', 'BuildingAD',
def LabelEncoding(self, data, column):
    encoder = ce.OrdinalEncoder(cols=[column], return_df=True)
    return encoder.fit_transform(data)
def test_numbers_as_strings_with_numpy_output(self):
    # see issue #229
    X = np.array(['11', '12', '13', '14', '15'])
    oe = encoders.OrdinalEncoder(return_df=False)
    oe.fit(X)
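# --- Illustrative sketch (not part of the test above): with return_df=False the encoder
# returns a NumPy array instead of a DataFrame, which is what assertion-style checks like
# the one above rely on.
import numpy as np
import category_encoders as encoders

X = np.array(['11', '12', '13', '14', '15'])
oe = encoders.OrdinalEncoder(return_df=False)
codes = oe.fit_transform(X)
print(type(codes), codes.ravel())   # numpy.ndarray with one integer code per string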
importances1 = pd.Series(rf.feature_importances_, encoded.columns)

# Plot feature importances
n = 20
plt.figure(figsize=(10, n / 2))
plt.title(f'Top {n} features')
importances1.sort_values()[-n:].plot.barh(color='grey');

# Commented out IPython magic to ensure Python compatibility.
# Generate validation curves
# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(),
    DecisionTreeClassifier()
)

depth = range(1, 10, 2)
train_scores, val_scores = validation_curve(
    pipeline, X_train, y_train,
    param_name='decisiontreeclassifier__max_depth',
    param_range=depth, scoring='accuracy',
    cv=3,
    n_jobs=-1
)

plt.figure(dpi=150)
plt.plot(depth, np.mean(train_scores, axis=1), color='blue', label='training error')
features = dataset.drop(dataset.columns[-1], axis=1)
target = dataset.iloc[:, -1]

import warnings
warnings.filterwarnings("ignore")

"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from cesamo import CESAMOEncoder
from entity_embedding import EntityEmbeddingEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(),
    'AgingPP': AgingPPEncoder(),
    'SimplePP': SimplePPEncoder(),
    'CESAMOEncoder': CESAMOEncoder()
}
"""END: Import encoders"""

"""START: Import models"""
data_set = pd.read_csv("data.csv", sep=",", header=0)
#data_set.head(3)

# In[3]:

# Turn 'time' into one-hot labels
import category_encoders as ce

# List the columns to encode; multiple columns are allowed.
list_cols = ['time']

# Specify the columns to one-hot encode, and how to fill nulls / unknown values.
#ce_ohe = ce.OneHotEncoder(cols=list_cols,handle_unknown='impute')
ce_oe = ce.OrdinalEncoder(cols=list_cols, handle_unknown='impute')

# Pass the pd.DataFrame directly
df_session_ce_ordinal = ce_oe.fit_transform(data_set)
#df_session_ce_ordinal.head(350)

# In[4]:

#print(df_session_ce_ordinal.columns.values)

# In[28]:

# Split the data
(train, test) = train_test_split(df_session_ce_ordinal,
                                 test_size=0.2,
df['date_recorded'] = pandas.to_datetime(df['date_recorded']).dt.year
df['date_recorded'] = df['date_recorded'].astype('int32')
df['construction_year'] = df['construction_year'].astype('int32')
df['construction_year'] = df['construction_year'].replace(0, np.nan)
df = df.dropna(subset=['construction_year'])
df['date_recorded'] = df['date_recorded'] - df['construction_year']

# drop redundant features
df.drop(df.columns[[
    0, 8, 9, 11, 12, 13, 14, 15, 16, 19, 21, 23, 25, 26, 28, 30, 34, 36, 37,
    39
]],
        axis=1,
        inplace=True)

# transform categorical variables to numeric
encoder = ce.OrdinalEncoder(cols=['status_group'])
df = encoder.fit_transform(df)
df = df.apply(pandas.to_numeric, errors='ignore')
encoder = ce.BinaryEncoder()
df = encoder.fit_transform(df)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

# Store contents of df into an array
array = df.values
X = array[:, 0:67]
Y = array[:, 68]

# run Ada Boost algorithm
seed = 7
k = 10
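# --- Illustrative sketch (hypothetical pump-status frame): the same two-step idea as above --
# ordinal-encode the label column, then let BinaryEncoder pick up the remaining object columns,
# which yields a few 0/1 columns per categorical feature instead of one code column.
import pandas as pd
import category_encoders as ce

frame = pd.DataFrame({'status_group': ['functional', 'non functional', 'functional'],
                      'basin': ['lake', 'river', 'lake'],
                      'gps_height': [1390, 686, 263]})
frame = ce.OrdinalEncoder(cols=['status_group']).fit_transform(frame)
frame = ce.BinaryEncoder().fit_transform(frame)   # encodes 'basin'; numeric columns pass through
print(frame.head())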
'OneHot': ce.OneHotEncoder(cols=ordinal_features),
'Ordinal': ce.OrdinalEncoder(mapping=[
    {'col': 'ExterQual', 'mapping': ordinal_mapping_1},
    {'col': 'ExterCond', 'mapping': ordinal_mapping_1},
    {'col': 'BsmtQual', 'mapping': ordinal_mapping_1},
    {'col': 'BsmtCond', 'mapping': ordinal_mapping_1},
    {'col': 'BsmtExposure', 'mapping': ordinal_mapping_2},
    {'col': 'BsmtFinType1', 'mapping': ordinal_mapping_3},
    {'col': 'BsmtFinType2', 'mapping': ordinal_mapping_3},
    {'col': 'HeatingQC', 'mapping': ordinal_mapping_1},
    {'col': 'KitchenQual', 'mapping': ordinal_mapping_1},
    {'col': 'FireplaceQu', 'mapping': ordinal_mapping_1},
    {'col': 'GarageQual', 'mapping': ordinal_mapping_1},
    {'col': 'GarageCond', 'mapping': ordinal_mapping_1},
    {'col': 'PoolQC', 'mapping': ordinal_mapping_1},
    {'col': 'Fence', 'mapping': ordinal_mapping_4}],
    cols=ordinal_features),
'Binary Ordinal': ce.OrdinalEncoder(mapping=[
def ordinal_encode(data):
    encoding_data = data.copy()
    # OrdinalEncoder's first positional argument is `verbose`, so the frame must not be
    # passed to the constructor; pass it to fit_transform instead.
    encoder = ce.OrdinalEncoder()
    data_encoded = encoder.fit_transform(encoding_data)
    return data_encoded
ImputedX = imputer.fit_transform(X)

# Convert output to a data frame to show the stats
imputed_df = pd.DataFrame.from_records(ImputedX)
imputed_df.columns = features
imputed_df['Country'] = swine_data['Country']
imputed_df['Cases'] = swine_data['Cases']
imputed_df['Update Time'] = swine_data['Update Time']

# print('---------------------------------------')
missing_0_values_count = imputed_df.isnull().sum()
# print(missing_0_values_count)

# Categorical Encoders
import category_encoders as ce
enc = ce.OrdinalEncoder(cols=["Country", "Update Time"],
                        handle_missing='return_nan',
                        return_df=True)
# We now fit the model and transform the data and put it in X which is a dataframe
X = enc.fit_transform(imputed_df)

# Outlier Detection
from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(X)
totalOutliers = 0
for pred in y_pred:
    if pred == -1:
        totalOutliers = totalOutliers + 1
print("Number of predicted outliers:", totalOutliers)
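# --- Illustrative sketch (hypothetical rows): what handle_missing='return_nan' does here --
# known categories get integer codes, while missing values stay NaN in the encoded output
# rather than being given a sentinel code.
import numpy as np
import pandas as pd
import category_encoders as ce

rows = pd.DataFrame({'Country': ['Mexico', 'Canada', np.nan, 'Mexico']})
enc = ce.OrdinalEncoder(cols=['Country'], handle_missing='return_nan', return_df=True)
print(enc.fit_transform(rows))   # the NaN row remains NaN in the 'Country' column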
    'vote.arff',
    'vowel.arff'
]

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.GaussEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.LogOddsRatioEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
def run_a_experiments(): print("Loading Data") df = load_data() #columns: continuous = [ 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week' ] categorical = [ 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country' ] X = df[continuous + categorical] y = df[['class']] successes = y.sum()[0] alpha_prior = float(successes / len(y)) models = [ LogisticRegression(solver='lbfgs'), RandomForestClassifier(n_estimators=100), GradientBoostingClassifier(), MLPClassifier() ] #models = [RandomForestClassifier()] results = [[ 'model', 'Encoder', 'Accuracy', 'STD', 'Training Time', 'Sparsity', 'Dimensions' ]] for model in models: print("") print("----------------------") print("Testing Algorithm: ") print(type(model)) print("----------------------") #TargetEncoder print("TargetEncoder Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=ce.TargetEncoder(return_df=False)) results.append([ type(model), 'TargetEncoder', acc, std, time, sparsity, dimensions ]) #OrdinalEncoder print("OrdinalEncoder Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=ce.OrdinalEncoder(return_df=False)) results.append([ type(model), 'OrdinalEncoder', acc, std, time, sparsity, dimensions ]) #BinaryEncoder print("BinaryEncoder Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=ce.BinaryEncoder(return_df=False)) results.append([ type(model), 'BinaryEncoder', acc, std, time, sparsity, dimensions ]) #HashingEncoder print("HashingEncoder Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=ce.HashingEncoder(return_df=False)) results.append([ type(model), 'HashingEncoder', acc, std, time, sparsity, dimensions ]) #OneHotEncoder print("OneHotEncoder Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=OneHotEncoder(handle_unknown='ignore', sparse=False)) results.append([ type(model), 'OneHotEncoder', acc, std, time, sparsity, dimensions ]) #BetaEncoder (mean) print("Beta Encoder (mean) Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=BetaEncoder(alpha=alpha_prior, beta=1 - alpha_prior)) results.append([ type(model), 'BetaEncoder (m)', acc, std, time, sparsity, dimensions ]) #BetaEncoder (mean, variance) print("Beta Encoder (mean and variance Results:") acc, std, time, sparsity, dimensions = cv_binary_classification( model, X, y, continuous, categorical, encoder=BetaEncoder(alpha=alpha_prior, beta=1 - alpha_prior), moments='mv') results.append([ type(model), 'BetaEncoder (mv)', acc, std, time, sparsity, dimensions ]) file = 'adult_experiments.csv' with open(file, "w") as output: writer = csv.writer(output, lineterminator='\n') writer.writerows(results) try: upload_file(file) except: print("File Not Uploaded")
# In[21]:

target_encoder.fit(df, y=df.loan_status)

# In[22]:

encoded_df = target_encoder.transform(df)

# In[23]:

encoded_df.head()

# In[24]:

ordinal_encoder = ce.OrdinalEncoder(cols=['term'])
ordinal_encoder.fit(encoded_df)
encoded_df = ordinal_encoder.transform(encoded_df)

# In[25]:

encoded_df.shape

# In[26]:

encoded_df.head()

# In[27]:

encoded_df.to_csv('../Processed Data/df_processed_categorical_v3.csv', index=False)
    y = df['dep_delayed_15min']
    return X, y


X, y = split_df(train)

print('Train Partition')
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2)

print('Building pipeline')
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', ce.OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['Distance']),
        ('cat', categorical_transformer, ['UniqueCarrier', 'Origin', 'Dest',
                                          'Day_of_Week', 'year', 'month', 'day', 'hour', 'minutes'])])

rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier())])

param_grid = {'classifier__n_estimators': [400]}

print('Running Model')
CV = GridSearchCV(rf, param_grid, n_jobs=-1, scoring='roc_auc')
CV.fit(X_train, y_train)
# print(CV.get_params())
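# --- Illustrative sketch (hypothetical flight rows): the same ColumnTransformer layout in
# miniature -- median-impute/scale the numeric column, constant-impute/ordinal-encode the
# categorical one -- so all preprocessing is fit inside the pipeline during cross-validation.
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

rows = pd.DataFrame({'Distance': [300.0, np.nan, 1200.0],
                     'UniqueCarrier': ['AA', 'DL', np.nan]})
pre = ColumnTransformer(transformers=[
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median')),
                      ('scaler', StandardScaler())]), ['Distance']),
    ('cat', Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                      ('ordinal', ce.OrdinalEncoder())]), ['UniqueCarrier']),
])
print(pre.fit_transform(rows))   # scaled distance plus integer codes for the carrier column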
def test_ordinal(self):
    """
    :return:
    """
    cols = ['C1', 'D', 'E', 'F']
    X = self.create_dataset(n_rows=1000)
    X_t = self.create_dataset(n_rows=100)
    X_t_extra = self.create_dataset(n_rows=100, extras=True)

    enc = encoders.OrdinalEncoder(verbose=1, cols=cols)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))

    enc = encoders.OrdinalEncoder(verbose=1)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))

    enc = encoders.OrdinalEncoder(verbose=1, drop_invariant=True)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))

    enc = encoders.OrdinalEncoder(verbose=1, return_df=False)
    enc.fit(X, None)
    self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))

    enc = encoders.OrdinalEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='impute')
    enc.fit(X, None)
    out = enc.transform(X_t_extra)
    self.assertEqual(len(set(out['D'].values)), 4)
    self.assertIn(0, set(out['D'].values))
    self.assertFalse(enc.mapping is None)
    self.assertTrue(len(enc.mapping) > 0)

    enc = encoders.OrdinalEncoder(verbose=1, mapping=enc.mapping, return_df=True, impute_missing=True,
                                  handle_unknown='impute')
    enc.fit(X, None)
    out = enc.transform(X_t_extra)
    self.assertEqual(len(set(out['D'].values)), 4)
    self.assertIn(0, set(out['D'].values))
    self.assertTrue(len(enc.mapping) > 0)

    enc = encoders.OrdinalEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore')
    enc.fit(X, None)
    out = enc.transform(X_t_extra)
    out_cats = [x for x in set(out['D'].values) if np.isfinite(x)]
    self.assertEqual(len(out_cats), 3)
    self.assertFalse(enc.mapping is None)

    enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='error')
    enc.fit(X, None)
    with self.assertRaises(ValueError):
        out = enc.transform(X_t_extra)

    # test inverse_transform
    X = self.create_dataset(n_rows=1000, has_none=False)
    X_t = self.create_dataset(n_rows=100, has_none=False)
    X_t_extra = self.create_dataset(n_rows=100, extras=True, has_none=False)

    enc = encoders.OrdinalEncoder(verbose=1)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))
    self.verify_inverse_transform(X_t, enc.inverse_transform(enc.transform(X_t)))
    with self.assertRaises(ValueError):
        out = enc.inverse_transform(enc.transform(X_t_extra))
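# --- Illustrative sketch (toy frame, mirroring the inverse_transform checks above): encoding
# and then inverse-transforming recovers the original labels, provided every value seen at
# transform time was also seen during fit.
import pandas as pd
import category_encoders as encoders

X = pd.DataFrame({'D': ['a', 'b', 'a', 'c']})
enc = encoders.OrdinalEncoder()
codes = enc.fit_transform(X)
restored = enc.inverse_transform(codes)
print(codes['D'].tolist(), restored['D'].tolist())   # e.g. [1, 2, 1, 3] back to ['a', 'b', 'a', 'c']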
happiness_sep_df = happiness_sep_df[happiness_sep_df['B.2.2'] != 'N.C.']
happiness_sep_df = happiness_sep_df[happiness_sep_df['B.2.2'] != 'N.S.']
happiness_sep_df = happiness_sep_df.dropna()
print(happiness_sep_df['B.2.1'].sort_values(ascending=True).unique())
print(happiness_sep_df['B.2.2'].sort_values(ascending=True).unique())

# data munging :: encode ordinal features with category_encoders.OrdinalEncoder
import category_encoders as ce

ordinals = pd.DataFrame({
    'situacion_actual': ['Muy Buena', 'Buena', 'Regular', 'Mala', 'Muy mala'],
    'valor': [2, 0, 1, 3, 4]
})
XB21 = ordinals.drop('valor', axis=1)
yB21 = ordinals.drop('situacion_actual', axis=1)

ce_ordB21 = ce.OrdinalEncoder(cols=['situacion_actual'])
ce_ordB21 = ce_ordB21.fit_transform(XB21, yB21['valor'])
print(ce_ordB21)

# note: the column to encode goes in `cols=`; the first positional argument of
# OrdinalEncoder is `verbose`, not the data
ceX21 = ce.OrdinalEncoder(cols=['B.2.1'])
ceX21 = ceX21.fit_transform(happiness_sep_df['B.2.1'], yB21['valor'])
happiness_sep_df['B.2.1_valor'] = ceX21  # seems bigger numbers for worse qualitative data ... ?

ceX22, yB22 = ce.OrdinalEncoder(cols=['B.2.2']), yB21
ceX22 = ceX22.fit_transform(happiness_sep_df['B.2.2'], yB21['valor'])
happiness_sep_df['B.2.2_valor'] = ceX22  # maintain same values (as it has to be)

print(happiness_sep_df)
# resulting dataframe: