Example #1
print(df.isnull().sum())
print(df.info())
print(df['score'].value_counts())

# # Change Categorical values for Ordinal Variables
# scores={'Low': 0,'Medium' : 1,'High': 2}
# df['score']=df['score'].map(scores)
# #same thing in one line of code
# # df['score']=df['score'].map({'Low': 0,'Medium' : 1,'High': 2})
# print(df.head())

encoder = ce.OrdinalEncoder(cols=['score'],
                            return_df=True,
                            mapping=[{
                                'col': 'score',
                                'mapping': {
                                    'Low': 0,
                                    'Medium': 1,
                                    'High': 2
                                }
                            }])
newDF = encoder.fit_transform(df)

#Nominal Categorical Variables

#1 Pandas get_dummies

df_dummies = pd.get_dummies(newDF,
                            columns=['instructor', 'course', 'semester'],
                            drop_first=True)
print(df_dummies.head().T)
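The dummy expansion above is one way to handle the nominal columns; the same result could come straight from category_encoders. A small alternative sketch, assuming the newDF produced by the ordinal step above (use_cat_names simply keeps the original category labels in the generated column names):

#2 category_encoders OneHotEncoder (alternative sketch)
import category_encoders as ce

ohe = ce.OneHotEncoder(cols=['instructor', 'course', 'semester'],
                       use_cat_names=True,
                       return_df=True)
df_onehot = ohe.fit_transform(newDF)
print(df_onehot.head().T)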
Example #2
def run_bs_experiments():
    print("Loading Data")
    df = load_data()
    #columns:
    continuous = ['temp', 'atemp', 'hum', 'windspeed']
    categorical = [
        'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
        'workingday', 'weathersit'
    ]

    X = df[continuous + categorical]
    y = df[['cnt']]

    models = [
        Ridge(),
        RandomForestRegressor(n_estimators=100),
        GradientBoostingRegressor(),
        MLPRegressor()
    ]
    #models = [RandomForestRegressor()]

    results = [[
        'model', 'Encoder', 'R2', 'STD', 'Training Time', 'Sparsity',
        'Dimensions'
    ]]

    for model in models:
        print("")
        print("----------------------")
        print("Testing Algorithm: ")
        print(type(model))
        print("----------------------")

        #TargetEncoder
        print("TargetEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.TargetEncoder(return_df=False))
        results.append([
            type(model), 'TargetEncoder', r2, std, time, sparsity, dimensions
        ])

        #OrdinalEncoder
        print("OrdinalEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.OrdinalEncoder(return_df=False))
        results.append([
            type(model), 'OrdinalEncoder', r2, std, time, sparsity, dimensions
        ])

        #BinaryEncoder
        print("BinaryEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.BinaryEncoder(return_df=False))
        results.append([
            type(model), 'BinaryEncoder', r2, std, time, sparsity, dimensions
        ])

        #HashingEncoder
        print("HashingEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.HashingEncoder(return_df=False))
        results.append([
            type(model), 'HashingEncoder', r2, std, time, sparsity, dimensions
        ])

        #OneHotEncoder
        print("OneHotEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=OneHotEncoder(handle_unknown='ignore', sparse=False))
        results.append([
            type(model), 'OneHotEncoder', r2, std, time, sparsity, dimensions
        ])

        print("GIG Encoder (mean) Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical, encoder=GIGEncoder())
        results.append([
            type(model), 'GIGEncoder (m)', r2, std, time, sparsity, dimensions
        ])

        print("GIG Encoder (mean and variance Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=GIGEncoder(),
            moments='mv')
        results.append([
            type(model), 'GIGEncoder (mv)', r2, std, time, sparsity, dimensions
        ])

    file = 'bike_sharing_experiments.csv'
    with open(file, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(results)
    try:
        upload_file(file)
    except Exception:
        print("File Not Uploaded")
Example #3
train.drop(["id", "rent"], axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)

use_cols = []

####################
## Preprocess data
####################

### location ###
train["districts"] = train["location"].apply(
    lambda x: re.search("(?<=都)(.*?)(?=区)", x).group())
test["districts"] = test["location"].apply(
    lambda x: re.search("(?<=都)(.*?)(?=区)", x).group())

ce_ordinal = ce.OrdinalEncoder(cols=["districts"], handle_missing="value")
train = ce_ordinal.fit_transform(train)
test = ce_ordinal.transform(test)
use_cols.append("districts")

### access ###
train["mins_to_nearest_sta"] = train["access"].apply(
    lambda x: min(map(int, re.findall("(?<=徒歩)(.*?)(?=分)", x))))
test["mins_to_nearest_sta"] = test["access"].apply(
    lambda x: min(map(int, re.findall("(?<=徒歩)(.*?)(?=分)", x))))
use_cols.append("mins_to_nearest_sta")

### layout ###
train["num_room"] = train["layout"].apply(
    lambda x: int(re.search("[0-9]", x).group()))
test["num_room"] = test["layout"].apply(
Example #4
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

        min_count = np.min(np.unique(y, return_counts=True)[1])
        if min_count < 9:
            self.params['cv_search'] = False
        if min_count < 3:
            self.params['grid_search_iterations'] = False
            self.params['cv_search'] = False

        # save pre-datatable-imputed X
        X_dt = X

        # Apply OOB imputation
        self.oob_imputer = OOBImpute(self._impute_num_type,
                                     self._impute_int_type,
                                     self._impute_bool_type,
                                     self._impute_cat_type, self._oob_bool,
                                     self._oob_cat)
        X = self.oob_imputer.fit_transform(X)

        # convert to pandas for sklearn
        X = X.to_pandas()
        X_orig_cols_names = list(X.columns)
        if self._kaggle_features:
            self.features = make_features()
            X = self.features.fit_transform(X)
        else:
            self.features = None
        # print("LR: pandas dtypes: %s" % (str(list(X.dtypes))))

        # FEATURE GROUPS

        # Choose which features are numeric or categorical
        cat_features = [
            x for x in X_orig_cols_names
            if CatOriginalTransformer.is_me_transformed(x)
        ]
        catlabel_features = [
            x for x in X_orig_cols_names if CatTransformer.is_me_transformed(x)
        ]
        # can add explicit column name list to below force_cats
        force_cats = cat_features + catlabel_features

        # choose if numeric is treated as categorical
        if not self._num_as_cat:
            numerical_features = (X.dtypes == 'float') | (
                X.dtypes == 'float32') | (X.dtypes == 'float64')
        else:
            numerical_features = X.dtypes == 'invalid'
            # force oob imputation for numerics
            self.oob_imputer = OOBImpute('oob', 'oob', 'oob',
                                         self._impute_cat_type, self._oob_bool,
                                         self._oob_cat)
            X = self.oob_imputer.fit_transform(X_dt)
            X = X.to_pandas()
            X = self.features.fit_transform(X)
        if self._kaggle_features:
            numerical_features = self.features.update_numerical_features(
                numerical_features)

        categorical_features = ~numerical_features
        # below can lead to overlap between what is numeric and what is categorical
        more_cats = (pd.Series([
            True if x in force_cats else False
            for x in list(categorical_features.index)
        ],
                               index=categorical_features.index))
        categorical_features = (categorical_features) | (more_cats)
        if self._kaggle_features:
            categorical_features = self.features.update_categorical_features(
                categorical_features)

        if self._debug:
            import uuid
            struuid = str(uuid.uuid4())
            Xy = X.copy()
            Xy.loc[:, 'target'] = y
            Xy.to_csv("munged_%s.csv" % struuid)

        cat_X = X.loc[:, categorical_features]
        num_X = X.loc[:, numerical_features]
        if self._debug:
            print("LR: Cat names: %s" % str(list(cat_X.columns)))
            print("LR: Num names: %s" % str(list(num_X.columns)))

        # TRANSFORMERS
        lr_params = copy.deepcopy(self.params)
        lr_params.pop('grid_search_by_iterations', None)
        lr_params.pop('cv_search', None)
        grid_search = False  # WIP

        full_features_list = []
        transformers = []
        if self._use_numerics and any(numerical_features.values):
            impute_params = {}
            impute_params['strategy'] = lr_params.pop('strategy', 'mean')
            full_features_list.extend(list(num_X.columns))
            transformers.append(
                (make_pipeline(SimpleImputer(**impute_params),
                               StandardScaler()), numerical_features))
        # http://contrib.scikit-learn.org/categorical-encoding/
        if self._use_ordinal_encoding and any(categorical_features.values):
            ord_params = dict(handle_missing='value', handle_unknown='value')
            full_features_list.extend(list(cat_X.columns))
            # Note: OrdinalEncoder doesn't handle unseen features, while CategoricalEncoder used to
            import category_encoders as ce
            transformers.append(
                (ce.OrdinalEncoder(**ord_params), categorical_features))
        if self._use_catboost_encoding and any(categorical_features.values):
            cb_params = dict(handle_missing='value', handle_unknown='value')
            cb_params['sigma'] = lr_params.pop('sigma')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.CatBoostEncoder(**cb_params), categorical_features))
        if self._use_woe_encoding and any(categorical_features.values):
            woe_params = dict(handle_missing='value', handle_unknown='value')
            woe_params['randomized'] = lr_params.pop('randomized')
            woe_params['sigma'] = lr_params.pop('sigma_woe')
            woe_params['regularization'] = lr_params.pop('regularization')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.WOEEncoder(**woe_params), categorical_features))
        if self._use_target_encoding and any(categorical_features.values):
            te_params = dict(handle_missing='value', handle_unknown='value')
            te_params['min_samples_leaf'] = lr_params.pop('min_samples_leaf')
            te_params['smoothing'] = lr_params.pop('smoothing')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.TargetEncoder(**te_params), categorical_features))
        if self._use_target_encoding_other and any(
                categorical_features.values):
            full_features_list.extend(list(cat_X.columns))
            len_uniques = []
            cat_X_copy = cat_X.copy()
            for c in cat_X.columns:
                le = LabelEncoder()
                le.fit(cat_X[c])
                cat_X_copy[c] = le.transform(cat_X_copy[c])
                len_uniques.append(len(le.classes_))
            if self._debug:
                uniques_series = pd.Series(len_uniques,
                                           index=list(cat_X.columns))
                print("uniques_series: %s" % uniques_series)
            ALPHA = 75
            MAX_UNIQUE = max(len_uniques)
            # FEATURES_COUNT = cat_X.shape[1]
            cv = StratifiedKFold(n_splits=5,
                                 shuffle=True,
                                 random_state=self.params['random_state'])
            split_cv = [cv]
            # split_cv = [3, 3]
            from target_encoding import TargetEncoder
            transformers.append(
                (TargetEncoder(alpha=ALPHA,
                               max_unique=MAX_UNIQUE,
                               split_in=split_cv), categorical_features))
        if self._use_ohe_encoding and any(categorical_features.values):
            transformers.append(
                (OneHotEncoder(handle_unknown='ignore',
                               sparse=True), categorical_features))
        assert len(transformers) > 0, "should have some features"

        preprocess = make_column_transformer(*transformers)

        # ESTIMATOR
        lr_defaults = dict(penalty='l2',
                           dual=False,
                           tol=1e-4,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None,
                           random_state=None,
                           solver='warn',
                           max_iter=100,
                           multi_class='warn',
                           verbose=0,
                           warm_start=False,
                           n_jobs=None,
                           l1_ratio=None)
        allowed_lr_kwargs_keys = lr_defaults.keys()
        lr_params_copy = copy.deepcopy(lr_params)
        for k, v in lr_params_copy.items():
            if k not in allowed_lr_kwargs_keys:
                lr_params.pop(k, None)
        del lr_params_copy

        can_score = self.num_classes == 2 and 'AUC' in self.params_base[
            'score_f_name'].upper()
        # print("LR: can_score: %s" % str(can_score))
        if can_score:
            scorer = make_scorer(roc_auc_score,
                                 greater_is_better=True,
                                 needs_proba=True)
        else:
            scorer = None

        if not ('C' in lr_params or 'l1_ratios' in lr_params):
            # override
            self.params['cv_search'] = False

        if not self.params['cv_search']:
            estimator = LogisticRegression(**lr_params)
            estimator_name = 'logisticregression'
        else:
            lr_params_cv = copy.deepcopy(lr_params)
            if 'C' in lr_params:
                lr_params_cv['Cs'] = self.get_param_range(
                    self.params['C'],
                    self.params['fit_count'],
                    func_type='log')
                # print("LR: CV: Cs: %s" % str(lr_params_cv['Cs']))
            if 'l1_ratios' in lr_params:
                lr_params_cv['l1_ratios'] = self.get_param_range(
                    self.params['l1_ratio'],
                    self.params['fit_count'],
                    func_type='linear')
                # print("LR: CV: l1_ratios: %s" % str(lr_params_cv['l1_ratios']))
            lr_params_cv.pop('n_jobs', None)
            lr_params_cv.pop('C', None)
            lr_params_cv.pop('l1_ratio', None)
            if lr_params_cv['penalty'] == 'none':
                lr_params_cv['penalty'] = 'l2'
            estimator = LogisticRegressionCV(n_jobs=self.params['n_jobs'],
                                             cv=3,
                                             refit=True,
                                             scoring=scorer,
                                             **lr_params_cv)
            estimator_name = 'logisticregressioncv'

        # PIPELINE
        model = make_pipeline(preprocess, estimator)

        # FIT
        if self.params['grid_search_iterations'] and can_score:
            # WIP FIXME for multiclass and other scorers
            from sklearn.model_selection import GridSearchCV

            max_iter_range = self.get_param_range(
                self.params['max_iter'],
                self.params['fit_count'],
                range_limit=self._overfit_limit_iteration_step,
                func_type='log')
            # print("LR: max_iter_range: %s" % str(max_iter_range))
            param_grid = {
                '%s__max_iter' % estimator_name: max_iter_range,
            }
            grid_clf = GridSearchCV(model,
                                    param_grid,
                                    n_jobs=self.params['n_jobs'],
                                    cv=3,
                                    iid=True,
                                    refit=True,
                                    scoring=scorer)
            grid_clf.fit(X, y)
            model = grid_clf.best_estimator_
            # print("LR: best_index=%d best_score: %g best_params: %s" % (
            #    grid_clf.best_index_, grid_clf.best_score_, str(grid_clf.best_params_)))
        elif grid_search:
            # WIP
            from sklearn.model_selection import GridSearchCV

            param_grid = {
                'columntransformer__pipeline__simpleimputer__strategy':
                ['mean', 'median'],
                '%s__C' % estimator_name: [0.1, 0.5, 1.0],
            }
            grid_clf = GridSearchCV(model, param_grid, cv=10, iid=False)
            grid_clf.fit(X, y)
            model = grid_clf.best_estimator_
            # self.best_params = grid_clf.best_params_
        else:
            model.fit(X, y)

        # get actual LR model
        lr_model = model.named_steps[estimator_name]

        if self._debug and False:
            import uuid
            struuid = str(uuid.uuid4())
            save_obj(
                model.named_steps['columntransformer'].fit_transform(X, y),
                "columns_csr_%s.pkl" % struuid)

        # average importances over classes
        importances = np.average(np.array(lr_model.coef_), axis=0)
        # average iterations over classes (can't take max_iter per class)
        iterations = np.average(lr_model.n_iter_)
        # print("LR: iterations: %d" % iterations)

        # reduce OHE features to original names
        ohe_features_short = []
        if self._use_ohe_encoding and any(categorical_features.values):
            if self._use_ohe_encoding:
                input_features = [x + self._ohe_postfix for x in cat_X.columns]
                ohe_features = pd.Series(
                    model.named_steps['columntransformer'].
                    named_transformers_['onehotencoder'].get_feature_names(
                        input_features=input_features))

                def f(x):
                    return '_'.join(x.split(self._ohe_postfix + '_')[:-1])

                # identify OHE features
                ohe_features_short = ohe_features.apply(lambda x: f(x))
                full_features_list.extend(list(ohe_features_short))

        # aggregate our own features
        if self._kaggle_features:
            self.features.aggregate(full_features_list, importances)

        msg = "LR: num=%d cat=%d : ohe=%d : imp=%d full=%d" % (
            len(num_X.columns), len(cat_X.columns), len(ohe_features_short),
            len(importances), len(full_features_list))
        if self._debug:
            print(msg)
        assert len(importances) == len(full_features_list), msg

        # aggregate importances by dai feature name
        importances = pd.Series(
            np.abs(importances),
            index=full_features_list).groupby(level=0).mean()
        assert len(importances) == len(
            X_orig_cols_names), "%d %d %s : %s %s" % (
                len(importances), len(X_orig_cols_names), msg,
                str(list(X.columns)), str(list(X.dtypes)))

        # save hyper parameter searched results for next search
        self.params['max_iter'] = iterations
        if self.params['cv_search']:
            self.params['C'] = np.average(lr_model.C_, axis=0)
        if 'l1_ratios' in lr_params and self.params['cv_search']:
            self.params['l1_ratio'] = np.average(lr_model.l1_ratio_, axis=0)
        if 'fit_count' in self.params:
            self.params['fit_count'] += 1
        else:
            self.params['fit_count'] = 0

        self.set_model_properties(model=(model, self.features),
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=iterations)
        self.features = None
def main():
    train_pitch = pd.read_csv(TRAIN_PITCH_PATH)
    train_player = pd.read_csv(TRAIN_PLAYER_PATH)
    test_pitch = pd.read_csv(TEST_PITCH_PATH)
    test_player = pd.read_csv(TEST_PLAYER_PATH)

    train_pitch["use"] = "train"
    test_pitch["use"] = "test"
    test_pitch["投球位置区域"] = 0
    pitch_data = pd.concat([train_pitch, test_pitch],
                           axis=0).drop(PITCH_REMOVAL_COLUMNS, axis=1)

    player_data = pd.concat([train_player, test_player],
                            axis=0).drop(PLAYER_REMOVAL_COLUMNS,
                                         axis=1)  #.fillna(0)
    pitchers_data = train_player[train_player["位置"] == "投手"].drop(
        PLAYER_REMOVAL_COLUMNS, axis=1)

    merged = pd.merge(
        pitch_data,
        player_data,
        how="left",
        left_on=['年度', '投手ID'],
        right_on=['年度', '選手ID'],
    ).drop(['選手ID', '球種'], axis=1).fillna(0)
    merged = merged.rename(columns={"選手名": "投手名", "チーム名": "投手チーム名"})

    use = merged.loc[:, "use"]
    label = merged.loc[:, "投球位置区域"]
    merged = merged.drop(["use", "投球位置区域", "位置", "年度", "投手名"], axis=1)

    # Encode the categorical variables with category_encoders
    categorical_columns = [
        c for c in merged.columns if merged[c].dtype == 'object'
    ]
    ce_oe = ce.OrdinalEncoder(cols=categorical_columns,
                              handle_unknown='impute')
    encorded_data = ce_oe.fit_transform(merged)
    encorded_data = cf.standardize(encorded_data)

    encorded_data = pd.concat([encorded_data, use, label], axis=1)

    train = encorded_data[encorded_data["use"] == "train"].drop(
        "use", axis=1).reset_index(drop=True)
    test = encorded_data[encorded_data["use"] == "test"].drop(
        "use", axis=1).reset_index(drop=True)

    train_x = train.drop("投球位置区域", axis=1)
    train_y = train.loc[:, "投球位置区域"].astype(int)
    test_x = test.drop("投球位置区域", axis=1).reset_index(drop=True)

    # f = partial(objective, train_x, train_y)  # fix the arguments of the objective function
    # study = optuna.create_study(direction='maximize')  # use Optuna to optimize the number of features to keep

    # study.optimize(f, n_trials=10)  # set the number of trials
    # print('params:', study.best_params)  # print the parameters that were found
    # best_feature_count = study.best_params['n_components']
    # train_x_pca, test_x_pca = get_important_features(train_x, test_x, best_feature_count)

    num_class = 13
    # best_params = get_best_params(train_x_pca, train_y, num_class)  # search for the best hyperparameters

    n_splits = 5
    for depth, num in zip(DEPTH_NUMS, range(13, 18)):
        submission = np.zeros((len(test_x), num_class))
        print("################################")
        print(f"start {depth} depth !!")
        print("################################")
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
        for i, (tr_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
            tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
            tr_y = train_y.iloc[tr_idx].reset_index(drop=True)

            model = get_rf_model(tr_x, tr_y, depth)
            y_preda = model.predict_proba(test_x)
            submission += y_preda

        submission_df = pd.DataFrame(submission) / n_splits
        submission_df.to_csv(f"{DATA_DIR}/submission_pitching_course{num}.csv",
                             header=False)
        print("#################################")
        print(submission_df)
        print("#################################")
def read_data(input_data_dir='../../data/', output_dir='./'):
    train_data = pd.read_csv(f'{input_data_dir}/sales_train_evaluation.csv')
    sell_prices = pd.read_csv(f'{input_data_dir}/sell_prices.csv')
    calendar = pd.read_csv(f'{input_data_dir}/calendar.csv')

    # ---- process calendar features ---- #
    print('* Processing calendar features')

    calendar.date = pd.to_datetime(calendar.date)
    calendar['relative_year'] = 2016 - calendar.year

    # convert month, day and weekday to cyclic encodings
    calendar['month_sin'] = np.sin(2 * np.pi * calendar.month / 12.0)
    calendar['month_cos'] = np.cos(2 * np.pi * calendar.month / 12.0)
    calendar['day_sin'] = np.sin(2 * np.pi * calendar.date.dt.day /
                                 calendar.date.dt.days_in_month)
    calendar['day_cos'] = np.cos(2 * np.pi * calendar.date.dt.day /
                                 calendar.date.dt.days_in_month)
    calendar['weekday_sin'] = np.sin(2 * np.pi * calendar.wday / 7.0)
    calendar['weekday_cos'] = np.cos(2 * np.pi * calendar.wday / 7.0)

    # use same encoded labels for both the event name columns
    cal_label = ['event_name_1', 'event_name_2']
    cal_label_encoded_cols = ['event_name_1_enc', 'event_name_2_enc']
    calendar[cal_label_encoded_cols] = calendar[cal_label]
    cal_label_encoder = ce.OrdinalEncoder(cols=cal_label_encoded_cols)
    cal_label_encoder.fit(calendar)
    cal_label_encoder.mapping[1]['mapping'] = cal_label_encoder.mapping[0][
        'mapping']
    calendar = cal_label_encoder.transform(calendar)

    # subtract one from label encoded as pytorch uses 0-indexing
    for col in cal_label_encoded_cols:
        calendar[col] = calendar[col] - 1

    calendar_df = calendar[[
        'wm_yr_wk', 'd', 'snap_CA', 'snap_TX', 'snap_WI', 'relative_year',
        'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
        'weekday_cos'
    ] + cal_label_encoded_cols]

    # ---- Merge all dfs, keep calender_df features separate and just concat them for each batch ---- #
    train_data.id = train_data.id.str[:-11]
    sell_prices['id'] = sell_prices['item_id'] + '_' + sell_prices['store_id']

    # add empty columns for future data
    train_data = pd.concat([
        train_data,
        pd.DataFrame(columns=['d_' + str(i) for i in range(1942, 1970)])
    ])

    # Encode categorical features using either one-hot or label encoding (for embeddings)
    print('* Encoding categorical features')
    label = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    label_encoded_cols = [str(i) + '_enc' for i in label]

    train_data[label_encoded_cols] = train_data[label]
    label_encoder = ce.OrdinalEncoder(cols=[str(i) + '_enc' for i in label])
    label_encoder.fit(train_data)
    train_data = label_encoder.transform(train_data)

    # subtract one from label encoded as pytorch uses 0-indexing
    for col in label_encoded_cols:
        train_data[col] = train_data[col] - 1

    # Reshape, change dtypes and add previous day sales
    print('* Add previous day sales and merge sell prices')
    data_df = pd.melt(train_data,
                      id_vars=[
                          'id', 'item_id', 'dept_id', 'cat_id', 'store_id',
                          'state_id', 'item_id_enc', 'dept_id_enc',
                          'cat_id_enc', 'store_id_enc', 'state_id_enc'
                      ],
                      var_name='d',
                      value_vars=['d_' + str(i) for i in range(1, 1970)],
                      value_name='sales')

    # change dtypes to reduce memory usage
    data_df[['sales']] = data_df[['sales']].fillna(-2).astype(
        np.int16)  # fill future sales as -2
    calendar_df[['snap_CA', 'snap_TX', 'snap_WI',
                 'relative_year']] = calendar_df[[
                     'snap_CA', 'snap_TX', 'snap_WI', 'relative_year'
                 ]].astype(np.int8)
    calendar_df[cal_label_encoded_cols] = calendar_df[
        cal_label_encoded_cols].astype(np.int16)

    data_df[label_encoded_cols] = data_df[label_encoded_cols].astype(np.int16)

    # merge sell prices
    data_df = data_df.merge(right=calendar_df[['d', 'wm_yr_wk']],
                            on=['d'],
                            how='left')
    data_df = data_df.merge(right=sell_prices[['id', 'wm_yr_wk',
                                               'sell_price']],
                            on=['id', 'wm_yr_wk'],
                            how='left')

    data_df.sell_price = data_df.sell_price.fillna(0.0)
    data_df['prev_day_sales'] = data_df.groupby(['id'])['sales'].shift(1)

    # remove data for d_1
    data_df.dropna(axis=0, inplace=True)
    calendar_df = calendar_df[calendar_df.d != 'd_1']

    # change dtypes
    data_df[['prev_day_sales']] = data_df[['prev_day_sales']].astype(np.int16)

    # ---- Add previous day totals of aggregated series as features ---- #
    # print('* Add previous day totals of aggregated series as features')
    # # total
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d'])[['prev_day_sales']].sum().astype(
    #                             np.int32).add_suffix('_all').reset_index(),
    #                         on=['d'], how='left')
    # # category level
    # data_df = data_df.merge(right=data_df.groupby(['d', 'cat_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='cat_id', values='prev_day_sales').add_prefix('prev_d_cat_'),
    #                         on=['d'], how='left')
    # # state level
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d', 'state_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='state_id', values='prev_day_sales').add_prefix('prev_d_state_'),
    #                         on=['d'], how='left')
    # # store level
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d', 'store_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='store_id', values='prev_day_sales').add_prefix('prev_d_store_'),
    #                         on=['d'], how='left')
    # # department level
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d', 'dept_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='dept_id', values='prev_day_sales').add_prefix('prev_d_dept_'),
    #                         on=['d'], how='left')

    # remove category columns
    del data_df['wm_yr_wk']
    del data_df['item_id']
    del data_df['dept_id']
    del data_df['cat_id']
    del data_df['store_id']
    del data_df['state_id']

    num_samples = data_df.id.nunique()
    num_timesteps = data_df.d.nunique()
    data_df = data_df.set_index(['id', 'd'])

    ids = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    enc_dec_feats = ['sell_price'] + label_encoded_cols
    enc_only_feats = data_df.columns.difference(
        ['sales', 'sell_price', 'prev_day_sales'] + enc_dec_feats)

    sales_data_ids = train_data[ids].values
    Y = data_df.sales.values.reshape(num_timesteps, num_samples).T
    X_enc_only_feats = np.array(data_df[enc_only_feats]).reshape(
        num_timesteps, num_samples, -1)
    X_enc_dec_feats = np.array(data_df[enc_dec_feats]).reshape(
        num_timesteps, num_samples, -1)
    X_prev_day_sales = data_df.prev_day_sales.values.reshape(
        num_timesteps, num_samples)
    calendar_index = calendar_df.d
    X_calendar = np.array(calendar_df.iloc[:, 2:])
    X_calendar_cols = list(calendar_df.columns[2:])

    # # for prev_day_sales and sales (y), set value as -1 for the period the product was not actively sold
    # for idx, first_non_zero_idx in enumerate((X_prev_day_sales != 0).argmax(axis=0)):
    #     X_prev_day_sales[:first_non_zero_idx, idx] = -1
    # for idx, first_non_zero_idx in enumerate((Y != 0).argmax(axis=1)):
    #     Y[idx, :first_non_zero_idx] = -1

    # ---- Save processed data ---- #
    print('* Save processed data')
    data_dict = {
        'sales_data_ids': sales_data_ids,
        'calendar_index': calendar_index,
        'X_prev_day_sales': X_prev_day_sales,
        'X_enc_only_feats': X_enc_only_feats,
        'X_enc_dec_feats': X_enc_dec_feats,
        'enc_dec_feat_names': enc_dec_feats,
        'enc_only_feat_names': enc_only_feats,
        'X_calendar': X_calendar,
        'X_calendar_cols': X_calendar_cols,
        'Y': Y,
        'cal_label_encoder': cal_label_encoder,
        'label_encoder': label_encoder
    }

    # pickle data
    with open(f'{output_dir}/data.pickle', 'wb') as f:
        pkl.dump(data_dict, f, protocol=pkl.HIGHEST_PROTOCOL)
Example #7
# n_estimators = 1000, min_samples_leaf = 2

pipeline = Pipeline([('StandardScaler', _ss), ('PCA', _pca),
                     ('RandomForestClassifier', _rfc)])

searchCV = RandomizedSearchCV(pipeline,
                              param_distributions=params,
                              n_iter=5,
                              cv=3,
                              scoring='accuracy',
                              verbose=10,
                              return_train_score=True,
                              n_jobs=-1)

target_encoder = ce.OrdinalEncoder()
train_target_encoded = target_encoder.fit_transform(train_target)
train_target_encoded

searchCV.fit(train_features, train_target_encoded)

#%%
print('Cross-validation accuracy', searchCV.best_score_)
print('Best hyperparameters', searchCV.best_params_)

#%%
out = test_features[['id']].copy()

#%%
train_features.shape
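A hedged follow-up for producing the submission: predict with the tuned pipeline and map the encoded class numbers back to the original labels through the fitted OrdinalEncoder. This assumes test_features carries the same feature columns used for training, and 'prediction' is a column name chosen here for illustration.

import pandas as pd

pred_encoded = searchCV.predict(test_features)
# inverse_transform expects the same column layout the encoder was fitted on
pred_labels = target_encoder.inverse_transform(
    pd.DataFrame(pred_encoded, columns=train_target_encoded.columns))
out['prediction'] = pred_labels.iloc[:, 0].values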
Example #8
 def fit_(self, df: pd.DataFrame, columns: list, target: str):
     self.encoder = ce.OrdinalEncoder(
         cols=columns, handle_unknown="value", handle_missing="value"
     )
     self.encoder.fit(df.loc[:, columns])
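The transform side is not shown in this excerpt; a minimal counterpart, assuming the class keeps the encoder fitted above in self.encoder and receives the same columns list, might be:

 def transform_(self, df: pd.DataFrame, columns: list) -> pd.DataFrame:
     # Sketch (assumption): apply the fitted encoder and return a copy of the
     # frame with the listed columns replaced by their ordinal codes.
     out = df.copy()
     out.loc[:, columns] = self.encoder.transform(df.loc[:, columns])
     return out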
Example #9
df['checkin_day'] = checkin.dt.day
df['checkin_month'] = checkin.dt.month
df['checkin_year'] = checkin.dt.year

checkout = pd.to_datetime(df['booking_check_out'])
df['checkout_day'] = checkout.dt.day
df['checkout_month'] = checkout.dt.month
df['checkout_year'] = checkout.dt.year

dates = df[[
    'checkin_day', 'checkin_month', 'checkin_year', 'checkout_day',
    'checkout_month', 'checkout_year'
]]
#checkout = df['booking_check_out'].dt.date
# encode IDs in ordinal
ids = ce.OrdinalEncoder(
    cols=['listing_id', 'unit_id', 'property_id', 'area_id'])
ids = ids.fit_transform(df)
ids = ids[['listing_id', 'unit_id', 'property_id', 'area_id']]

# encode property
pType = pd.get_dummies(df.property_type, prefix="type")
pDesign = pd.get_dummies(df.property_design, prefix='design')

#encode earnings
earnings = df['usd']

# concatenate all dfs
cproperty = pd.concat([pType, pDesign], axis='columns')

dummies = pd.concat([dates, ids, cproperty, earnings], axis='columns')
#print(dummies)
Example #10
    ]
]
doPermutationTests(X, y, features, 'sum')

encoder = ce.LeaveOneOutEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'leaveoneout'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'leaveoneout')

encoder = ce.TargetEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'target'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'target')

encoder = ce.OrdinalEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'ordinal'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'ordinal')

encoder = ce.WOEEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'woe'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'woe')

df = pd.DataFrame(results, columns=['encoding', 'knn', 'rfc', 'gnb'])
df.to_csv('./acc/p10_cv50.csv')
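doAccuracyTests and doPermutationTests are defined outside this excerpt. Judging from the result columns ('encoding', 'knn', 'rfc', 'gnb'), the accuracy helper plausibly cross-validates three classifiers per encoding; the following is only a guess at its shape, not the original code.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


def doAccuracyTests(X, y, encoding_name, cv=5):
    # Hypothetical sketch: mean cross-validated accuracy for three classifiers
    row = [encoding_name]
    for clf in (KNeighborsClassifier(),
                RandomForestClassifier(n_estimators=100),
                GaussianNB()):
        row.append(cross_val_score(clf, X, np.ravel(y), cv=cv,
                                   scoring='accuracy').mean())
    return row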
Example #11
 def _fit_ordinal(self, df, target):
     ordinal_encoder = ce.OrdinalEncoder()
     ordinal_encoder.fit(df[target].map(to_str))
     name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_ordinal' for x in
             ordinal_encoder.get_feature_names()]
     self.trans_ls.append(('ordinal', name, target, ordinal_encoder))
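Only the fit half of the ordinal branch appears here; a matching transform step, assuming trans_ls stores (kind, names, target, encoder) tuples exactly as above and to_str is the same string-casting helper, could look like:

 def _transform_ordinal(self, df, target):
     # Sketch (assumption): find the encoder stored by _fit_ordinal and return
     # the encoded column(s) under the generated feature names.
     for kind, name, col, encoder in self.trans_ls:
         if kind == 'ordinal' and col == target:
             encoded = encoder.transform(df[col].map(to_str))
             encoded.columns = name
             return encoded
     raise KeyError('no ordinal encoder fitted for %s' % target)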
# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy","Recall Score","F1 Score","Precision Score"]
log = pd.DataFrame(columns=log_cols)

def map_for_emb(emb):
    return {'C':1,'S':2,'Q':3}.get(emb,10)
data=pd.read_csv('/data.csv')
data.head()
labelencoder = LabelEncoder()
data['Gender_cat'] = labelencoder.fit_transform(data['Gender'])
data=data.drop(['Gender'],axis=1)
data.rename(columns={'Gender_cat':'Gender'},inplace=True)
data.head()
y=data['Survived'].values
data=data.drop(['PassengerId','Survived'],axis=1)
enc=ce.OrdinalEncoder(cols=['Embarked'],return_df=True)
data=enc.fit_transform(data)
X=data.values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=0)

print(data['PClass'].unique())
print(data['Sibling'].unique())
print(data['Embarked'].unique())
print(data['Gender'].unique())

X

y

accuracies=[]
models=[]
Example #13
def preprocessing(df):
    """
    Preprocesses the data.

    Input: DataFrame

    Output: X_train_df, X_test_df, X_train, X_test, y_train, y_test
    """
    # Copying DF
    dfx = df.copy()

    ## EDA
    # Dropping Columns
    dfx.drop(columns=["host_name", "last_review", "reviews_per_month"],
             inplace=True)

    # Removing -- Custom Outliers
    dfx = dfx[(dfx["price"] > 0) & (dfx["price"] < 10000)]

    # New Column -- 'log_price'
    dfx["log_price"] = np.log(dfx["price"].values)

    # Target and Features
    target = "log_price"
    features = [
        "neighbourhood_group", "neighbourhood", "latitude", "longitude",
        "room_type", "minimum_nights", "number_of_reviews",
        "calculated_host_listings_count", "availability_365"
    ]

    # X Features Matrix
    X = dfx[features]

    # y target vector
    y = dfx[target]

    # Mapping - 'room_type'
    room_type_dict = {
        "Shared room": 1,
        "Private room": 2,
        "Entire home/apt": 3
    }
    X.iloc[:, 4] = X.iloc[:, 4].map(room_type_dict)
    # print(X["room_type"])

    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.2,
                                                        random_state=42)

    # Preprocess Pipeline -- OrdinalEncoder and StandardScaler
    preprocess = make_pipeline(ce.OrdinalEncoder(), StandardScaler())

    # Fit Transform and Transform Training and Testing Data
    X_train = preprocess.fit_transform(X_train)
    X_test = preprocess.transform(X_test)

    # Create DataFrame for X Matrices
    X_train_df = pd.DataFrame(X_train, columns=features)
    X_test_df = pd.DataFrame(X_test, columns=features)
    print(X_train_df.shape, X_test_df.shape, X_train.shape, X_test.shape,
          y_train.shape, y_test.shape)

    # Return: X_train_df, X_test_df, X_train, X_test, y_train, y_test
    return X_train_df, X_test_df, X_train, X_test, y_train, y_test
Example #14
    y_trainval,
    test_size=0.05,
    train_size=0.10,
    stratify=y_trainval,
    random_state=42,
)

train_id = X_train["id"]
val_id = X_val["id"]
test_id = X_test["id"]

X_train = X_train.drop("id", axis=1)
X_val = X_val.drop("id", axis=1)
X_test = X_test.drop("id", axis=1)

x_processor = make_pipeline(ce.OrdinalEncoder(),
                            SimpleImputer(strategy="median"))
y_processor = make_pipeline(ce.OrdinalEncoder(),
                            SimpleImputer(strategy="median"))

cols = X_train.columns
len(cols)


def prepare_inputs(X_train, X_val, X_test):
    X_train_enc = pd.DataFrame(x_processor.fit_transform(X_train),
                               columns=cols)
    X_val_enc = pd.DataFrame(x_processor.transform(X_val), columns=cols)
    X_test_enc = pd.DataFrame(x_processor.transform(X_test), columns=cols)
    return X_train_enc, X_val_enc, X_test_enc
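y_processor is built above but never applied in this excerpt; a counterpart to prepare_inputs for the targets, assuming y_train, y_val and y_test come from the same split, might be:

import numpy as np
import pandas as pd


def prepare_targets(y_train, y_val, y_test):
    # Sketch (assumption): encode/impute the targets with y_processor and
    # return them as flat Series so they drop straight into model.fit().
    y_train_enc = pd.Series(np.ravel(y_processor.fit_transform(pd.DataFrame(y_train))))
    y_val_enc = pd.Series(np.ravel(y_processor.transform(pd.DataFrame(y_val))))
    y_test_enc = pd.Series(np.ravel(y_processor.transform(pd.DataFrame(y_test))))
    return y_train_enc, y_val_enc, y_test_enc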
Example #15
train['K'] = train['FloorPlan'].map(lambda x: 1 if 'K' in str(x) else 0)
train['S'] = train['FloorPlan'].map(lambda x: 1 if 'S' in str(x) else 0)
train['R'] = train['FloorPlan'].map(lambda x: 1 if 'R' in str(x) else 0)
train['Maisonette'] = train['FloorPlan'].map(lambda x: 1
                                             if 'メゾネット' in str(x) else 0)
train['OpenFloor'] = train['FloorPlan'].map(lambda x: 1
                                            if 'オープンフロア' in str(x) else 0)
train['Studio'] = train['FloorPlan'].map(lambda x: 1
                                         if 'スタジオ' in str(x) else 0)

Label_Enc_list = [
    'Type', 'NearestStation', 'FloorPlan', 'CityPlanning', 'Structure',
    'Direction', 'Classification', 'Municipality', 'Region', 'Remarks',
    'Renovation'
]
ce_oe = ce.OrdinalEncoder(cols=Label_Enc_list, handle_unknown='impute')
# Convert the string categories to ordinal integers
train = ce_oe.fit_transform(train)
# Shift the values from 1-based to 0-based
for i in Label_Enc_list:
    train[i] = train[i] - 1
# Convert to int
for i in Label_Enc_list:
    train[i] = train[i].astype("int")

#------------------------
# Build and train the prediction model
#------------------------
# Assign the target and explanatory variables
X = train[[
    'TimeToNearestStation', 'FloorAreaRatio', 'CityPlanning', 'BuildingAD',
 def LabelEncoding(self, data, column):
     encoder = ce.OrdinalEncoder(cols=[column], return_df=True)
     return encoder.fit_transform(data)
Example #17
 def test_numbers_as_strings_with_numpy_output(self):
     # see issue #229
     X = np.array(['11', '12', '13', '14', '15'])
     oe = encoders.OrdinalEncoder(return_df=False)
     oe.fit(X)
Example #18
importances1 = pd.Series(rf.feature_importances_, encoded.columns)
# Plot feature importances
n = 20
plt.figure(figsize=(10,n/2))
plt.title(f'Top {n} features')
importances1.sort_values()[-n:].plot.barh(color='grey');

# Commented out IPython magic to ensure Python compatibility.
# Generate validation curves
# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(),
    DecisionTreeClassifier()
)

depth = range(1, 10, 2)
train_scores, val_scores = validation_curve(
    pipeline, X_train, y_train,
    param_name='decisiontreeclassifier__max_depth',
    param_range=depth, scoring='accuracy',
    cv=3,
    n_jobs=-1
)
    
plt.figure(dpi=150)
plt.plot(depth, np.mean(train_scores, axis=1), color='blue', label='training error')
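The excerpt stops after the training curve; the validation half of the plot, using the val_scores already computed above, would be along these lines:

plt.plot(depth, np.mean(val_scores, axis=1), color='red', label='validation error')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()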
Example #19
features = dataset.drop(dataset.columns[-1], axis=1)
target = dataset.iloc[:, -1]

import warnings
warnings.filterwarnings("ignore")
"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from cesamo import CESAMOEncoder
from entity_embedding import EntityEmbeddingEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(),
    'AgingPP': AgingPPEncoder(),
    'SimplePP': SimplePPEncoder(),
    'CESAMOEncoder': CESAMOEncoder()
}
"""END: Import encoders"""
"""START: Import models"""
Example #20
data_set = pd.read_csv("data.csv", sep=",", header=0)

#data_set.head(3)

# In[3]:

# Turn 'time' into one-hot labels

import category_encoders as ce

# Specify the columns to encode as a list. Multiple columns can be given.
list_cols = ['time']

# Specify the columns to one-hot encode, plus how to impute nulls/unknown values.
#ce_ohe = ce.OneHotEncoder(cols=list_cols,handle_unknown='impute')
ce_oe = ce.OrdinalEncoder(cols=list_cols, handle_unknown='impute')

# Pass the pd.DataFrame in as-is
df_session_ce_ordinal = ce_oe.fit_transform(data_set)

#df_session_ce_ordinal.head(350)

# In[4]:

#print(df_session_ce_ordinal.columns.values)

# In[28]:

# Split the data
(train, test) = train_test_split(df_session_ce_ordinal,
                                 test_size=0.2,
Example #21
df['date_recorded'] = pandas.to_datetime(df['date_recorded']).dt.year
df['date_recorded'] = df['date_recorded'].astype('int32')
df['construction_year'] = df['construction_year'].astype('int32')
df['construction_year'] = df['construction_year'].replace(0, np.nan)
df = df.dropna(subset=['construction_year'])
df['date_recorded'] = df['date_recorded'] - df['construction_year']

# drop redundant features
df.drop(df.columns[[
    0, 8, 9, 11, 12, 13, 14, 15, 16, 19, 21, 23, 25, 26, 28, 30, 34, 36, 37, 39
]],
        axis=1,
        inplace=True)

# transform categorical variables to numeric
encoder = ce.OrdinalEncoder(cols=['status_group'])
df = encoder.fit_transform(df)
df = df.apply(pandas.to_numeric, errors='ignore')
encoder = ce.BinaryEncoder()
df = encoder.fit_transform(df)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

# Store contents of df into an array
array = df.values
X = array[:, 0:67]
Y = array[:, 68]

# run Ada Boost algorithm
seed = 7
k = 10
 'OneHot': ce.OneHotEncoder(cols=ordinal_features),
 'Ordinal': ce.OrdinalEncoder(mapping=[
     {'col': 'ExterQual',
      'mapping': ordinal_mapping_1},
     {'col': 'ExterCond',
      'mapping': ordinal_mapping_1},
     {'col': 'BsmtQual',
      'mapping': ordinal_mapping_1},
     {'col': 'BsmtCond',
      'mapping': ordinal_mapping_1},
     {'col': 'BsmtExposure',
      'mapping': ordinal_mapping_2},
     {'col': 'BsmtFinType1',
      'mapping': ordinal_mapping_3},
     {'col': 'BsmtFinType2',
      'mapping': ordinal_mapping_3},
     {'col': 'HeatingQC',
      'mapping': ordinal_mapping_1},
     {'col': 'KitchenQual',
      'mapping': ordinal_mapping_1},
     {'col': 'FireplaceQu',
      'mapping': ordinal_mapping_1},
     {'col': 'GarageQual',
      'mapping': ordinal_mapping_1},
     {'col': 'GarageCond',
      'mapping': ordinal_mapping_1},
     {'col': 'PoolQC',
      'mapping': ordinal_mapping_1},
     {'col': 'Fence',
      'mapping': ordinal_mapping_4}],
     cols=ordinal_features),
 'Binary Ordinal': ce.OrdinalEncoder(mapping=[
def ordinal_encode(data):
    encoding_data = data.copy()
    encoder = ce.OrdinalEncoder()
    data_encoded = encoder.fit_transform(encoding_data)
    return (data_encoded)
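The ordinal_mapping_1 through ordinal_mapping_4 dictionaries passed to ce.OrdinalEncoder above are not part of this excerpt. For quality-style columns such as ExterQual they are typically rank lookups; the values below are purely illustrative, not the originals:

# Hypothetical rank mapping for quality ratings (illustration only)
ordinal_mapping_1 = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}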
Example #24
ImputedX=imputer.fit_transform(X)

# Convert output to a data frame to show the stats
imputed_df = pd.DataFrame.from_records(ImputedX)
imputed_df.columns = features
imputed_df['Country'] = swine_data['Country']
imputed_df['Cases'] = swine_data['Cases']
imputed_df['Update Time'] = swine_data['Update Time']
# print('---------------------------------------')
missing_0_values_count = imputed_df.isnull().sum()
# print(missing_0_values_count)


# Categorical Encoders
import category_encoders as ce
enc = ce.OrdinalEncoder(cols=["Country","Update Time"],handle_missing='return_nan',return_df= True)

#We now fit the model and transform the data and put it in X which is a dataframe
X=enc.fit_transform(imputed_df)


# Outlier Detection
from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(X)
totalOutliers=0
for pred in y_pred:
    if pred == -1:
        totalOutliers=totalOutliers+1
print ("Number of predicted outliers:",totalOutliers)
Example #25
    'vote.arff', 'vowel.arff'
]

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [  #category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.GaussEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.LogOddsRatioEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
Example #26
def run_a_experiments():
    print("Loading Data")
    df = load_data()
    #columns:
    continuous = [
        'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
        'hours-per-week'
    ]
    categorical = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country'
    ]

    X = df[continuous + categorical]
    y = df[['class']]

    successes = y.sum()[0]
    alpha_prior = float(successes / len(y))

    models = [
        LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators=100),
        GradientBoostingClassifier(),
        MLPClassifier()
    ]
    #models = [RandomForestClassifier()]

    results = [[
        'model', 'Encoder', 'Accuracy', 'STD', 'Training Time', 'Sparsity',
        'Dimensions'
    ]]

    for model in models:
        print("")
        print("----------------------")
        print("Testing Algorithm: ")
        print(type(model))
        print("----------------------")

        #TargetEncoder
        print("TargetEncoder Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.TargetEncoder(return_df=False))
        results.append([
            type(model), 'TargetEncoder', acc, std, time, sparsity, dimensions
        ])

        #OrdinalEncoder
        print("OrdinalEncoder Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.OrdinalEncoder(return_df=False))
        results.append([
            type(model), 'OrdinalEncoder', acc, std, time, sparsity, dimensions
        ])

        #BinaryEncoder
        print("BinaryEncoder Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.BinaryEncoder(return_df=False))
        results.append([
            type(model), 'BinaryEncoder', acc, std, time, sparsity, dimensions
        ])

        #HashingEncoder
        print("HashingEncoder Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=ce.HashingEncoder(return_df=False))
        results.append([
            type(model), 'HashingEncoder', acc, std, time, sparsity, dimensions
        ])

        #OneHotEncoder
        print("OneHotEncoder Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=OneHotEncoder(handle_unknown='ignore', sparse=False))
        results.append([
            type(model), 'OneHotEncoder', acc, std, time, sparsity, dimensions
        ])

        #BetaEncoder (mean)
        print("Beta Encoder (mean) Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=BetaEncoder(alpha=alpha_prior, beta=1 - alpha_prior))
        results.append([
            type(model), 'BetaEncoder (m)', acc, std, time, sparsity,
            dimensions
        ])

        #BetaEncoder (mean, variance)
        print("Beta Encoder (mean and variance Results:")
        acc, std, time, sparsity, dimensions = cv_binary_classification(
            model,
            X,
            y,
            continuous,
            categorical,
            encoder=BetaEncoder(alpha=alpha_prior, beta=1 - alpha_prior),
            moments='mv')
        results.append([
            type(model), 'BetaEncoder (mv)', acc, std, time, sparsity,
            dimensions
        ])

    file = 'adult_experiments.csv'
    with open(file, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(results)
    try:
        upload_file(file)
    except Exception:
        print("File Not Uploaded")
# In[21]:

target_encoder.fit(df, y=df.loan_status)

# In[22]:

encoded_df = target_encoder.transform(df)

# In[23]:

encoded_df.head()

# In[24]:

ordinal_encoder = ce.OrdinalEncoder(cols=['term'])
ordinal_encoder.fit(encoded_df)
encoded_df = ordinal_encoder.transform(encoded_df)

# In[25]:

encoded_df.shape

# In[26]:

encoded_df.head()

# In[27]:

encoded_df.to_csv('../Processed Data/df_processed_categorical_v3.csv',
                  index=False)
Example #28
    y = df['dep_delayed_15min']

    return X, y

X, y = split_df(train)

print('Train Partition')
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2)

print('Building pipeline')
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', ce.OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['Distance']),
        ('cat', categorical_transformer, ['UniqueCarrier', 'Origin', 'Dest', 'Day_of_Week','year', 'month', 'day', 'hour', 'minutes'])])

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

param_grid = {'classifier__n_estimators': [400]}

print('Running Model')
CV = GridSearchCV(rf, param_grid, n_jobs= -1,scoring='roc_auc')
CV.fit(X_train, y_train)  
#print(CV.get_params())
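With the grid search fitted, a sensible next step is to score the tuned pipeline on the held-out validation partition; a short sketch with standard scikit-learn calls (it assumes y_validation holds binary 0/1 labels, otherwise they would need mapping first):

from sklearn.metrics import roc_auc_score

val_proba = CV.predict_proba(X_validation)[:, 1]
print('Validation ROC AUC:', roc_auc_score(y_validation, val_proba))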
    def test_ordinal(self):
        """

        :return:
        """

        cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)
        X_t_extra = self.create_dataset(n_rows=100, extras=True)

        enc = encoders.OrdinalEncoder(verbose=1, cols=cols)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.OrdinalEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.OrdinalEncoder(verbose=1, drop_invariant=True)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.OrdinalEncoder(verbose=1, return_df=False)
        enc.fit(X, None)
        self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))

        enc = encoders.OrdinalEncoder(verbose=1,
                                      return_df=True,
                                      impute_missing=True,
                                      handle_unknown='impute')
        enc.fit(X, None)
        out = enc.transform(X_t_extra)
        self.assertEqual(len(set(out['D'].values)), 4)
        self.assertIn(0, set(out['D'].values))
        self.assertFalse(enc.mapping is None)
        self.assertTrue(len(enc.mapping) > 0)

        enc = encoders.OrdinalEncoder(verbose=1,
                                      mapping=enc.mapping,
                                      return_df=True,
                                      impute_missing=True,
                                      handle_unknown='impute')
        enc.fit(X, None)
        out = enc.transform(X_t_extra)
        self.assertEqual(len(set(out['D'].values)), 4)
        self.assertIn(0, set(out['D'].values))
        self.assertTrue(len(enc.mapping) > 0)

        enc = encoders.OrdinalEncoder(verbose=1,
                                      return_df=True,
                                      impute_missing=True,
                                      handle_unknown='ignore')
        enc.fit(X, None)
        out = enc.transform(X_t_extra)
        out_cats = [x for x in set(out['D'].values) if np.isfinite(x)]
        self.assertEqual(len(out_cats), 3)
        self.assertFalse(enc.mapping is None)

        enc = encoders.OrdinalEncoder(verbose=1,
                                      return_df=True,
                                      handle_unknown='error')
        enc.fit(X, None)
        with self.assertRaises(ValueError):
            out = enc.transform(X_t_extra)

        # test inverse_transform
        X = self.create_dataset(n_rows=1000, has_none=False)
        X_t = self.create_dataset(n_rows=100, has_none=False)
        X_t_extra = self.create_dataset(n_rows=100,
                                        extras=True,
                                        has_none=False)

        enc = encoders.OrdinalEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))
        self.verify_inverse_transform(
            X_t, enc.inverse_transform(enc.transform(X_t)))
        with self.assertRaises(ValueError):
            out = enc.inverse_transform(enc.transform(X_t_extra))
Example #30
happiness_sep_df = happiness_sep_df[happiness_sep_df['B.2.2'] != 'N.C.']
happiness_sep_df = happiness_sep_df[happiness_sep_df['B.2.2'] != 'N.S.']
happiness_sep_df = happiness_sep_df.dropna()

print(happiness_sep_df['B.2.1'].sort_values(ascending=True).unique())
print(happiness_sep_df['B.2.2'].sort_values(ascending=True).unique())

# # data munging :: encode ordinal features with category_encoders.OrdinalEncoder:
import category_encoders as ce
ordinals = pd.DataFrame({
    'situacion_actual': ['Muy Buena', 'Buena', 'Regular', 'Mala', 'Muy mala'],
    'valor': [2, 0, 1, 3, 4]
})
XB21 = ordinals.drop('valor', axis=1)
yB21 = ordinals.drop('situacion_actual', axis=1)
ce_ordB21 = ce.OrdinalEncoder(cols=['situacion_actual'])
ce_ordB21 = ce_ordB21.fit_transform(XB21, yB21['valor'])
print(ce_ordB21)

ceX21 = ce.OrdinalEncoder(cols=['B.2.1'])
ceX21 = ceX21.fit_transform(happiness_sep_df['B.2.1'], yB21['valor'])
happiness_sep_df[
    'B.2.1_valor'] = ceX21  # seems bigger numbers for worse qualitative data ... ?

ceX22, yB22 = ce.OrdinalEncoder(cols=['B.2.2']), yB21
ceX22 = ceX22.fit_transform(happiness_sep_df['B.2.2'], yB21['valor'])
happiness_sep_df[
    'B.2.2_valor'] = ceX22  # maintain same values (as it has to be)
print(happiness_sep_df)

# resulting dataframe: