コード例 #1
0
def categoryEncode(df, cols=None, mode="binary"):
    """Encode categorical columns of ``df`` with category_encoders.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame (not modified in place).
    cols : list of str, optional
        Columns to encode; ``None`` lets the encoder pick object columns.
    mode : {"binary", "ordinal"}
        Which encoder to apply.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected columns encoded.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported modes.
    """
    if mode == "ordinal":
        # Missing/unseen categories become NaN instead of raising.
        encoder = OrdinalEncoder(cols=cols, handle_missing="return_nan",
                                 handle_unknown="return_nan")
    elif mode == "binary":
        encoder = BinaryEncoder(cols=cols)
    else:
        # BUG FIX: the original fell through with `encoder` unbound,
        # producing a NameError for any other mode value.
        raise ValueError(f"unsupported mode: {mode!r}")
    return encoder.fit_transform(df)
コード例 #2
0
def encode_result(df_orig):
    """Return a deep copy of ``df_orig`` with an added 'ordinal_result'
    column encoding 'result' as hwin=1, draw=2, awin=3."""
    frame = df_orig.copy(deep=True)
    result_codes = {'hwin': 1, 'draw': 2, 'awin': 3}
    encoder = OrdinalEncoder(
        mapping=[{'col': 'result', 'mapping': result_codes}])
    frame['ordinal_result'] = encoder.fit_transform(frame[['result']])
    return frame
コード例 #3
0
    def _encode_categories(self):
        """
        Encode the categorical variables of ``self.X`` / ``self.X_test``.

        Ordinal-quality columns (ExterQual, BsmtQual, ...) are label-encoded
        with an explicit NA..Ex -> 0..5 mapping; every remaining object-dtype
        column is one-hot encoded. Numeric columns pass through unchanged and
        the three pieces are joined back together, replacing ``self.X`` and
        ``self.X_test`` in place.
        """

        # NOTE(review): this line calls `self._index()` while the closing log
        # below reads `self._step_index` — confirm both exist and agree.
        logging.info(f'#{self._index()} - Encoding categorical columns...')
        # get column names for categorical and numerical features
        categorical_vars = self.X.select_dtypes(include='object').columns
        numerical_vars = self.X.columns.difference(categorical_vars)

        # Quality/condition columns that have a natural order.
        ordinal = pd.Index([
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ])
        nominal = categorical_vars.difference(ordinal)

        # Shared rating scale: NA (absent) .. Ex (excellent) -> 0 .. 5.
        standard_mapping = {
            'NA': 0,
            'Po': 1,
            'Fa': 2,
            'TA': 3,
            'Gd': 4,
            'Ex': 5
        }
        # One mapping entry per ordinal column, all using the same scale.
        mapping_for_ordinals = [{
            'col': column,
            'mapping': standard_mapping
        } for column in ordinal]

        x_num = self.X[numerical_vars]
        x_test_num = self.X_test[numerical_vars]

        # one hot encode categorical columns
        one_hot_encoder = OneHotEncoder(use_cat_names=True)
        # handle_unknown='error' makes unseen test categories fail fast.
        label_encoder = OrdinalEncoder(drop_invariant=True,
                                       mapping=mapping_for_ordinals,
                                       handle_unknown='error')

        # Fit on the training frame only; reuse the fitted encoders for test.
        x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
        x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
        x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
        x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])

        self.X = x_num.join(x_cat_ord).join(x_cat_nom)
        self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
        logging.info(f'#{self._step_index} - DONE!')
コード例 #4
0
def out_of_folds_predict(X, y):
    """Produce an out-of-fold prediction for every row of ``X``.

    The data is split into 4 folds (stratified when the positive class has
    at least two members); for each fold a fresh pipeline is fitted on the
    remaining folds and predicts the held-out rows, so every sample gets a
    prediction from a model that never saw it.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix; categorical values are ordinal-encoded per fold.
    y : np.ndarray
        Target vector aligned with ``X``.

    Returns
    -------
    np.ndarray
        Flat array of out-of-fold predictions, one per row of ``X``.
    """
    callbacks = [
        EarlyStopping(
            # Stop training when loss is no longer improving
            monitor="loss",
            # "no longer improving" being defined as "no better than 1e-5 less"
            min_delta=1e-5,
            # "no longer improving" being further defined as "for at least 2 epochs"
            patience=2,
            verbose=0,
        )
    ]

    preds = np.zeros(X.shape[0])

    n_splits = 4

    # StratifiedKFold needs enough members of each class; fall back to a
    # plain KFold when the positive class is (nearly) absent.
    if y.sum() < 2:
        kfold = KFold(n_splits=n_splits)
    else:
        kfold = StratifiedKFold(n_splits=n_splits)

    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        print(f'Split {i+1} of {n_splits}...')
        pipe = build_pipe()

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the encoder on the training fold only (avoids target leakage).
        # BUG FIX: np.float was removed in NumPy 1.24 — use np.float64.
        encoder = OrdinalEncoder()
        X_train = encoder.fit_transform(X_train, y_train).astype(np.float64)

        pipe.fit(X_train, y_train, epochs=20, callbacks=callbacks, verbose=0)

        X_test = encoder.transform(X_test).astype(np.float64)
        pipe.evaluate(X_test, y_test, verbose=1)

        preds[test_index] = pipe.predict(X_test).flatten()

    # (removed a dead `pipe = build_pipe()` that was never used)
    return preds
コード例 #5
0
    def convert_meta_to_dict(self):
        """
        Build per-product metadata lookup structures.

        Returns a tuple ``(meta_dict, ordered_meta_counts_dict)``:
        ``meta_dict`` maps the internal product index (via ``self.word2id``)
        to the list of that product's ordinal-encoded metadata values;
        ``ordered_meta_counts_dict`` gives the cardinality of each metadata
        column ('product' first), i.e. the required embedding sizes.
        """
        meta = self.meta[['productid'] + self.META_COLS].copy()

        # Encode to int
        encoder = OrdinalEncoder(cols=self.META_COLS)
        meta = encoder.fit_transform(meta)
        # Persist the fitted encoder so the same codes can be reproduced later.
        save_model(encoder, '{}/encoder'.format(MODEL_PATH))

        # One list of encoded metadata values per row.
        meta['values'] = meta.apply(get_dict_values,
                                    args=(self.META_COLS, ),
                                    axis=1)
        meta_dict = meta.set_index('productid')['values'].to_dict()
        # Re-key by the internal product index used elsewhere in the model.
        meta_dict = {self.word2id[k]: v for k, v in meta_dict.items()}

        meta_counts_dict = (
            meta[self.META_COLS].max() +
            1).to_dict()  # Need to +1 to account for index starting from zero
        # Without +1 the embedding size will be insufficient by 1
        ordered_meta_counts_dict = OrderedDict()
        # NOTE(review): 'product' is not in meta_counts_dict, so it defaults
        # to 0 here — presumably filled in by the caller; confirm.
        for col in ['product'] + self.META_COLS:
            ordered_meta_counts_dict[col] = meta_counts_dict.get(col, 0)

        return meta_dict, ordered_meta_counts_dict
コード例 #6
0
        preran=False,
        drop_original=True,
    )

# Feature-engineering switches for this experiment; flip to re-run variants.
LENGTH_ENCODE = False
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for col in len_encode:
        # Replace each raw text column with its string length.
        X[f"{col}_len"] = X[col].apply(len)
        X = X.drop(col, axis=1)

CATEGORIZE = True
if CATEGORIZE:
    # Cast object columns to pandas 'category', then integer-encode everything.
    X[obj_cols] = X[obj_cols].astype("category")
    enc = OrdinalEncoder()
    X = enc.fit_transform(X)

DATE_ENCODE = False
if DATE_ENCODE:
    # Expand the "date" column into model-friendly parts (helper defined elsewhere).
    X = encode_dates(X, "date")

# Quick look at the target distribution before modelling.
sns.displot(y)
plt.title("Distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000

Xt, Xv, yt, yv = train_test_split(
    X, y, random_state=SEED)  # split into train and validation set
dt = xgb.DMatrix(Xt, yt)
コード例 #7
0
#     OrdinalEncoder(),
#     LGBMRegressor()
# )

# pipe.fit(X_train, y_train)
# print('훈련 R^2: ', pipe.score(X_train, y_train))
# print('검증 R^2: ', pipe.score(X_val, y_val))
# print('TEST R^2: ', pipe.score(X_test, y_test))

# print('\n훈련 MAE: ', mean_absolute_error(pipe.predict(X_train), y_train))
# print('검증 MAE: ', mean_absolute_error(pipe.predict(X_val), y_val))
# print('TEST MAE: ', mean_absolute_error(pipe.predict(X_test), y_test))

## Encoding: fit the ordinal encoder on the training set only, then reuse
## the fitted mapping for validation/test to avoid leakage.
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)
X_test_encoded = encoder.transform(X_test)

# ## Parameter tuning (kept for reference; currently disabled)
# lightGB = LGBMRegressor(learning_rate=0.01, max_depth=15, n_estimators=300000, num_leaves=250,
#                       random_state=1, reg_alpha=1, reg_lambda=1, subsample=0.7)

# eval_set = [(X_train_encoded, y_train),
#             (X_val_encoded,y_val),
#             (X_test_encoded, y_test)]

# lightGB.fit(X_train_encoded, y_train,
#           eval_set=eval_set,
#           early_stopping_rounds=1000,
#           eval_metric='mae',
コード例 #8
0
        'CoapplicantIncome': CoapplicantIncome,
        'LoanAmount': LoanAmount,
        'Loan_Amount_Term': Loan_Amount_Term
    }
    features = pd.DataFrame(data, index=[0])
    return features


df0 = user_input_features()

##################################################################################
st.write("Exibindo os dados de entrada", df0)

# Encode and normalize the user input.
# NOTE(review): the encoder is fitted on this single input row, so the codes
# need not match the training-time encoding — confirm this is intended.
ordinal = OrdinalEncoder()
df0 = ordinal.fit_transform(df0)

stand = StandardScaler().fit(df0)
# NOTE(review): the transform below is commented out, so the scaler is fitted
# but never applied — df0 reaches the model unscaled. Confirm intentional.
#df0 = stand.transform(df0)

##################################################################################

##################################################################################
# Load the dataset and split it into features and target.
loan = pd.read_csv('emprestimo_app.csv')

if st.sidebar.checkbox("Mostrar todos os dados"):
    st.subheader("Exibindo todos os dados")
    st.write(loan)

X = loan.drop('Loan_Status', axis=1)
コード例 #9
0
# Load the application tables (feather format for fast reload).
application_train = pd.read_feather('../data/input/application_train.ftr')
application_test = pd.read_feather('../data/input/application_test.ftr')

# NOTE(review): plain aliases — the column assignments below also mutate the
# original application_train / application_test frames.
train = application_train
test = application_test

categorical_columns = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'
]
# Integer-encode the categorical columns: fit on train, reuse on test.
enc = OrdinalEncoder(cols=categorical_columns, verbose=1)
train[categorical_columns] = enc.fit_transform(train[categorical_columns])
test[categorical_columns] = enc.transform(test[categorical_columns])

X_train = train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y_train = train.TARGET.values
X_test = test.drop(['SK_ID_CURR'], axis=1)

# Hyper-parameter grid; SEED is assumed to be defined earlier in the file.
params = {
    'metric': ['auc'],
    'learning_rate': [0.1],
    'num_leaves': [i * 10 for i in range(2, 6)],
    'min_data_in_leaf': [5, 10, 15, 20],
    'random_state': [SEED],
    'verbose': [1]
}
コード例 #10
0
File: encoder.py  Project: liulingzhi604/dataprocess
class Encoder():
    """Route features to ordinal / one-hot / count / target encoders.

    Each feature is assigned one of the four category_encoders classes via
    the ``method_mapper`` argument of :meth:`fit` / :meth:`fit_transform`.
    Fitted encoders can be persisted with :meth:`save_encoder` and restored
    with :meth:`load_encoder`.
    """

    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }
    # target_encoder / mean_encoder must NOT be fitted on train and
    # validation concatenated together (target leakage);
    # label_encoder / onehot_encoder may be.

    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        """
        Parameters
        ----------
        sparksess : optional
            Spark session; when set, the (unimplemented) Spark path is taken.
        logdir : str
            Directory under which fitted encoders are persisted.
        handle_unknown : str
            Forwarded to every category_encoders encoder.
        save_encoder : bool
            When True, fit() persists the fitted encoders to disk.
        """
        self.spark = sparksess
        self.logdir = logdir
        # BUG FIX: the original line was a bare `self.save_encoder` attribute
        # access (a no-op). The flag must also NOT be stored under that name,
        # because it would shadow the save_encoder() method below.
        self.should_save_encoder = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        # The encoders keep references to the feature lists above, so
        # columns appended during fit() are picked up at fit_transform time.
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)

    def fit(self,
            x_train,
            x_val=None,
            y_train=None,
            y_val=None,
            method_mapper=None):
        """
        Fit all encoders and return the encoded train/validation frames.

        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame

        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }

        Returns
        -------
        (x_train, y_train, x_val, y_val) with x_train / x_val encoded.
        """
        for feat in method_mapper:
            method = method_mapper[feat]
            if method == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                # BUG FIX: report the unsupported *method*, not the feature.
                raise ValueError(
                    '编码方式只支持[OrdinalEncoder, OneHotEncoder, CountEncoder, TargetEncoder], 接收到%s'
                    % method)

        if self.spark is None:
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                # Ordinal/one-hot codes carry no target information, so train
                # and validation may safely be encoded together.
                # NOTE(review): DataFrame.append was removed in pandas 2.0;
                # switch to pd.concat when upgrading.
                x_whole = x_train.append(x_val)
                y_whole = None
                if y_train is not None and y_val is not None:
                    y_whole = y_train.append(y_val)

                n_train = len(x_train)
                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:n_train]
                x_val = x_whole[n_train:]

            # Count/target encoders are fitted on the training split only.
            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

            if self.should_save_encoder:
                # BUG FIX: the original tested `self.save_encoder` (the bound
                # method, always truthy) and the intended flag assignment
                # would have shadowed this very method.
                self.save_encoder()
        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        """Apply all four fitted encoders to ``x`` and return ``(x, y)``."""
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self,
                      x_train,
                      x_val=None,
                      y_train=None,
                      y_val=None,
                      method_mapper=None):
        """
        Fit the encoders, then transform train (and optionally validation).

        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame

        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        """Persist the four encoders (pickle) and their feature lists (json)
        into a timestamped subdirectory of ``self.logdir``."""
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))

        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)

        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        """Restore encoders and feature lists written by save_encoder().

        BUG FIX: the original opened every file in write mode and *dumped*
        the current (unfitted) encoders, destroying the saved state instead
        of loading it; it also ignored its ``logdir`` parameter.

        NOTE(review): save_encoder() writes into a timestamped subdirectory,
        so ``logdir`` here must include that timestamp component — confirm
        against callers.
        """
        logdir = self.logdir if logdir is None else logdir

        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)

        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json'),
                  'r') as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json'),
                  'r') as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json'),
                  'r') as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json'),
                  'r') as f:
            self.target_encoder_features = json.load(f)