Example #1
def _feature_encode(self, data):
    from category_encoders.target_encoder import TargetEncoder
    dummy_cols = []
    for col in data.cat_features:
        # merge categorical levels with low frequencies into a single 'Rare' level
        if data.train_df[col].nunique() / len(data.train_df[col]) < 0.1:
            for name, count in data.train_df[col].value_counts().items():
                if count / len(data.train_df[col]) < 0.01:
                    # assign back instead of a chained inplace replace, which
                    # does not reliably write through to the parent DataFrame
                    data.train_df[col] = data.train_df[col].replace(name, 'Rare')
        if data.test_df[col].nunique() / len(data.test_df[col]) < 0.1:
            for name, count in data.test_df[col].value_counts().items():
                if count / len(data.test_df[col]) < 0.01:
                    data.test_df[col] = data.test_df[col].replace(name, 'Rare')
        # target-encode categorical features with a high number of unique values
        if data.train_df[col].nunique() > 10:
            encoder = TargetEncoder(cols=col)
            encoder.fit(data.train_df[col], data.train_df[data.target_var])
            data.train_df[col] = encoder.transform(data.train_df[col])
            data.test_df[col] = encoder.transform(data.test_df[col])
        else:
            dummy_cols.append(col)
    # create dummy variables from categorical features with few unique values
    data.train_df = pd.get_dummies(data.train_df,
                                   columns=dummy_cols,
                                   drop_first=True)
    data.test_df = pd.get_dummies(data.test_df,
                                  columns=dummy_cols,
                                  drop_first=True)
    data.target_df = data.train_df[data.target_var]
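One caveat in this example: pd.get_dummies is applied to train_df and test_df independently, so the two frames can end up with different dummy columns whenever a category appears in only one of them. A minimal sketch of one common remedy, reindexing the test frame to the train schema (toy data, illustrative names):

import pandas as pd

train_df = pd.get_dummies(pd.DataFrame({'color': ['red', 'blue']}))
test_df = pd.get_dummies(pd.DataFrame({'color': ['red', 'green']}))

# Align test to the train schema: dummies missing from test are added as 0,
# and dummies unseen in train ('color_green') are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)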
Example #2
def target_encoder(cols, train_set, train_y, test_set):
    # handle_unknown and handle_missing are set to 'value'
    # For target encoding, handle_unknown and handle_missing only accept the
    # settings 'error', 'return_nan', and 'value'
    # Both default to 'value', which fills unknown categories and missing
    # values with the mean of the target variable on the training set
    encoder = TargetEncoder(cols=cols,
                            handle_unknown='value',
                            handle_missing='value').fit(train_set, train_y)
    encoded_train = encoder.transform(train_set)  # transform the training set
    encoded_test = encoder.transform(test_set)  # transform the test set

    return encoded_train, encoded_test
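The behavior described in the comments is easy to verify: with handle_unknown='value', a category never seen during fitting is encoded as the training-set target mean. A minimal sketch with made-up data:

import pandas as pd
from category_encoders import TargetEncoder

train_X = pd.DataFrame({'city': ['a', 'a', 'b', 'b']})
train_y = pd.Series([1, 0, 1, 1])  # global target mean = 0.75
test_X = pd.DataFrame({'city': ['a', 'unseen']})

enc = TargetEncoder(cols=['city'], handle_unknown='value',
                    handle_missing='value').fit(train_X, train_y)
print(enc.transform(test_X))  # the 'unseen' row is encoded as 0.75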
Example #3
def target_encode():
    import pandas as pd
    from category_encoders.target_encoder import TargetEncoder

    tr = pd.read_csv('./data/tr.csv')
    te = pd.read_csv('./data/te.csv')
    y = tr['TARGET'].astype(int)
    tr.drop(['TARGET'], axis=1, inplace=True)

    encode_model = TargetEncoder(verbose=1, min_samples_leaf=100)

    cate_col = []
    for col in tr.columns:
        if tr[col].dtype == 'object':
            cate_col.append(col)

    encode_model.fit(tr, y)
    tr = encode_model.transform(tr)
    te = encode_model.transform(te)

    tr = tr[cate_col]
    te = te[cate_col]
    tr.columns = ['TE_' + col for col in cate_col]
    te.columns = ['TE_' + col for col in cate_col]
    print(tr.info())
    print(te.info())
    tr.to_csv("./data/target_tr.csv", index=False)
    te.to_csv("./data/target_te.csv", index=False)
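The min_samples_leaf=100 argument above interacts with the encoder's smoothing: a category with far fewer rows than min_samples_leaf is pulled almost entirely toward the global target mean instead of its own category mean. A small sketch of the effect on toy data:

import pandas as pd
from category_encoders.target_encoder import TargetEncoder

X = pd.DataFrame({'cat': ['common'] * 200 + ['rare']})
y = pd.Series([1, 0] * 100 + [1])  # the single 'rare' row has target 1

enc = TargetEncoder(cols=['cat'], min_samples_leaf=100, smoothing=10)
encoded = enc.fit_transform(X, y)
# the lone 'rare' row lands near the global mean (~0.5), not at 1.0
print(encoded.drop_duplicates())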
Example #4
    def target_encoder(cols, train_set, train_y, test_set):
        """
            特征无内在顺序,category数量 > 4
            Target encoding 采用 target mean value (among each category) 来给categorical feature做编码。
            handle_unknown 和 handle_missing 被设定为 'value'
            在目标编码中,handle_unknown 和 handle_missing 仅接受 ‘error’, ‘return_nan’ 及 ‘value’ 设定
            两者的默认值均为 ‘value’, 即对未知类别或缺失值填充训练集的因变量平均值
        """
        encoder = TargetEncoder(cols=cols,
                                handle_unknown='value',
                                handle_missing='value').fit(
                                    train_set, train_y)
        encoded_train = encoder.transform(train_set)  # transform the training set
        encoded_test = encoder.transform(test_set)  # transform the test set

        return encoded_train, encoded_test
Example #5
def target_encoder(params):
    train = params[0].astype('str')
    test = params[1].astype('str')
    target = params[2]
    te = TargetEncoder(return_df=False)
    train = te.fit_transform(train.reshape(-1, 1), target.reshape(-1, 1))
    test = te.transform(test.reshape(-1, 1))
    return train.flatten(), test.flatten()
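Here params is presumably a (train_column, test_column, target) tuple of 1-D numpy arrays; a hypothetical call might look like this (all names invented for illustration):

import numpy as np

train_col = np.array(['a', 'b', 'a', 'b'])
test_col = np.array(['b', 'a', 'c'])
y = np.array([1.0, 0.0, 1.0, 0.0])

train_enc, test_enc = target_encoder((train_col, test_col, y))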
Example #6
def encode_features(features, labels):
    """Encode categorical features with TargetEncoder"""
    import time
    from category_encoders import TargetEncoder

    features_columns = features.columns.values.tolist()

    start_time = time.time()
    enc = TargetEncoder(cols=features_columns,
                        return_df=True).fit(features, labels)
    encoded_features = enc.transform(features)
    print("--- %s seconds ---" % (time.time() - start_time))
    return encoded_features
Example #7
class target_enc(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        self.encoder = TargetEncoder(handle_unknown='value', cols=self.columns)
        self.encoder = self.encoder.fit(df, y)
        return self

    def transform(self, df, y=None):
        df_ = df.copy()

        return self.encoder.transform(df_, y)
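Because target_enc exposes fit/transform with the scikit-learn estimator API, it can be dropped into a Pipeline. A hedged usage sketch on toy data:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

X = pd.DataFrame({'city': ['a', 'b', 'a', 'b'], 'x0': [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([0, 1, 0, 1])

pipe = Pipeline([
    ('encode', target_enc(columns=['city'])),  # target-encode 'city' only
    ('clf', LogisticRegression()),
])
pipe.fit(X, y)
print(pipe.predict(X))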
Example #8
def target_encode(data, label, encoder=None):
    """

    :param data:
    :param label:
    :param encoder: if supplied the encoder will be used to predict onto data
    :return:
    """
    if encoder is None:
        encoder = TargetEncoder()
        data = encoder.fit_transform(data, label)
        return encoder, data
    else:
        return encoder, encoder.transform(data, label)
Example #9
class DateTransformer(BaseEstimator, TransformerMixin):
    """Transforms DATE using target encoding of MONTH."""

    def __init__(self):
        self.encoder = None
        self.month = None

    def fit(self, X, y=None):
        self.month = X.apply(lambda col: col.apply(lambda d: str(d.month)))
        self.encoder = TargetEncoder().fit(self.month, y)
        return self

    def transform(self, X, y=None):
        months = X.apply(lambda col: col.apply(lambda d: str(d.month)))
        target_encoded_month = self.encoder.transform(months)
        return target_encoded_month
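A hypothetical use of DateTransformer on a single datetime column (toy data):

import pandas as pd

X = pd.DataFrame({'DATE': pd.to_datetime(
    ['2020-01-05', '2020-02-10', '2020-01-20', '2020-02-25'])})
y = pd.Series([1.0, 0.0, 1.0, 0.0])

dt = DateTransformer().fit(X, y)
print(dt.transform(X))  # one target-encoded value per calendar month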
Example #10
def _encode():
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    test = pd.read_feather('./data/application_test.preprocessed.feather')
    df = pd.concat([train, test], sort=False).reset_index(drop=True)
    cols = [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',  # Level of highest education the client achieved,  # noqa
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'OCCUPATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START',
        'REG_REGION_NOT_LIVE_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'LIVE_CITY_NOT_WORK_CITY',
        'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
        'NAME_CONTRACT_TYPE',  # Identification if loan is cash or revolving,
    ]
    encoder = TargetEncoder(cols=cols)
    encoder.fit(df[cols], df['TARGET'])
    res = encoder.transform(df[cols])
    res.columns = ['{}_ENC'.format(c) for c in res.columns]
    res['SK_ID_CURR'] = df['SK_ID_CURR']
    res.to_feather('./data/app.enc.feather')
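A hedged sketch of consuming the saved feature file, joining the encoded columns back onto the train frame by SK_ID_CURR (paths as in the example above):

import pandas as pd

enc = pd.read_feather('./data/app.enc.feather')
train = pd.read_feather('./data/application_train.preprocessed.feather')
train = train.merge(enc, on='SK_ID_CURR', how='left')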
Example #11
def target_encode_Stores(df, enc=None):
    """Target encode the Store variable using the category_encoders module

    Args:
        df: Data
        enc: Existing Encoder / if None retrain new encoder
    """

    target = df['Sales'].values
    stores = df['Store'].astype(str)

    if not enc:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)

    df.loc[:, 'Store'] = new_store

    return new_store, enc
Example #12
def target_encode_custom(df: pd.DataFrame, name: str, enc=None):
    """Target encode the Store variable using the category_encoders module

    Args:
        df: Data
        name (str): name of the column to encode
        enc: Existing Encoder / if None retrain new encoder
    """

    target = df['Sales'].values
    stores = df[name].astype(str)

    if not enc:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)

    df.loc[:, name] = new_store

    return new_store, enc
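A hypothetical two-step call, fitting on a train frame and then reusing the encoder on a test frame (toy data; note the test frame still needs a Sales column because the function reads it unconditionally):

import pandas as pd

df_train = pd.DataFrame({'Store': [1, 1, 2, 2],
                         'Sales': [100.0, 120.0, 80.0, 90.0]})
df_test = pd.DataFrame({'Store': [2, 1], 'Sales': [0.0, 0.0]})  # placeholder Sales

_, enc = target_encode_custom(df_train, 'Store')
encoded_test, _ = target_encode_custom(df_test, 'Store', enc=enc)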
Example #13
                        ,reg_alpha=.1
                        ,reg_lambda=.1
                        )
    return lgbr

# Local validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    tf = train.iloc[tidx]
    df = train.iloc[didx]
    tt = target.iloc[tidx]
    dt = target.iloc[didx]
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)
    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)
print(np.mean(devscore))

# # Retrain on the whole train set, predict on test, and write out the result
# lgbr = makelgb()
# te = TargetEncoder(cols=tecols)
# tf = te.fit_transform(train, target)
# df = te.transform(test)
# lgbr.fit(tf, target)
# pre = lgbr.predict(df)
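Note the pattern in the loop above: the TargetEncoder is re-fit inside each fold on the training indices only and then applied to the held-out fold, so a fold's own targets never leak into its encoding. The commented-out block repeats the same fit-on-train / transform-on-test pattern for the final model.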
Example #14
train_data.head()

from sklearn.model_selection import train_test_split

# n2 is highly redundant with n3, so drop n2
train_data.drop(['n2', 'issueDate'], axis=1, inplace=True)
test_data = test_data[train_data.columns]

# collect the non-numeric columns
s = train_data.dtypes
tecols = s[s == 'object'].index.tolist()

# encode the non-numeric columns directly with TargetEncoder
te = TargetEncoder(cols=tecols)
tf = te.fit_transform(train_data, target)
df = te.transform(test_data)

# split into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(tf,
                                                              target,
                                                              test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)
"""使用优化后的参数初始化模型(参数通过网格搜索法进行优化,这里没有相关代码)"""
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 14,
    'max_depth': 19,
Example #15
def lin_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        Drops NaN from training data,
        Replaces NaN in test data with ffill, 
        target-encodes non-numeric fields, 
        scales values,
        80/20 splits data to help verify model, 
        selects features using RFECV, with a lasso mode, cv set to 5,
        uses KNeighborRegressor for 11 nearest neighbours weighted to distance
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal but fillna the mean freezes for some reason
    clean_unlabelled = clean_unlabelled.fillna(method="ffill") 
    # clean_unlabelled = clean_unlabelled.fillna("None")

    # remove some columns
    # clean_labelled = drop_columns(clean_labelled)
    # clean_unlabelled = drop_columns(clean_unlabelled)

    # print("one hot encoding data...")
    # One hot encoding
    # ohe = OneHotEncoder(
    #     categories="auto", 
    #     handle_unknown="ignore",
    #     sparse=False
    # )
    # clean_labelled = encode_training(ohe, clean_labelled)
    # clean_unlabelled = encode_testing(ohe, clean_unlabelled)

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("target encoding data...")
    # Target encoding
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # feature selection
    lasso = lm.Lasso()
    selector = RFECV(lasso, cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    # fit model
    # lasso = lm.LassoCV(cv=5)
    # lasso.fit(train_data, train_target)
    neigh = KNeighborsRegressor(
        n_neighbors=11,
        weights="distance"
    )
    neigh.fit(train_data, train_target) 

    print("analysing test results...")
    # validate test
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    # predict and format
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
Example #16
def target_encoder(train, test, features, y):
    targetencoder = TargetEncoder(cols=features).fit(train.loc[:, features],
                                                     train[y])
    train_encoded = targetencoder.transform(train.loc[:, features], train[y])
    test_encoded = targetencoder.transform(test.loc[:, features])
    return (train_encoded, test_encoded, train[y], test[y])
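A hypothetical call with toy frames, where y is the name of the target column:

import pandas as pd

train = pd.DataFrame({'cat': ['a', 'b', 'a', 'b'], 'label': [1, 0, 1, 0]})
test = pd.DataFrame({'cat': ['b', 'a'], 'label': [0, 1]})

train_enc, test_enc, train_y, test_y = target_encoder(
    train, test, features=['cat'], y='label')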
Example #17
#MEE_encoder = MEstimateEncoder()
#train_mee = MEE_encoder.fit_transform(train_set[feature_list], target)
#test_mee = MEE_encoder.transform(test_set[feature_list])
#print(train_mee.head())
X_train, X_val, y_train, y_val = train_test_split(train_set, target, test_size=0.2, random_state=97)
lr = LinearRegression()
rf = RandomForestRegressor()


TE_encoder = TargetEncoder()
train_te = TE_encoder.fit_transform(train_set[feature_list], target)
test_te = TE_encoder.transform(test_set[feature_list])
#print(train_te.head())
encoder_list = [TargetEncoder(), MEstimateEncoder()]
X_train, X_val, y_train, y_val = train_test_split(train_set, target, test_size=0.2, random_state=97)
#X_train, X_val, y_train, y_val = dataset_test()
lr = LinearRegression()

for encoder in encoder_list:
    # print("Test {} : ".format(str(encoder).split('(')[0]), end=" ")
    train_enc = encoder.fit_transform(X_train[feature_list], y_train)
    # test_enc = encoder.transform(test[feature_list])
    val_enc = encoder.transform(X_val[feature_list])

    lr.fit(train_enc, y_train)
    # print(lr.score(train_enc, y_train))
Example #18
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols_to_keep = [
            'city_origin', 'host_total_listings_count', 'host_since',
            'latitude', 'amenities', 'longitude', 'room_type', 'accommodates',
            'bathrooms', 'beds', 'guests_included', 'minimum_nights',
            'number_of_reviews', 'review_scores_rating', 'cancellation_policy',
            'reviews_per_month', 'instant_bookable', 'property_type'
        ]
        self.num_na = [
            'host_total_listings_count', 'bathrooms', 'beds',
            'review_scores_rating', 'reviews_per_month'
        ]
        self.cat_na = ['host_since', 'property_type']
        self.amenities_to_keep = [
            'Well-lit path to entrance',
            'translation missing: en.hosting_amenity_50',
            'Paid parking on premises', 'No stairs or steps to enter',
            'Private living room', 'Self check-in', 'Pets allowed',
            'Free street parking', 'Buzzer/wireless intercom',
            'Free parking on premises', 'Extra pillows and blankets',
            'Dishwasher', 'Patio or balcony', 'Cable TV',
            'Luggage dropoff allowed', 'Smoking allowed',
            'Paid parking off premises', 'Carbon monoxide detector',
            'Internet', 'Long term stays allowed', 'Dryer', 'Microwave',
            'Host greets you', 'Lock on bedroom door', 'First aid kit',
            'Coffee maker', 'Oven', 'Private entrance', 'Family/kid friendly',
            'Fire extinguisher', 'Stove', 'Bed linens', 'Cooking basics',
            'Elevator', 'Dishes and silverware', 'Refrigerator',
            'Air conditioning', 'Smoke detector', 'Iron', 'Hot water',
            'Laptop friendly workspace', 'Shampoo', 'TV'
        ]
        self.imputer = SimpleImputer()

    def fit(self, X_df, y=None):
        def regroup_cat(X, liste):
            if X not in liste:
                return ('other')
            else:
                return (X)

        self.prop_to_keep = [
            'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
        ]
        self.prop_transformer = TargetEncoder()
        self.prop_transformer.fit(
            X_df['property_type'].apply(
                lambda x: regroup_cat(x, self.prop_to_keep)), y)

        self.pol_to_keep = [
            'flexible', 'strict_14_with_grace_period', 'moderate',
            'moderate_new'
        ]
        self.pol_transformer = TargetEncoder()
        self.pol_transformer.fit(
            X_df['cancellation_policy'].apply(
                lambda x: regroup_cat(x, self.pol_to_keep)), y)

        self.room_transformer = OrdinalEncoder()
        self.room_transformer.fit(X_df['room_type'])

        self.city_transformer = OneHotEncoder(handle_unknown='ignore')
        self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))

        # numeric_transformer = Pipeline(steps = [('impute', SimpleImputer(strategy='median'))])

        return self

    def transform(self, X_df):
        def regroup_cat(X, liste):
            if X not in liste:
                return ('other')
            else:
                return (X)

        def replace_all(text, dic):
            for i, j in dic.items():
                text = text.replace(i, j)
            return text

        X_new = X_df[self.cols_to_keep].copy()

        #date
        X_new['host_since'] = pd.to_datetime(X_new['host_since'],
                                             format='%Y-%m-%d').dt.year

        #amenities
        amenities = X_new['amenities'].apply(
            lambda x: replace_all(x, {
                '{': '',
                '"': '',
                '}': ''
            })).str.get_dummies(sep=',')
        X_new = pd.merge(X_new,
                         amenities[self.amenities_to_keep],
                         left_index=True,
                         right_index=True)
        X_new.drop(['amenities'], axis=1, inplace=True)

        #fill missing
        X_new[self.num_na] = SimpleImputer().fit_transform(X_new[self.num_na])
        X_new[self.cat_na] = SimpleImputer(
            strategy='most_frequent').fit_transform(X_new[self.cat_na])

        #cat encoding
        ## cancellation policy encoding
        X_new['cancellation_policy'] = self.pol_transformer.transform(
            X_new['cancellation_policy'].apply(
                lambda x: regroup_cat(x, self.pol_to_keep)))

        ## property type
        X_new['property_type'] = self.prop_transformer.transform(
            X_new['property_type'].apply(
                lambda x: regroup_cat(x, self.prop_to_keep)))

        ##room type
        X_new['room_type'] = self.room_transformer.transform(
            X_new['room_type'])

        ###city_origin_encoding
        X_new = pd.concat(
            [
                X_new.reset_index(drop=True).drop(['city_origin'], axis=1),
                pd.DataFrame(
                    self.city_transformer.transform(
                        pd.DataFrame(X_new['city_origin'])).toarray())
            ],
            axis=1)  #X_new.drop(['city_origin'], axis=1, inplace=True)

        #instant bookable
        X_new['instant_bookable'] = X_new['instant_bookable'].replace({
            "t": 1,
            "f": 0
        })

        return X_new
Example #19
#test_numerics
test_data["Age"] = test_data["Age"].fillna(test_data['Age'].mean())
test_data["Year of Record"] = test_data["Year of Record"].fillna(
    test_data['Year of Record'].mean())

numerics_and_catagorics = list(train_data.columns)
X_train_data, X_val, y_train_data, y_val = train_test_split(train_data,
                                                            target,
                                                            test_size=0.2,
                                                            random_state=97)
random_forest = RandomForestRegressor()
#target_encoder
target_encoder = TargetEncoder()
train_data_encoded = target_encoder.fit_transform(
    X_train_data[numerics_and_catagorics], y_train_data)
test_data_encoded = target_encoder.transform(X_val[numerics_and_catagorics],
                                             y_val)
test_data = target_encoder.transform(test_data[numerics_and_catagorics])

print(train_data_encoded.head())
print(test_data_encoded.head())

#random_forest_regressor
r_train_data = random_forest.fit(train_data_encoded, y_train_data)
prediction = r_train_data.predict(test_data_encoded)


def rmse(prediction, target):
    # RMSE: root of the mean of the squared differences
    return np.sqrt(np.mean((prediction - target) ** 2))
Example #20
def target_encode(X, X_test, cols, y):
    te = TargetEncoder(cols=cols, return_df=True)
    X = te.fit_transform(X, y)
    X_test = te.transform(X_test)
    return (X, X_test)
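A hypothetical call on toy frames:

import pandas as pd

X = pd.DataFrame({'cat': ['a', 'b', 'a'], 'num': [1.0, 2.0, 3.0]})
X_test = pd.DataFrame({'cat': ['b', 'c'], 'num': [4.0, 5.0]})
y = pd.Series([1, 0, 1])

X_enc, X_test_enc = target_encode(X, X_test, cols=['cat'], y=y)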
Example #21
                         reg_alpha=.1,
                         reg_lambda=.1)
    return lgbr


# Local validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    tf = train.iloc[tidx]
    df = train.iloc[didx]
    tt = target.iloc[tidx]
    dt = target.iloc[didx]
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)
    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)
print(np.mean(devscore))

# Retrain on the whole train set, predict on test, and write out the result
lgbr = makelgb()
te = TargetEncoder(cols=tecols)
tf = te.fit_transform(train, target)
df = te.transform(test)
lgbr.fit(tf, target)
pre = lgbr.predict(df)