Example #1
0
 def _feature_encode(self, data):
     dummy_cols = []
     for col in data.cat_features:
         # merge categorical features with low frequencies
         if data.train_df[col].nunique() / len(data.train_df[col]) < 0.1:
             for name, count in data.train_df[col].value_counts().items():
                 if count / len(data.train_df[col]) < 0.01:
                     data.train_df[col].replace(name, 'Rare', inplace=True)
         if data.test_df[col].nunique() / len(data.test_df[col]) < 0.1:
             for name, count in data.test_df[col].value_counts().items():
                 if count / len(data.test_df[col]) < 0.01:
                     data.test_df[col].replace(name, 'Rare', inplace=True)
         # target-encode categorical features with high number of unique values
         if data.train_df[col].nunique() > 10:
             from category_encoders.target_encoder import TargetEncoder
             encoder = TargetEncoder(cols=col)
             encoder.fit(data.train_df[col], data.train_df[data.target_var])
             data.train_df[col] = encoder.transform(data.train_df[col])
             data.test_df[col] = encoder.transform(data.test_df[col])
         else:
             dummy_cols.append(col)
     # create dummy variables from categorical features with low number of unique values
     data.train_df = pd.get_dummies(data.train_df,
                                    columns=dummy_cols,
                                    drop_first=True)
     data.test_df = pd.get_dummies(data.test_df,
                                   columns=dummy_cols,
                                   drop_first=True)
     data.target_df = data.train_df[data.target_var]
Example #2
0
def target_encode():
    """Target-encode the object-dtype columns of the train/test CSVs and
    write the encoded columns (prefixed ``TE_``) out as new CSV files.
    """
    from category_encoders.target_encoder import TargetEncoder

    train = pd.read_csv('./data/tr.csv')
    test = pd.read_csv('./data/te.csv')
    target = train['TARGET'].astype(int)
    train.drop(['TARGET'], axis=1, inplace=True)

    # Remember which columns are categorical before encoding replaces them.
    cate_col = [col for col in train.columns if train[col].dtype == 'object']

    encode_model = TargetEncoder(verbose=1, min_samples_leaf=100)
    encode_model.fit(train, target)

    # Keep only the encoded categorical columns, renamed with a TE_ prefix.
    train = encode_model.transform(train)[cate_col]
    test = encode_model.transform(test)[cate_col]
    train.columns = ['TE_' + col for col in cate_col]
    test.columns = ['TE_' + col for col in cate_col]

    print(train.info())
    print(test.info())
    train.to_csv("./data/target_tr.csv", index=False)
    test.to_csv("./data/target_te.csv", index=False)
Example #3
0
def fit_target_encoder(train_imputed_categorical_df: pd.DataFrame,
                       train_transformed_target: pd.DataFrame):
    """Fit a ``TargetEncoder`` over every column of the imputed categorical
    frame against the transformed target and return the fitted encoder.
    """
    # TargetEncoder.fit returns the fitted encoder itself.
    return TargetEncoder(
        cols=train_imputed_categorical_df.columns.values,
    ).fit(X=train_imputed_categorical_df, y=train_transformed_target)
Example #4
0
def label_encoding_fit(X, y, cols):
    """Fit one TargetEncoder per column of ``cols`` on (X, y) and persist
    each fitted encoder via ``write_encoder``.
    """
    for column in cols:
        print("Encoding for column: {}".format(column))
        fitted = TargetEncoder(cols=[column])
        fitted.fit(X[column], y)
        write_encoder(fitted, 'label', column)
Example #5
0
def _encode():
    """Target-encode the categorical application columns and write the
    encoded frame (columns suffixed ``_ENC`` plus ``SK_ID_CURR``) to
    ``./data/app.enc.feather``.
    """
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    test = pd.read_feather('./data/application_test.preprocessed.feather')
    df = pd.concat([train, test], sort=False).reset_index(drop=True)
    cols = [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',  # Level of highest education the client achieved,  # noqa
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'OCCUPATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START',
        'REG_REGION_NOT_LIVE_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'LIVE_CITY_NOT_WORK_CITY',
        'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
        'NAME_CONTRACT_TYPE',  # Identification if loan is cash or revolving,
    ]
    encoder = TargetEncoder(cols=cols)
    # Fix: fit on the labelled (train) rows only. The concat leaves test
    # rows with NaN TARGET, and fitting on those corrupts the per-category
    # target means. Transform is then applied to the full frame.
    labelled = df['TARGET'].notna()
    encoder.fit(df.loc[labelled, cols], df.loc[labelled, 'TARGET'])
    res = encoder.transform(df[cols])
    res.columns = ['{}_ENC'.format(c) for c in res.columns]
    res['SK_ID_CURR'] = df['SK_ID_CURR']
    res.to_feather('./data/app.enc.feather')
Example #6
0
class target_enc(BaseEstimator, TransformerMixin):
    """Sklearn-compatible wrapper around category_encoders' TargetEncoder,
    restricted to the given ``columns``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        """Fit a TargetEncoder on ``df``/``y`` over ``self.columns``."""
        encoder = TargetEncoder(handle_unknown='value', cols=self.columns)
        self.encoder = encoder.fit(df, y)
        return self

    def transform(self, df, y=None):
        """Apply the fitted encoder to a copy of ``df``."""
        # NOTE(review): ``y`` is forwarded to transform as in the original;
        # confirm that is intended for your category_encoders version.
        return self.encoder.transform(df.copy(), y)
Example #7
0
File: CrawtoDS.py  Project: crawftv/crawto
 def target_encoder(self):
     """Fit a TargetEncoder on the imputed categorical frame against the
     transformed target and return the fitted encoder."""
     encoder = TargetEncoder(
         cols=self.train_imputed_categorical_df.columns.values)
     encoder.fit(
         X=self.train_imputed_categorical_df,
         y=self.train_transformed_target,
     )
     return encoder
Example #8
0
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract and encode listing features into a numeric frame.

    ``fit`` learns the target/ordinal/one-hot encoders and the imputation
    statistics from the training data; ``transform`` applies them to any
    frame with the same raw columns.
    """

    def __init__(self):
        self.cols_to_keep = [
            'city_origin', 'host_total_listings_count', 'host_since',
            'latitude', 'amenities', 'longitude', 'room_type', 'accommodates',
            'bathrooms', 'beds', 'guests_included', 'minimum_nights',
            'number_of_reviews', 'review_scores_rating', 'cancellation_policy',
            'reviews_per_month', 'instant_bookable', 'property_type'
        ]
        # Numeric columns with missing values (mean-imputed).
        self.num_na = [
            'host_total_listings_count', 'bathrooms', 'beds',
            'review_scores_rating', 'reviews_per_month'
        ]
        # Categorical columns with missing values (mode-imputed).
        self.cat_na = ['host_since', 'property_type']
        self.amenities_to_keep = [
            'Well-lit path to entrance',
            'translation missing: en.hosting_amenity_50',
            'Paid parking on premises', 'No stairs or steps to enter',
            'Private living room', 'Self check-in', 'Pets allowed',
            'Free street parking', 'Buzzer/wireless intercom',
            'Free parking on premises', 'Extra pillows and blankets',
            'Dishwasher', 'Patio or balcony', 'Cable TV',
            'Luggage dropoff allowed', 'Smoking allowed',
            'Paid parking off premises', 'Carbon monoxide detector',
            'Internet', 'Long term stays allowed', 'Dryer', 'Microwave',
            'Host greets you', 'Lock on bedroom door', 'First aid kit',
            'Coffee maker', 'Oven', 'Private entrance', 'Family/kid friendly',
            'Fire extinguisher', 'Stove', 'Bed linens', 'Cooking basics',
            'Elevator', 'Dishes and silverware', 'Refrigerator',
            'Air conditioning', 'Smoke detector', 'Iron', 'Hot water',
            'Laptop friendly workspace', 'Shampoo', 'TV'
        ]
        # NOTE(review): kept for backward compatibility (was a typo'd,
        # unused attribute). The fitted imputers now live in
        # ``num_imputer``/``cat_imputer``, set in ``fit``.
        self.inmputer = SimpleImputer()

    @staticmethod
    def _regroup_cat(value, allowed):
        """Map any category outside ``allowed`` to the 'other' bucket."""
        return value if value in allowed else 'other'

    @staticmethod
    def _host_since_year(series):
        """Parse 'YYYY-MM-DD' strings into the year component."""
        return pd.to_datetime(series, format='%Y-%m-%d').dt.year

    def fit(self, X_df, y=None):
        """Learn encoders and imputation statistics from ``X_df``/``y``."""
        # Target-encode property_type, regrouped into its frequent values.
        self.prop_to_keep = [
            'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
        ]
        self.prop_transformer = TargetEncoder()
        self.prop_transformer.fit(
            X_df['property_type'].apply(
                lambda x: self._regroup_cat(x, self.prop_to_keep)), y)

        # Target-encode cancellation_policy, regrouped likewise.
        self.pol_to_keep = [
            'flexible', 'strict_14_with_grace_period', 'moderate',
            'moderate_new'
        ]
        self.pol_transformer = TargetEncoder()
        self.pol_transformer.fit(
            X_df['cancellation_policy'].apply(
                lambda x: self._regroup_cat(x, self.pol_to_keep)), y)

        self.room_transformer = OrdinalEncoder()
        self.room_transformer.fit(X_df['room_type'])

        self.city_transformer = OneHotEncoder(handle_unknown='ignore')
        self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))

        # Fix: fit the imputers on the TRAINING data here, instead of
        # re-fitting them on every batch inside ``transform`` — the
        # original fit_transform-in-transform made the imputation
        # statistics depend on whichever frame was being transformed.
        self.num_imputer = SimpleImputer()
        self.num_imputer.fit(X_df[self.num_na])
        # The cat imputer sees host_since as a year, matching what
        # ``transform`` feeds it.
        cat_frame = X_df[self.cat_na].copy()
        cat_frame['host_since'] = self._host_since_year(
            cat_frame['host_since'])
        self.cat_imputer = SimpleImputer(strategy='most_frequent')
        self.cat_imputer.fit(cat_frame)

        return self

    def transform(self, X_df):
        """Apply the fitted encoders/imputers and return a numeric frame."""
        def replace_all(text, dic):
            # Apply every old -> new substitution in ``dic`` to ``text``.
            for old, new in dic.items():
                text = text.replace(old, new)
            return text

        X_new = X_df[self.cols_to_keep].copy()

        # date: keep only the signup year
        X_new['host_since'] = self._host_since_year(X_new['host_since'])

        # amenities: strip the '{', '"', '}' wrappers, then one-hot the
        # comma-separated list and keep the selected indicator columns
        amenities = X_new['amenities'].apply(
            lambda x: replace_all(x, {
                '{': '',
                '"': '',
                '}': ''
            })).str.get_dummies(sep=',')
        X_new = pd.merge(X_new,
                         amenities[self.amenities_to_keep],
                         left_index=True,
                         right_index=True)
        X_new.drop(['amenities'], axis=1, inplace=True)

        # fill missing values with the imputers fitted in ``fit``
        X_new[self.num_na] = self.num_imputer.transform(X_new[self.num_na])
        X_new[self.cat_na] = self.cat_imputer.transform(X_new[self.cat_na])

        # cancellation policy: regroup then target-encode
        X_new['cancellation_policy'] = self.pol_transformer.transform(
            X_new['cancellation_policy'].apply(
                lambda x: self._regroup_cat(x, self.pol_to_keep)))

        # property type: regroup then target-encode
        X_new['property_type'] = self.prop_transformer.transform(
            X_new['property_type'].apply(
                lambda x: self._regroup_cat(x, self.prop_to_keep)))

        # room type: ordinal encoding
        X_new['room_type'] = self.room_transformer.transform(
            X_new['room_type'])

        # city_origin: one-hot columns replace the raw column
        X_new = pd.concat(
            [
                X_new.reset_index(drop=True).drop(['city_origin'], axis=1),
                pd.DataFrame(
                    self.city_transformer.transform(
                        pd.DataFrame(X_new['city_origin'])).toarray())
            ],
            axis=1)

        # instant bookable: map the 't'/'f' flag to 1/0
        X_new['instant_bookable'] = X_new['instant_bookable'].replace({
            "t": 1,
            "f": 0
        })

        return X_new