# Imports assumed by this snippet (not shown in the original source).
import lightgbm as lgb
import pandas as pd
from datetime import datetime
from category_encoders import TargetEncoder


def run(dataset_version, params):
    train, val, test = load_data(dataset_version)

    X = train.drop(columns='target_pct_vunerable')
    y = train.target_pct_vunerable

    # Will use this as local val score and compare with CV score
    X_val = val.drop(columns='target_pct_vunerable')
    y_val = val.target_pct_vunerable

    X_test = test.copy()

    # Create categorical encoder
    cat_cols = X.select_dtypes('object').columns.tolist()
    enc = TargetEncoder(cols=cat_cols)

    # Tune no. estimators on validation set
    X_train = enc.fit_transform(X, y)
    X_val = enc.transform(X_val)

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train,
              y,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              verbose=25,
              early_stopping_rounds=50)

    params.update({'n_estimators': model.best_iteration_})

    # Combine validation set back with train set
    data = pd.concat([train, val], axis=0, sort=False)

    X = data.drop(columns='target_pct_vunerable')
    y = data.target_pct_vunerable

    X = enc.fit_transform(X, y)

    model = lgb.LGBMRegressor(**params)
    model.fit(X, y)

    # Make a submission file
    X_test = enc.transform(X_test)

    test_preds = model.predict(X_test)

    sub = pd.DataFrame({'ward': X_test.index, y.name: test_preds})

    now = datetime.now()
    fname = f'lgbm_{dataset_version}_{now.year}-{now.month}-{now.day}--{now.hour}-{now.minute}.csv'

    sub.to_csv(f'../data/submissions/{fname}', index=False)
Example #2
from copy import deepcopy

from category_encoders import TargetEncoder


def prepare_df(df, columns, target):
    ''' Prepares a pd.DataFrame by turning missing scikit-learn preprocessors into "None" strings and
            performing target encoding on the input columns.

    Parameters:
    -----------
    df: pd.DataFrame
        Contains a pd.DataFrame with the generated meta-data.
    columns: list
        Contains a list with the columns that contain scikit-learn estimators and scikit-learn preprocessors.
    target: str
        Contains a string that represents the name of the column that is the target of the dataset.
    Returns:
    --------
    pd.DataFrame
        Contains adjusted pd.DataFrame.
    '''
    df = deepcopy(df)
    df = df.reset_index(drop=True)
    df = df.drop_duplicates()
    y = df[target]

    for column in ['component_1', 'component_2', 'component_3']:
        df[column] = df[column].apply(nan_to_none)

    for column in columns:
        df[column] = df[column].astype('category')
        df['{}_codes'.format(column)] = df[column].cat.codes
        enc = TargetEncoder(cols=[column])
        df['{}_encoded'.format(column)] = enc.fit_transform(df[column], y)

    return df
Example #3
    def getTestTrainSlipt(self):
        ## If neither testX nor testTrainSplit is passed, raise an exception.
        if ((self.testX is None) and (self.testTrainSplit is None)):
            raise Exception("Please pass testX or testTrainSplit")

        if (self.targetEncodeCols):
            for col in self.targetEncodeCols:
                encoder = TargetEncoder()
                # TargetEncoder needs the target y to fit.
                self.X[col] = encoder.fit_transform(self.X[col], self.Y)
                if (self.testX is not None):
                    # Reuse the fitted encoder; refitting on test data would leak.
                    self.testX[col] = encoder.transform(self.testX[col])

        if (self.testTrainSplit):
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.Y, test_size=self.testTrainSplit, random_state=7)
            return X_train, X_test, y_train, y_test
        else:
            return self.X, self.testX, self.Y, self.testY
Example #4
    def getTestTrainSlipt(self):
        ## If neither testX nor testTrainSplit is passed, raise an exception.
        if ((self.testX is None) and (self.testTrainSplit is None)):
            raise Exception("Please pass testX or testTrainSplit")

        ## If targetEncodeCols is given, target encode them first.
        if (self.targetEncodeCols):
            for col in self.targetEncodeCols:
                encoder = TargetEncoder()
                self.X[col] = encoder.fit_transform(self.X[col], self.Y)
                if (self.testX is not None and self.testY is not None):
                    # Reuse the fitted encoder; refitting on test data would leak.
                    self.testX[col] = encoder.transform(self.testX[col])

        if ((self.testX is not None) and (self.testTrainSplit is None)):
            return self.X, self.testX, self.Y, self.testY

        ## If stratify, smote and testTrainSplit are not passed, then just return.
        if (not self.stratify and not self.applySmote and not self.testTrainSplit):
            return self.X, self.testX, self.Y, self.testY

        # If the stratify flag is passed, stratify the split on the Y variable.
        stratifyVar = self.Y if self.stratify else None
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.Y, stratify=stratifyVar,
                                                            test_size=self.testTrainSplit, random_state=7)
        if (not self.applySmote and not self.underSample):
            return X_train, X_test, y_train, y_test
        else:
            X_train = X_train if self.testTrainSplit is not None else self.X
            y_train = y_train if self.testTrainSplit is not None else self.Y
            X_test = X_test if self.testTrainSplit is not None else self.testX
            y_test = y_test if self.testTrainSplit is not None else self.testY
            if (self.applySmote):
                sm = SMOTE(sampling_strategy=self.sampling)
                # imblearn's current API is fit_resample (fit_sample was removed).
                X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
                return X_train_res, X_test, y_train_res, y_test
            if (self.underSample):
                underSampler = RandomUnderSampler(sampling_strategy=self.sampling)
                X_train_res, y_train_res = underSampler.fit_resample(X_train, y_train)
                return X_train_res, X_test, y_train_res, y_test
Example #5
    def transform(self, X):

        if self.aliases:
            X[self.aliases] = X[self.cols]
            self.cols = self.aliases

        # Note: the encoder is refitted on X at every call, so applying this
        # to held-out data target-encodes with that data's own labels (leaky).
        t_enc = TargetEncoder(cols=self.cols)
        X = t_enc.fit_transform(X, X[self.target_col])
        if not self.ordinal_transform:
            return X

        o_enc = OrdinalEncoder()
        X[self.cols] = o_enc.fit_transform(X[self.cols])
        return X
Example #6
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    def __init__(self, *, numeric_columns, categorical_columns):
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder = None
        self.__one_hot_encoder = None
        self.__max_target = None
        self.__max_param = None
        self.__clf = None

    def fit(self, X, y):
        X = X.copy(deep=True)
        y = y.copy(deep=True)

        self.__target_encoder = TargetEncoder()
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)

        self.__max_target, self.__max_param = optimize_rf(X, y)
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(
                min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))

        self.__clf.fit(X, y)
        gc.collect()

        return self

    def transform(self, X):
        X = X.copy(deep=True)

        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()

        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X=X, y=y)

        return self.transform(X)
Example #7

from category_encoders import TargetEncoder


# In[189]:


new_df


# In[197]:


encoder = TargetEncoder()
encoder.fit_transform(new_df['Sex'],new_df['Survived'])


# In[198]:


encoder = TargetEncoder()
encoder.fit_transform(new_df['Embarked'],new_df['Survived'])


Example #8
    # Feature hashing of word features
    def hash_features(word_list, m):
        output = [0] * m
        for word in word_list:
            index = hash_fcn(word) % m
            output[index] += 1
        return output

    # Signed feature hashing (renamed: the original definition shadowed hash_features)
    def hash_features_signed(word_list, m):
        output = [0] * m
        for word in word_list:
            index = hash_fcn(word) % m
            sign_bit = sign_hash(word) % 2
            if sign_bit == 0:
                output[index] -= 1
            else:
                output[index] += 1
        return output

    h = FeatureHasher(n_features=m, input_type="string")
    f = h.transform(df["feat"])

    enc = TargetEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)

    enc = LeaveOneOutEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)

    enc = WOEEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)
Example #9
Exited
0    7963
1    2037
Name: Geography, dtype: int64
'''

###############################################################################
#                         3. Data Preprocessing                               #
###############################################################################

# Encoding Categorical Variables
l = LabelEncoder()
df['Gender'] = l.fit_transform(df['Gender'])

encoder = TargetEncoder()
df['country'] = encoder.fit_transform(df['Geography'], df['Exited'])

df.drop(['Geography'], inplace = True, axis = 1)

# Splitting into dependent and independent vectors
x = df.drop(['Exited'], axis = 1)
y = df.Exited

# y = y.values.reshape(-1,1)

# Standard Scaling
S = StandardScaler()
x = S.fit_transform(x)

###############################################################################
#         4. Splitting the dataset into training set and test set             #
###############################################################################

x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42
)


Example #10
pd.Series(train_mod[['Crop_Type','Crop_Damage']].groupby(['Crop_Type']).count()/88858)


from category_encoders import TargetEncoder
encoder = TargetEncoder()
t = encoder.fit_transform(train_mod['Crop_Type'], train_mod['Crop_Damage'])


from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

# 750 was passed positionally in the original; n_estimators is the likely intent.
bc = BaggingClassifier(base_estimator=XGBClassifier(n_estimators=750),
                       n_estimators=15,
                       verbose=20,
                       bootstrap=False,
                       max_features=1)

bc.fit(X, y)

# clf is defined elsewhere in the original notebook.
clf.fit(X_train, y_train)


submission = sample.copy()
submission['Crop_Damage'] = bc.predict(test_x)
submission.to_csv('bc1.csv', index=False)
Example #11
}

with timer('training'):
    cv_results = []
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = pd.DataFrame(index=X_train.columns)
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx]
        y_trn = y_train[trn_idx]
        X_val = X_train.iloc[val_idx]
        y_val = y_train[val_idx]
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)
        with timer('target encoding'):
            te = TargetEncoder()
            X_trn = te.fit_transform(X_trn, y_trn)
            X_val = te.transform(X_val)
            X_test_ = te.transform(X_test)
            # fillna returns a new frame; the original discarded the result.
            X_trn = X_trn.fillna(-9999)
            X_val = X_val.fillna(-9999)
            X_test_ = X_test_.fillna(-9999)
        
        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn, y_trn, eval_set=[(X_trn, y_trn), (X_val, y_val)], **fit_params)
        
        p = model.predict_proba(X_val)[:, 1]
        val_series.iloc[val_idx] = p
        cv_results.append(roc_auc_score(y_val, p))
        test_df[i] = model.predict_proba(X_test_)[:, 1]
        feat_df[i] = model.feature_importances_
Example #12
train_X

catCols = [
    cname for cname in train_X.columns if train_X[cname].dtype == 'object'
]
catCols
train_X_cat = train_X[catCols].copy()
val_X_cat = val_X[catCols].copy()

from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
simple_imputer = SimpleImputer(strategy='most_frequent')

target_encoder = TargetEncoder()

train_X_targetenc = target_encoder.fit_transform(train_X_cat, train_y)
val_X_targetenc = target_encoder.transform(val_X_cat)
train_X_labelenc


def score_dataset(X_train, X_valid, y_train, y_valid):
    model = LinearRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return np.sqrt(metrics.mean_squared_error(y_valid, preds))


simple_imputer = SimpleImputer()
numCols = [
    cname for cname in train_X.columns if train_X[cname].dtype != "object"
]
"""Encoding categorical variables"""

!pip install --upgrade category_encoders

#encoding categorical data Gender
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
train.loc[:,'Gender'] = l.fit_transform(train.loc[:,'Gender'])
# train.loc[:, '12th Completion year'] = l.fit_transform(train.loc[:, '12th Completion year'])
# train.loc[:, '10th Completion Year'] = l.fit_transform(train.loc[:, '10th Completion Year'])

train.loc[:,'Performance']=l.fit_transform(train.loc[:,'Performance'])

from category_encoders import TargetEncoder
encoder = TargetEncoder()
train['Specialization in study'] = encoder.fit_transform(train['Specialization in study'], train['Performance'])
# train['10Y'] = encoder.fit_transform(train['10th Completion Year'], train['Performance'])
# train['12Y'] = encoder.fit_transform(train['12th Completion year'], train['Performance'])

encoder = TargetEncoder()
train['Year of Completion of college'] = encoder.fit_transform(train['Year of Completion of college'], train['Performance'])

encoder = TargetEncoder()
train['12th Completion year'] = encoder.fit_transform(train['12th Completion year'], train['Performance'])

encoder = TargetEncoder()
train['10th Completion Year'] = encoder.fit_transform(train['10th Completion Year'], train['Performance'])

train.head(5)

train.describe()
Example #14
pd_data = pd_data.drop(columns=['XINGBIE', 'HYZK'])

scaler_3 = TargetEncoder(cols=['ZHIYE', 'DWJJLX', 'DWSSHY'])

# print(pd_data.head())
#
# print(pd_data.isnull().sum())
#
# exit(0)

train_data = pd_data[pd_data['id'].isin(train_data_ids)]
test_data = pd_data[~pd_data['id'].isin(train_data_ids)]

# exit(0)

train_data = scaler_3.fit_transform(train_data, train_data['label'])
test_data = scaler_3.transform(test_data)

# print(train_data.columns)
features = [col for col in train_data.columns if col not in del_columns]

# exit(0)
x_train = np.array(train_data[features])
y_train = np.array(train_data['label'])
x_test = np.array(test_data[features])

folds = StratifiedKFold(n_splits=6, shuffle=True, random_state=111)
kfolds = folds.split(x_train, y_train)

oof_lgb = np.zeros((len(train_data), 2))
predictions_lgb = np.zeros((len(test_data), 2))
Example #15
            'temp_min3', 'brzina_vjetra3', 'tlak_zraka3', 'oblaci_pokrice3', 'oborine_mogucnost3', 'temp_prosjek7', 'temp_max7', 'temp_min7', 'brzina_vjetra7', 'tlak_zraka7', 'oblaci_pokrice7', 'oborine_mogucnost7', 'index_vrucine1', 'index_vrucine3', 'index_vrucine7',]
kategoricke = ['nacin_rezervacije', 'status_rezervacije', 'vrsta_sobe', 'kanal', 'prognoza3', 'prognoza1', 'prognoza7', 'sif_usluge', 'tip_ro', 'tip_garancije', 'lead_time_dani']

print(len(numericke))
print(len(kategoricke))
# Simple imputer for nulls and missing values
num_pipeline = Pipeline([('impute', SimpleImputer(strategy='mean'))])
kat_pipeline = Pipeline([('impute', SimpleImputer(strategy='most_frequent'))])

final_pipeline = ColumnTransformer([('continuous', num_pipeline, numericke), ('cat', kat_pipeline, kategoricke)], remainder='passthrough')
X_imputed = final_pipeline.fit_transform(X,y)
print(type(X_imputed))

# Target encoding of the categorical variables
te = TargetEncoder()
X_kodirano = te.fit_transform(X_imputed, y)

skalar = MinMaxScaler()
X_fit = skalar.fit_transform(X_kodirano,y)

rfc = XGBClassifier()
rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(5), scoring='f1', min_features_to_select=1, verbose=1)
rfecv.fit(X_fit, y)

print('Optimal number of features: {}'.format(rfecv.n_features_))
print(np.where(rfecv.support_ == False)[0])
X.drop(X.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)


plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with cross-validation', fontsize=18, fontweight='bold', pad=20)
Example #16
class Encoder():
    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }
    # target_encoder / mean_encoder must not be fitted on the train and
    # validation sets concatenated together; label_encoder / onehot_encoder may be.

    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        # Stored under a different name: save_encoder is also a method below.
        self.should_save_encoder = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)

    def fit(self,
            x_train,
            x_val=None,
            y_train=None,
            y_val=None,
            method_mapper=None):
        """
        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame

        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        for feat in method_mapper:
            if method_mapper[feat] == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method_mapper[feat] == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method_mapper[feat] == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method_mapper[feat] == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                raise ValueError(
                    'Only [OrdinalEncoder, OneHotEncoder, CountEncoder, TargetEncoder] are supported; got %s'
                    % feat)

        if self.spark is None:
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                # DataFrame.append was removed in pandas 2.0; use pd.concat.
                x_whole = pd.concat([x_train, x_val])
                y_whole = None
                if y_train is not None and y_val is not None:
                    y_whole = pd.concat([y_train, y_val])

                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:len(x_train)]
                x_val = x_whole[len(x_train):]

            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

            if self.should_save_encoder:
                self.save_encoder()
        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self,
                      x_train,
                      x_val=None,
                      y_train=None,
                      y_val=None,
                      method_mapper=None):
        """
        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame
        
        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))

        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)

        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        # The original body was a copy of save_encoder (it dumped instead of loading).
        logdir = logdir or self.logdir
        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)

        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json'), 'r') as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json'), 'r') as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json'), 'r') as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json'), 'r') as f:
            self.target_encoder_features = json.load(f)
Example #17
FrequencyEncoding(X_train, X_test, categorical_wo_version + ['SmartScreen'])

utils.reduce_mem_usage(X_train)
utils.reduce_mem_usage(X_test)

# Target-encode only SmartScreen against HasDetections.
te = TargetEncoder(cols=['SmartScreen'],
                   drop_invariant=False,
                   handle_unknown='impute',
                   impute_missing=True,
                   min_samples_leaf=100,
                   return_df=True,
                   smoothing=1.0,
                   verbose=1)

X_train = te.fit_transform(X_train, y_train)
X_test = te.transform(X_test)

# Target-encode against the frequency of AVProductStatesIdentifier.
pseudoTarget = 'AVProductStatesIdentifier_freq'
alpha = 0.5
min_samples_leaf = 100
smooth_coeff = 1.0
impute = True

for col in tqdm(categorical_wo_version):

    global_mean = (1 - alpha) * X_train[pseudoTarget].astype(
        float).mean() + alpha * X_test[pseudoTarget].astype(float).mean()
    summary = (1 - alpha) * X_train[[col, pseudoTarget]].groupby(
        [col])[pseudoTarget].agg(['mean', 'count']) + alpha * X_test[[
Example #18
# Pairwise combinations of the anonymous features
for i in range(15):
    for j in range(i + 1, 15):
        df[f'v_{i}_add_v_{j}'] = df[f'v_{i}'] + df[f'v_{j}']
        df[f'v_{i}_minus_v_{j}'] = df[f'v_{i}'] - df[f'v_{j}']
        df[f'v_{i}_multiply_v_{j}'] = df[f'v_{i}'] * df[f'v_{j}']
        df[f'v_{i}_div_v_{j}'] = df[f'v_{i}'] / df[f'v_{j}']

df_train = df[df['is_train'] == 1]
df_test = df[df['is_train'] == 0]
y_train = df_train['price']

enc = TargetEncoder(cols=['regionCode', 'city', 'model', 'brand'])

# Encode the high-cardinality categorical features
df_train = enc.fit_transform(df_train, y_train)
df_test = enc.transform(df_test)

# Drop columns that are not useful for encoding
delete_features = [
    'SaleID', 'name', 'regDate', 'offerType', 'seller', 'bodyType', 'fuelType',
    'gearbox', 'notRepairedDamage', 'creatDate', 'is_train'
]
for feature in delete_features:
    del df_train[feature]
    del df_test[feature]
del df_test['price']

for column in df_train.columns:
    print(column)
Example #19
def PreprocessData(df):
    # Remove all NaN rows
    df_preprocessed = df.dropna()

    # target encode nationality
    encoder = TargetEncoder()
    df_preprocessed['Nationality Encoded'] = encoder.fit_transform(
        df_preprocessed['Nationality'], df_preprocessed['Overall'])
    df_preprocessed.pop('Nationality')

    # target encode club (remove NaN first)
    encoder = TargetEncoder()
    df_preprocessed['Club Encoded'] = encoder.fit_transform(
        df_preprocessed['Club'], df_preprocessed['Overall'])
    df_preprocessed.pop('Club')

    # target encode preferred foot (remove NaN first)
    encoder = TargetEncoder()
    df_preprocessed['Preferred Foot Encoded'] = encoder.fit_transform(
        df_preprocessed['Preferred Foot'], df_preprocessed['Overall'])
    df_preprocessed.pop('Preferred Foot')

    # encode Workrate
    encoder = TargetEncoder()
    df_preprocessed['Work Rate Encoded'] = encoder.fit_transform(
        df_preprocessed['Work Rate'], df_preprocessed['Overall'])
    df_preprocessed.pop('Work Rate')

    # encode BodyType
    encoder = TargetEncoder()
    df_preprocessed['Body Type Encoded'] = encoder.fit_transform(
        df_preprocessed['Body Type'], df_preprocessed['Overall'])
    df_preprocessed.pop('Body Type')

    # encode position
    encoder = TargetEncoder()
    df_preprocessed['Position Encoded'] = encoder.fit_transform(
        df_preprocessed['Position'], df_preprocessed['Overall'])
    df_preprocessed.pop('Position')

    # Value
    value = df_preprocessed.Value
    newvalue = []
    for ITEM in value:
        temp = ITEM.split("€")[1]
        if ITEM[-1] == 'K':
            newvalue.append(float(temp.split("K")[0]))
        elif ITEM[-1] == 'M':
            newvalue.append(float(temp.split("M")[0]))
        else:
            newvalue.append(float(temp))
    df_preprocessed['Value'] = newvalue
    newvalue = []

    # Wage
    wage = df_preprocessed.Wage
    newwage = []
    for ITEM in wage:
        temp = ITEM.split("€")[1]
        newwage.append(float(temp.split("K")[0]))
    df_preprocessed['Wage'] = newwage
    newwage = []

    # Weight
    weight = df_preprocessed.Weight
    newweight = []
    for ITEM in weight:
        newweight.append(float(ITEM.split("lbs")[0]))
    df_preprocessed['Weight'] = newweight
    newweight = []

    # Height
    newheight = []
    height = df_preprocessed.Height
    for ITEM in height:
        feet, inch = float(ITEM.split("'")[0]), float(ITEM.split("'")[1])
        newheight.append(feet * 30.48 + inch * 2.54)
    df_preprocessed['Height'] = newheight
    newheight = []

    # All the positions; not sure yet what we want to do with these
    positions = [
        'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM',
        'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
        'LB', 'LCB', 'CB', 'RCB', 'RB'
    ]

    for position in positions:
        data = []
        new_data = []
        data = df_preprocessed[position]
        for ITEM in data:
            new_data.append(
                float(ITEM.split("+")[0]) + float(ITEM.split("+")[1]))
        df_preprocessed[position] = new_data
        new_data = []

    return df_preprocessed
Example #20
def target_encoding(x_train_cat, x_val_cat, y_train):
    target_encoder = TargetEncoder()
    scaler = StandardScaler()

    x_train_reference = x_train_cat
    x_train_target = target_encoder.fit_transform(x_train_cat, y_train)
    x_train_target = pd.DataFrame(scaler.fit_transform(x_train_target),
                                  columns=x_train_target.columns,
                                  index=x_train_target.index)

    x_train_reference = x_train_reference.join(
        x_train_target.add_suffix("_targetenc"))

    # Create reference tables with the encodings
    energietyp = x_train_reference[['energietyp', 'energietyp_targetenc']]
    energietyp = energietyp.drop_duplicates(subset=['energietyp'])
    energietyp.to_sql(name='Encoding_energietyp',
                      con=main.setup_database(r"Datenbank/ImmoDB.db"),
                      if_exists='replace')

    energie_effizienzklasse = x_train_reference[[
        'energie_effizienzklasse', 'energie_effizienzklasse_targetenc'
    ]]
    energie_effizienzklasse = energie_effizienzklasse.drop_duplicates(
        subset=['energie_effizienzklasse'])
    energie_effizienzklasse.to_sql(
        name='Encoding_energie_effizienzklasse',
        con=main.setup_database(r"Datenbank/ImmoDB.db"),
        if_exists='replace')

    heizung = x_train_reference[['heizung', 'heizung_targetenc']]
    heizung = heizung.drop_duplicates(subset=['heizung'])
    heizung.to_sql(name='Encoding_heizung',
                   con=main.setup_database(r"Datenbank/ImmoDB.db"),
                   if_exists='replace')

    immobilienart = x_train_reference[[
        'immobilienart', 'immobilienart_targetenc'
    ]]
    immobilienart = immobilienart.drop_duplicates(subset=['immobilienart'])
    immobilienart.to_sql(name='Encoding_immobilienart',
                         con=main.setup_database(r"Datenbank/ImmoDB.db"),
                         if_exists='replace')

    immobilienzustand = x_train_reference[[
        'immobilienzustand', 'immobilienzustand_targetenc'
    ]]
    immobilienzustand = immobilienzustand.drop_duplicates(
        subset=['immobilienzustand'])
    immobilienzustand.to_sql(name='Encoding_immobilienzustand',
                             con=main.setup_database(r"Datenbank/ImmoDB.db"),
                             if_exists='replace')

    Grad_der_Verstädterung = x_train_reference[[
        'Grad_der_Verstädterung', 'Grad_der_Verstädterung_targetenc'
    ]]
    Grad_der_Verstädterung = Grad_der_Verstädterung.drop_duplicates(
        subset=['Grad_der_Verstädterung'])
    Grad_der_Verstädterung.to_sql(
        name='Encoding_Grad_der_Verstädterung',
        con=main.setup_database(r"Datenbank/ImmoDB.db"),
        if_exists='replace')

    sozioökonmische_Lage = x_train_reference[[
        'sozioökonomische_Lage', 'sozioökonomische_Lage_targetenc'
    ]]
    sozioökonmische_Lage = sozioökonmische_Lage.drop_duplicates(
        subset=['sozioökonomische_Lage'])
    sozioökonmische_Lage.to_sql(
        name='Encoding_sozioökonmische_Lage',
        con=main.setup_database(r"Datenbank/ImmoDB.db"),
        if_exists='replace')

    x_val_target = target_encoder.transform(x_val_cat)
    # Reuse the scaler fitted on the training encodings; refitting on
    # validation data would make train and validation scales inconsistent.
    x_val_target = pd.DataFrame(scaler.transform(x_val_target),
                                columns=x_val_target.columns,
                                index=x_val_target.index)
    return x_train_target, x_val_target
Example #21
        X_val = X_train.iloc[val_idx].copy()
        y_val = y_train[val_idx]
        X_tst = X_test.copy()
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

        # with timer('weight of evidence'):
        #     cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
        #     woe = WeightOfEvidence(cols=cat_cols, suffix='woe')
        #     X_trn = pd.concat([X_trn, woe.fit_transform(X_trn.loc[:, cat_cols], y_trn)], axis=1)
        #     X_val = pd.concat([X_val, woe.transform(X_val.loc[:, cat_cols])], axis=1)
        #     X_tst = pd.concat([X_tst, woe.transform(X_tst.loc[:, cat_cols])], axis=1)

        with timer('target encoding'):
            cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
            te = TargetEncoder(cols=cat_cols)
            X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols],
                                                      y_trn)
            X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
            X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])

        # with timer('calc sample weight'):
        #     X_trn['is_test'] = 0
        #     X_tst['is_test'] = 1
        #     df = pd.concat([X_trn, X_tst])
        #     X = df.drop('is_test', axis=1)
        #     y = df.is_test.ravel()
        #     model = lgb.LGBMClassifier(**calc_weight_params)
        #     model.fit(X, y)
        #     proba = np.sqrt(rankdata(model.predict_proba(X)[:len(X_trn), 1])/len(X_trn))
        #     X_trn.drop('is_test', axis=1)
        #     X_tst.drop('is_test', axis=1)
def run(name, feats, params, fit_params, fill=-9999):
    logger = getLogger(name)
    logger.setLevel(DEBUG)

    handler = StreamHandler()
    handler.setLevel(DEBUG)
    logger.addHandler(handler)

    train = pd.read_feather(str(TRAIN))

    with timer('load datasets'):
        X_train, y_train, X_test, cv = load_dataset(feats)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('clean datasets'):
        # drop id cols
        id_cols = X_train.filter(regex='(SK_ID_CURR|SK_ID_PREV)').columns
        print('drop id:', id_cols.tolist())
        X_train.drop(id_cols, axis=1, inplace=True)
        X_test.drop(id_cols, axis=1, inplace=True)

        # drop columns which contains many NaN
        ref_train = X_train.isnull().mean() > 0.95
        ref_test = X_test.isnull().mean() > 0.95
        nan_cols = X_train.columns[ref_train | ref_test]
        print('drop many nan:', nan_cols.tolist())
        X_train.drop(nan_cols, axis=1, inplace=True)
        X_test.drop(nan_cols, axis=1, inplace=True)

        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('impute missing'):
        if fill == 'mean':
            assert X_train.mean().isnull().sum() == 0
            print('fill nan with mean')
            X_train.fillna(X_train.mean(), inplace=True)
            X_test.fillna(X_train.mean(), inplace=True)
        else:
            print(f'fill nan with {fill}')
            X_train.fillna(fill, inplace=True)
            X_test.fillna(fill, inplace=True)
        assert X_train.isnull().sum().sum() == 0
        assert X_test.isnull().sum().sum() == 0

    if 'colsample_bytree' in params and params['colsample_bytree'] == 'auto':
        n_features = X_train.shape[1]
        params['colsample_bytree'] = np.sqrt(n_features) / n_features
        print(f'set colsample_bytree = {params["colsample_bytree"]}')

    with timer('training'):
        cv_results = []
        cv_df = pd.DataFrame(index=range(len(y_train)),
                             columns=range(cv.get_n_splits()))
        test_df = pd.DataFrame()
        feat_df = None
        for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            X_trn = X_train.iloc[trn_idx].copy()
            y_trn = y_train[trn_idx]
            X_val = X_train.iloc[val_idx].copy()
            y_val = y_train[val_idx]
            X_tst = X_test.copy()
            print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

            with timer('target encoding'):
                cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
                te = TargetEncoder(cols=cat_cols)
                X_trn.loc[:,
                          cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols],
                                                       y_trn)
                X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
                X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])

            with timer('fit'):
                model = lgb.LGBMClassifier(**params)
                model.fit(X_trn,
                          y_trn,
                          eval_set=[(X_val, y_val)],
                          **fit_params)

            p = model.predict_proba(X_val)[:, 1]
            cv_df.loc[val_idx, i] = p
            cv_results.append(roc_auc_score(y_val, p))
            test_df[i] = model.predict_proba(X_tst)[:, 1]
            if feat_df is None:
                feat_df = pd.DataFrame(index=X_trn.columns)
            feat_df[i] = model.feature_importances_

    valid_score = np.mean(cv_results)
    message = f"""cv: {valid_score:.5f}
scores: {[round(c, 4) for c in cv_results]}
feats: {feats}
model_params: {params}
fit_params: {fit_params}"""

    send_line_notification(message)

    with timer('output results'):
        RESULT_DIR = OUTPUT / (timestamp() + '_' + name)
        RESULT_DIR.mkdir()

        # output cv prediction
        tmp = pd.DataFrame({
            'SK_ID_CURR': train['SK_ID_CURR'],
            'TARGET': cv_df.mean(axis=1)
        })
        tmp.to_csv(RESULT_DIR / f'{name}_cv.csv', index=False)

        # output test prediction
        pred = test_df.mean(axis=1).ravel()
        generate_submit(pred,
                        f'{name}_{valid_score:.5f}',
                        RESULT_DIR,
                        compression=False)

        # output feature importances
        feat_df = (feat_df / feat_df.mean(axis=0)) * 100
        feat_df.mean(axis=1).sort_values(ascending=False).to_csv(RESULT_DIR /
                                                                 'feats.csv')
        imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50]
        imp[::-1].plot.barh(figsize=(20, 15))
        plt.savefig(str(RESULT_DIR / 'feature_importances.pdf'),
                    bbox_inches='tight')

        print('=' * 60)
        print(message)
        print('=' * 60)