def target_encode():
    """Target-encode the string columns of the train/test CSVs and save
    the encoded columns (prefixed 'TE_') to new CSV files."""
    from category_encoders.target_encoder import TargetEncoder

    # The target column only exists in the training split.
    train_df = pd.read_csv('./data/tr.csv')
    test_df = pd.read_csv('./data/te.csv')
    target = train_df['TARGET'].astype(int)
    train_df.drop(['TARGET'], axis=1, inplace=True)

    encoder = TargetEncoder(verbose=1, min_samples_leaf=100)

    # Only object-dtype (string) columns are kept after encoding.
    categorical_cols = [c for c in train_df.columns
                        if train_df[c].dtype == 'object']

    encoder.fit(train_df, target)
    train_df = encoder.transform(train_df)
    test_df = encoder.transform(test_df)

    train_df = train_df[categorical_cols]
    test_df = test_df[categorical_cols]
    train_df.columns = ['TE_' + c for c in categorical_cols]
    test_df.columns = ['TE_' + c for c in categorical_cols]
    print(train_df.info())
    print(test_df.info())
    train_df.to_csv("./data/target_tr.csv", index=False)
    test_df.to_csv("./data/target_te.csv", index=False)
Example #2
0
    def fit(self, X_df, y=None):
        """Fit the per-column encoders on the training frame."""
        def collapse(value, keep):
            # Map categories outside the keep-list to a single bucket.
            return value if value in keep else 'other'

        self.prop_to_keep = [
            'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
        ]
        self.prop_transformer = TargetEncoder()
        grouped_prop = X_df['property_type'].apply(
            lambda v: collapse(v, self.prop_to_keep))
        self.prop_transformer.fit(grouped_prop, y)

        self.pol_to_keep = [
            'flexible', 'strict_14_with_grace_period', 'moderate',
            'moderate_new'
        ]
        self.pol_transformer = TargetEncoder()
        grouped_pol = X_df['cancellation_policy'].apply(
            lambda v: collapse(v, self.pol_to_keep))
        self.pol_transformer.fit(grouped_pol, y)

        self.room_transformer = OrdinalEncoder()
        self.room_transformer.fit(X_df['room_type'])

        self.city_transformer = OneHotEncoder(handle_unknown='ignore')
        self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))

        # numeric_transformer = Pipeline(steps = [('impute', SimpleImputer(strategy='median'))])

        return self
def target_encoder(params):
    """Target-encode a single column given (train, test, target) arrays.

    Returns the flattened encoded train and test arrays.
    """
    # params: (train values, test values, target values)
    train_vals = params[0].astype('str')
    test_vals = params[1].astype('str')
    target = params[2]
    encoder = TargetEncoder(return_df=False)
    encoded_train = encoder.fit_transform(train_vals.reshape(-1, 1),
                                          target.reshape(-1, 1))
    encoded_test = encoder.transform(test_vals.reshape(-1, 1))
    return encoded_train.flatten(), encoded_test.flatten()
Example #4
0
def fit_target_encoder(train_imputed_categorical_df: pd.DataFrame,
                       train_transformed_target: pd.DataFrame):
    """Fit a TargetEncoder over every column of the categorical frame
    against the transformed target, and return the fitted encoder."""
    encoder = TargetEncoder(cols=train_imputed_categorical_df.columns.values)
    encoder.fit(X=train_imputed_categorical_df, y=train_transformed_target)
    return encoder
Example #5
0
def label_encoding_fit(X, y, cols):
    '''Fit one TargetEncoder per requested column on (X[col], y) and
    persist each fitted encoder via write_encoder.'''
    for col in cols:
        print("Encoding for column: {}".format(col))
        enc = TargetEncoder(cols=[col])
        enc.fit(X[col], y)
        write_encoder(enc, 'label', col)
    return
Example #6
0
    def feature_importance(url, dataloaded, rows):
        """Compute random-forest feature importances for the cached dataset
        named by the id in ``url`` and return a bar-plot component.

        :param url: URL containing the dataset id as "data/<id>"
        :param dataloaded: sentinel; None means no dataset has been loaded
        :param rows: metadata table rows (one mapping per attribute)
        :return: (component or message, status string)
        """
        # If dataset is not loaded
        if dataloaded is None:
            return [], "No file"

        # Get dataset if pickle exists
        data_id = int(re.search(r"data/(\d+)", url).group(1))
        try:
            df = pd.read_pickle("cache/df" + str(data_id) + ".pkl")
        except OSError:
            return [], "No file"

        # Get table of metadata
        meta_data = pd.DataFrame(rows)
        try:
            # The row flagged Target == "true" names the target column and
            # its declared data type.
            target_attribute = meta_data[meta_data["Target"] == "true"][
                "Attribute"
            ].values[0]
            target_type = meta_data[meta_data["Target"] == "true"]["DataType"].values[0]
        except IndexError:
            return "No target found", "No target found"

        # Feature importance bar plot
        from category_encoders.target_encoder import TargetEncoder

        x = df.drop(target_attribute, axis=1)
        y = df[target_attribute]

        te = TargetEncoder()
        if target_type == "nominal" or target_type == "string":
            # Categorical target: integer-code labels, target-encode the
            # features, and fit a small forest classifier.
            y = pd.Categorical(y).codes
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)
        else:
            # Numeric target: same pipeline with a forest regressor.
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)

        # Sort importances descending for the horizontal bar chart.
        fi = pd.DataFrame(
            rf.feature_importances_, index=x.columns, columns=["importance"]
        )
        fi = fi.sort_values("importance", ascending=False).reset_index()
        trace = go.Bar(y=fi["index"], x=fi["importance"], name="fi", orientation="h")
        layout = go.Layout(
            autosize=False, margin={"l": 100, "t": 0}, height=500, hovermode="closest"
        )
        figure = go.Figure(data=[trace], layout=layout)

        # Cache the importances for reuse elsewhere.
        fi.to_pickle("cache/fi" + str(data_id) + ".pkl")

        return html.Div(dcc.Graph(figure=figure), className="twelve columns"), "done"
def target_encoder(cols, train_set, train_y, test_set):
    """Fit a TargetEncoder on the training set and encode train + test.

    handle_unknown / handle_missing accept only 'error', 'return_nan'
    and 'value'; both default to 'value', which fills unknown or missing
    categories with the training-set target mean.
    """
    encoder = TargetEncoder(cols=cols,
                            handle_unknown='value',
                            handle_missing='value')
    encoder.fit(train_set, train_y)
    encoded_train = encoder.transform(train_set)
    encoded_test = encoder.transform(test_set)
    return encoded_train, encoded_test
Example #8
0
def encode_features(features, labels):
    """Encode categorical features with TargetEncoder"""
    cols = features.columns.values.tolist()

    start_time = time.time()
    encoder = TargetEncoder(cols=cols, return_df=True)
    encoder.fit(features, labels)
    encoded = encoder.transform(features)
    # Report how long fitting + transforming took.
    print("--- %s seconds ---" % (time.time() - start_time))
    return encoded
Example #9
0
class target_enc(BaseEstimator, TransformerMixin):
    """Sklearn-style transformer that target-encodes the given columns."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        # 'value' maps unseen categories to the training target mean.
        self.encoder = TargetEncoder(handle_unknown='value',
                                     cols=self.columns).fit(df, y)
        return self

    def transform(self, df, y=None):
        # Work on a copy so the caller's frame is never mutated.
        return self.encoder.transform(df.copy(), y)
Example #10
0
    def feature_importance(url, tab3, rows):
        """Compute random-forest feature importances for the cached dataset
        named by the id in ``url`` and return a bar-plot component.

        :param url: URL containing the dataset id as "data/<id>"
        :param tab3: unused here beyond triggering the callback
        :param rows: metadata table rows (one mapping per attribute)
        :return: (component or message, status string)
        """
        data_id = int(re.search('data/(\d+)', url).group(1))
        try:
            df = pd.read_pickle('cache/df' + str(data_id) + '.pkl')
        except OSError:
            return [], "No file"
        meta_data = pd.DataFrame(rows)
        try:
            # The row flagged Target == "true" names the target column and
            # its declared data type.
            target_attribute = meta_data[meta_data["Target"] ==
                                         "true"]["Attribute"].values[0]
            target_type = (
                meta_data[meta_data["Target"] == "true"]["DataType"].values[0])
        except IndexError:
            return "No target found", "No target found"

        # Feature importance bar plot

        from category_encoders.target_encoder import TargetEncoder
        x = df.drop(target_attribute, axis=1)
        y = df[target_attribute]

        te = TargetEncoder()
        if target_type == "nominal" or target_type == "string":
            # Categorical target: integer-code labels, target-encode the
            # features, and fit a small forest classifier.
            y = pd.Categorical(y).codes
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)
        else:
            # Numeric target: same pipeline with a forest regressor.
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)
        # Sort importances descending for the horizontal bar chart.
        fi = pd.DataFrame(rf.feature_importances_,
                          index=x.columns,
                          columns=['importance'])
        fi = fi.sort_values('importance', ascending=False).reset_index()
        trace = go.Bar(y=fi['index'],
                       x=fi['importance'],
                       name='fi',
                       orientation='h')
        layout = go.Layout(autosize=False,
                           margin=dict(l=100),
                           width=800,
                           height=500,
                           hovermode='closest')
        figure = go.Figure(data=[trace], layout=layout)

        # Cache the importances for reuse elsewhere.
        fi.to_pickle('cache/fi' + str(data_id) + '.pkl')

        return html.Div(dcc.Graph(figure=figure)), "done"
def target_encode(data, label, encoder=None):
    """Fit a fresh TargetEncoder on (data, label), or apply an existing one.

    :param data: feature frame to encode
    :param label: target values used when fitting / transforming
    :param encoder: if supplied the encoder will be used to predict onto data
    :return: (encoder, encoded data)
    """
    if encoder is not None:
        return encoder, encoder.transform(data, label)
    encoder = TargetEncoder()
    return encoder, encoder.fit_transform(data, label)
Example #12
0
 def create_features(self):
     """Target-encode dating_period against log1p(likes) over the
     concatenated train+test frame, then split back into train/test."""
     combined = pd.concat([train, test]).reset_index(drop=True)
     encoded = TargetEncoder(smoothing=0.1).fit_transform(
         combined['dating_period'].astype(object),
         np.log1p(combined['likes']))
     encoded = encoded.rename(
         columns={'dating_period': 'targetencoding_dating_period'})
     n_train = len(train)
     self.train = encoded[:n_train]
     self.test = encoded[n_train:].reset_index(drop=True)
Example #13
0
    def target_encoder(cols, train_set, train_y, test_set):
        """Target-encode high-cardinality unordered categorical columns.

        Intended for features with no intrinsic order and more than 4
        categories: each category is replaced by the target mean within
        it. handle_unknown / handle_missing accept only 'error',
        'return_nan' and 'value'; both default to 'value', which fills
        unknown or missing categories with the training target mean.
        """
        encoder = TargetEncoder(cols=cols,
                                handle_unknown='value',
                                handle_missing='value')
        encoder.fit(train_set, train_y)
        encoded_train = encoder.transform(train_set)
        encoded_test = encoder.transform(test_set)
        return encoded_train, encoded_test
Example #14
0
def onehot_or_targ(X, y, categorical, k):
    '''Encode categorical variables, choosing the scheme per column.

    Columns with more than ``k`` unique values are target-encoded (after
    converting pandas 'category' dtypes to integer codes); the rest are
    one-hot encoded. A categorical target is converted to codes as well.

    Parameters:
    -----------
    X: pd.DataFrame
        Feature frame of the dataset, excluding its target column.
    y: pd.Series
        Target of the dataset.
    categorical: list
        Names of the categorical columns.
    k: int
        Threshold on the unique-value count that selects target
        encoding over one-hot encoding.

    Returns:
    --------
    pd.DataFrame, pd.Series
        The encoded features and the (possibly code-converted) target.
    '''
    for col in categorical:
        if len(X[col].unique()) > k:
            # High cardinality: target-encode, working on integer codes
            # when the column/target carry pandas 'category' dtype.
            if X[col].dtype.name == 'category':
                X[col] = X[col].cat.codes
            if y.dtype.name == 'category':
                y = y.cat.codes
            X = TargetEncoder(cols=[col]).fit_transform(X, y)
        else:
            # Low cardinality: one-hot encode the column.
            X = OneHotEncoder(cols=[col]).fit_transform(X)
    return X, y
Example #15
0
 def create_features(self):
     """Mean target-encoding of technique names, aggregated per object_id
     and joined back onto the train/test object lists."""
     merged = technique_df.merge(train[['object_id', 'likes']],
                                 on='object_id', how='left')
     encoded_name = TargetEncoder(smoothing=0.1).fit_transform(
         merged['name'], np.log1p(merged['likes']))
     group = (pd.concat([merged['object_id'], encoded_name], axis=1)
              .groupby('object_id').mean()
              .rename(columns={'name': 'targetencoding_technique'}))
     self.train = (train[['object_id']]
                   .merge(group, on='object_id', how='left')
                   .drop(columns='object_id', axis=1))
     self.test = (test[['object_id']]
                  .merge(group, on='object_id', how='left')
                  .drop(columns='object_id', axis=1))
Example #16
0
def _encode():
    """Target-encode selected application columns against TARGET and save
    the encoded frame (plus SK_ID_CURR) to a feather file."""
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    test = pd.read_feather('./data/application_test.preprocessed.feather')
    df = pd.concat([train, test], sort=False).reset_index(drop=True)
    cols = [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',  # Level of highest education the client achieved,  # noqa
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'OCCUPATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START',
        'REG_REGION_NOT_LIVE_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'LIVE_CITY_NOT_WORK_CITY',
        'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
        'NAME_CONTRACT_TYPE',  # Identification if loan is cash or revolving,
    ]
    # NOTE(review): the encoder is fitted on the concatenated frame, so
    # test rows (whose TARGET is presumably NaN) participate in the fit —
    # confirm this is intended.
    encoder = TargetEncoder(cols=cols).fit(df[cols], df['TARGET'])
    encoded = encoder.transform(df[cols])
    encoded.columns = ['{}_ENC'.format(c) for c in encoded.columns]
    encoded['SK_ID_CURR'] = df['SK_ID_CURR']
    encoded.to_feather('./data/app.enc.feather')
Example #17
0
    def target_encoder(self, df, configger):
        """Target-encode the configured columns of the train dataset.

        :param df: the train dataset.
        :param configger: JSON config string; recognised settings:
            verbose: int — output verbosity, 0 for none.
            cols: list — columns to encode; if None, all string columns.
            drop_invariant: bool — drop columns with 0 variance.
            return_df: bool — return a DataFrame from transform
                (otherwise a numpy array).
            handle_missing: str — 'error', 'return_nan' or 'value'
                (default 'value', which returns the target mean).
            handle_unknown: str — 'error', 'return_nan' or 'value'
                (default 'value', which returns the target mean).
            min_samples_leaf: int — minimum samples to take a category
                average into account.
            smoothing: float — balance of categorical average vs prior;
                higher means stronger regularization; must be > 0.
        :return: the transform result
        """
        X, y, encode_col = self.get_Xy(df, configger)

        # Pull each setting from the config, falling back to defaults.
        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        handle_missing = set_default_vale("handle_missing", configger, "value")
        handle_unknown = set_default_vale("handle_unknown", configger, "value")
        min_samples_leaf = set_default_vale("min_samples_leaf", configger, 1)
        smoothing = set_default_vale("smoothing", configger, 1.0)

        encoder = TargetEncoder(
            verbose=1,
            cols=encode_col,
            drop_invariant=drop_invariant,
            return_df=True,
            handle_missing=handle_missing,
            handle_unknown=handle_unknown,
            min_samples_leaf=min_samples_leaf,
            smoothing=smoothing,
        )
        return encoder.fit_transform(X, y)
Example #18
0
 def _feature_encode(self, data):
     """Encode categorical features of ``data`` in place.

     Rare levels (< 1% frequency) in low-cardinality columns are merged
     into 'Rare'; columns with more than 10 unique values are
     target-encoded; the remaining categorical columns become dummy
     variables. Finally data.target_df is (re)set from the train frame.
     """
     dummy_cols = []
     for col in data.cat_features:
         # Merge categorical levels with very low frequencies.
         # BUG FIX: `data.train_df[col].replace(name, 'Rare', inplace=True)`
         # mutates a column *selection* (chained assignment) and is not
         # guaranteed to write back to the frame under pandas
         # copy-on-write; assign the replaced Series back instead.
         if data.train_df[col].nunique() / len(data.train_df[col]) < 0.1:
             for name, count in data.train_df[col].value_counts().items():
                 if count / len(data.train_df[col]) < 0.01:
                     data.train_df[col] = data.train_df[col].replace(name, 'Rare')
         if data.test_df[col].nunique() / len(data.test_df[col]) < 0.1:
             for name, count in data.test_df[col].value_counts().items():
                 if count / len(data.test_df[col]) < 0.01:
                     data.test_df[col] = data.test_df[col].replace(name, 'Rare')
         # Target-encode categorical features with many unique values.
         if data.train_df[col].nunique() > 10:
             from category_encoders.target_encoder import TargetEncoder
             encoder = TargetEncoder(cols=col)
             encoder.fit(data.train_df[col], data.train_df[data.target_var])
             data.train_df[col] = encoder.transform(data.train_df[col])
             data.test_df[col] = encoder.transform(data.test_df[col])
         else:
             dummy_cols.append(col)
     # Dummy-encode the remaining low-cardinality categorical features.
     data.train_df = pd.get_dummies(data.train_df,
                                    columns=dummy_cols,
                                    drop_first=True)
     data.test_df = pd.get_dummies(data.test_df,
                                   columns=dummy_cols,
                                   drop_first=True)
     data.target_df = data.train_df[data.target_var]
Example #19
0
def target_encode_Stores(df, enc=None):
    """Target encode the Store variable using the category_encoders module

    Args:
        df: Data — assumes 'Sales' and 'Store' columns are present, even
            when transforming with an existing encoder (TODO confirm)
        enc: Existing Encoder / if None retrain new encoder
    """
    target = df['Sales'].values
    stores = df['Store'].astype(str)

    if enc:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)
    else:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)

    # Write the encoded values back onto the frame in place.
    df.loc[:, 'Store'] = new_store

    return new_store, enc
 def _get_single_encoder(encoder_name: str, cols, smoothing):
     """
     Get encoder by its name
     :param encoder_name: Name of desired encoder
     :param cols: Columns for encoding
     :param smoothing: Smoothing strength forwarded to the encoder
     :return: Categorical encoder
     """
     # Only TargetEncoder is supported here; anything else is an error.
     if encoder_name != "TargetEncoder":
         raise ValueError('NO ENCODER FOUND')
     return TargetEncoder(cols=cols, smoothing=smoothing)
Example #21
0
def target_encode_custom(df: pd.DataFrame, name: str, enc=None):
    """Target encode the Store variable using the category_encoders module

    Args:
        df: Data — assumes 'Sales' and the named column are present, even
            when transforming with an existing encoder (TODO confirm)
        name (str): name of the column to encode
        enc: Existing Encoder / if None retrain new encoder
    """
    target = df['Sales'].values
    column = df[name].astype(str)

    if enc:
        print("Transform using existing TargetEncoder...")
        encoded = enc.transform(column, target)
    else:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        encoded = enc.fit_transform(column, target)

    # Write the encoded values back onto the frame in place.
    df.loc[:, name] = encoded

    return encoded, enc
Example #22
0
    def fit(self, x=None, y=None):
        """Cluster each column with KMeans, then target-encode the labels.

        For every column of ``x`` a KMeans model with
        ``self.n_clust[col]`` clusters is fitted, and a TargetEncoder is
        fitted on the resulting cluster labels against ``y``.

        :param x: feature DataFrame (one model per column).
        :param y: target values for the target encoders.
        :return: self
        """
        self.params = x.columns
        self.kmn_mod = {}
        self.trg_mod = {}
        for col in x.columns:
            tmp = pd.DataFrame([])
            self.kmn_mod[col] = KMeans(n_clusters=self.n_clust[col])
            self.kmn_mod[col].fit(np.reshape(x[col].values, (-1, 1)))

            tmp[col] = self.kmn_mod[col].predict(
                np.reshape(x[col].values, (-1, 1)))
            self.trg_mod[col] = TargetEncoder()
            # BUG FIX: encode against the `y` passed to fit() — the
            # original used the module-level global `train_y` and left
            # the `y` parameter unused, breaking the sklearn fit(X, y)
            # contract and silently depending on outer state.
            self.trg_mod[col].fit(tmp[col].astype("category"), y)
        return self
class DateTransformer(BaseEstimator, TransformerMixin):
    """Transforms DATE using target encoding of MONTH."""

    def __init__(self):
        self.encoder = None
        self.month = None

    @staticmethod
    def _extract_months(X):
        # Each date cell becomes its month number as a string label.
        return X.apply(lambda col: col.apply(lambda d: str(d.month)))

    def fit(self, X, y=None):
        self.month = self._extract_months(X)
        self.encoder = TargetEncoder().fit(self.month, y)
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(self._extract_months(X))
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name
    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: if encoder_name names no known encoder
    """
    # BUG FIX: initialize `encoder` so an unknown name reaches the
    # NotImplementedError below instead of raising NameError from
    # reading an unbound local at `if encoder is None`. The chain also
    # uses elif (one name matches at most one branch) and drops the
    # duplicated MEstimateEncoder check.
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
Example #25
0
def get_single_encoder(encoder_name: str, cat_cols: list):
    """Return the category encoder matching ``encoder_name``.

    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: if encoder_name names no known encoder
    """
    # BUG FIX: the original left `encoder` unbound for unknown names (the
    # None check was commented out), so `return encoder` raised a
    # confusing NameError. Initialize to None and raise explicitly,
    # matching the sibling get_single_encoder in this file. The chain
    # also uses elif and drops the duplicated MEstimateEncoder check.
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
    def fit(self, input_df):
        """Build per-object target-encoding statistics, then transform.

        Reads `{self.data_name}.csv` and `train.csv` from INPUT_PATH,
        keeps only names occurring at least 10 times, target-encodes
        `name` against log1p(likes), aggregates the encoded values per
        object_id into self.meta, and returns self.transform(input_df).
        """
        df = pd.read_csv(INPUT_PATH + f'{self.data_name}.csv')
        train = pd.read_csv(INPUT_PATH + 'train.csv')
        # Keep only names that appear at least 10 times.
        vc = df['name'].value_counts()
        idx = vc[vc >= 10].index
        df = df[df['name'].isin(idx)]

        df = df.merge(train[['object_id', 'likes']],
                      on='object_id',
                      how='left')

        # Target-encode `name` against log1p(likes), then aggregate the
        # encoded values per object_id with several statistics.
        self.meta = pd.concat([
            df['object_id'],
            TargetEncoder(smoothing=0.1).fit_transform(df['name'],
                                                       np.log1p(df['likes']))
        ],
                              axis=1).groupby('object_id').agg(
                                  ['mean', 'sum', 'max', 'min', 'std'])
        # Drop the outer ('name') level so columns are just the stat names.
        self.meta.columns = self.meta.columns.droplevel(0)

        return self.transform(input_df)
def target_encode(X, X_test, cols, y):
    """Fit a TargetEncoder on (X, y) for the given columns and return the
    encoded train and test frames."""
    encoder = TargetEncoder(cols=cols, return_df=True)
    encoded_train = encoder.fit_transform(X, y)
    encoded_test = encoder.transform(X_test)
    return (encoded_train, encoded_test)
                        ,min_child_samples=200
                        ,colsample_bytree=.2
                        ,reg_alpha=.1
                        ,reg_lambda=.1
                        )
    return lgbr

# Local cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    # Split into a training fold (tf/tt) and a validation fold (df/dt).
    tf = train.iloc[tidx]
    df = train.iloc[didx]
    tt = target.iloc[tidx]
    dt = target.iloc[didx]
    # Fit the target encoder on the training fold only, then apply it to
    # the validation fold — avoids target leakage across folds.
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)
    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)
    # Score the fold with ROC AUC.
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)
print(np.mean(devscore))

# # Refit on the full train set, predict the test set, write out results
# lgbr = makelgb()
# te = TargetEncoder(cols=tecols)
# tf = te.fit_transform(train, target)
# df = te.transform(test)
def lin_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        Drops NaN from training data,
        Replaces NaN in test data with ffill, 
        target-encodes non-numeric fields, 
        scales values,
        80/20 splits data to help verify model, 
        selects features using RFECV, with a lasso mode, cv set to 5,
        uses KNeighborRegressor for 11 nearest neighbours weighted to distance
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal but fillna the mean freezes for some reason
    clean_unlabelled = clean_unlabelled.fillna(method="ffill") 
    # clean_unlabelled = clean_unlabelled.fillna("None")

    # remove some columns
    # clean_labelled = drop_columns(clean_labelled)
    # clean_unlabelled = drop_columns(clean_unlabelled)

    # print("one hot encoding data...")
    # One hot encoding
    # ohe = OneHotEncoder(
    #     categories="auto", 
    #     handle_unknown="ignore",
    #     sparse=False
    # )
    # clean_labelled = encode_training(ohe, clean_labelled)
    # clean_unlabelled = encode_testing(ohe, clean_unlabelled)

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    # Keep Instance separate so it can label predictions at the end.
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("target encoding data...")
    # Target encoding — fitted on the training split only; the test and
    # unknown frames are transformed with the same fitted encoder.
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    # scaling values — scaler is fitted on the training split only.
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # feature selection — RFECV with a lasso estimator chooses the
    # feature subset on the training split; the same mask is applied to
    # test and unknown data.
    lasso = lm.Lasso()
    selector = RFECV(lasso, cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    # fit model
    # lasso = lm.LassoCV(cv=5)
    # lasso.fit(train_data, train_target)
    neigh = KNeighborsRegressor(
        n_neighbors=11,
        weights="distance"
    )
    neigh.fit(train_data, train_target) 

    print("analysing test results...")
    # validate test — report RMSE and explained variance on the held-out
    # 20% split.
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    # predict and format
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
Example #30
0
    # Deal with unknown values
    DATA.replace("?", np.NaN, inplace=True)
    DATA["collision_type"].fillna(DATA["collision_type"].mode()[0],
                                  inplace=True)
    DATA["property_damage"].fillna(False, inplace=True)
    DATA["police_report_available"].fillna(False, inplace=True)

    # Replace strings with True/False values
    DATA = DATA.replace(("YES", "Y", "NO", "N"), (True, True, False, False))

    # Seperate the data into features and labels
    FEATURES, LABELS = DATA.drop(["fraud_reported"],
                                 axis=1), DATA["fraud_reported"]

    # Use target encoding with smoothing for categorical features (strings)
    FEATURES = TargetEncoder().fit(FEATURES,
                                   LABELS).transform(FEATURES, LABELS)

    # Use SMOTE oversampling with ENN undersampling to balance the dataset
    FEATURES, LABELS = SMOTEENN().fit_sample(FEATURES, LABELS.values.ravel())

    # Split the dataset into test and train datasets
    TRAIN_FEATURES, TEST_FEATURES, TRAIN_LABELS, TEST_LABELS = train_test_split(
        FEATURES, LABELS)

    # Create hyperparameter combinations to test using cross validation
    N_ESTIMATORS_PARAMS = [300, 500, 700, 900, 1100]
    CRITERION_PARAMS = ["gini", "entropy"]
    COMBOS = get_combos(N_ESTIMATORS_PARAMS, CRITERION_PARAMS)
    SCORES = []

    # Create a classifier with each combination of hyperparameters and measure its