Example #1
    def feature_importance(url, dataloaded, rows):
        # If dataset is not loaded
        if dataloaded is None:
            return [], "No file"

        # Get dataset if pickle exists
        data_id = int(re.search(r"data/(\d+)", url).group(1))
        try:
            df = pd.read_pickle("cache/df" + str(data_id) + ".pkl")
        except OSError:
            return [], "No file"

        # Get table of metadata
        meta_data = pd.DataFrame(rows)
        try:
            target_attribute = meta_data[meta_data["Target"] == "true"][
                "Attribute"
            ].values[0]
            target_type = meta_data[meta_data["Target"] == "true"]["DataType"].values[0]
        except IndexError:
            return "No target found", "No target found"

        # Feature importance bar plot
        from category_encoders.target_encoder import TargetEncoder

        x = df.drop(target_attribute, axis=1)
        y = df[target_attribute]

        te = TargetEncoder()
        if target_type == "nominal" or target_type == "string":
            y = pd.Categorical(y).codes
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)
        else:
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)

        fi = pd.DataFrame(
            rf.feature_importances_, index=x.columns, columns=["importance"]
        )
        fi = fi.sort_values("importance", ascending=False).reset_index()
        trace = go.Bar(y=fi["index"], x=fi["importance"], name="fi", orientation="h")
        layout = go.Layout(
            autosize=False, margin={"l": 100, "t": 0}, height=500, hovermode="closest"
        )
        figure = go.Figure(data=[trace], layout=layout)

        fi.to_pickle("cache/fi" + str(data_id) + ".pkl")

        return html.Div(dcc.Graph(figure=figure), className="twelve columns"), "done"
Example #2
    def feature_importance(url, tab3, rows):
        data_id = int(re.search(r'data/(\d+)', url).group(1))
        try:
            df = pd.read_pickle('cache/df' + str(data_id) + '.pkl')
        except OSError:
            return [], "No file"
        meta_data = pd.DataFrame(rows)
        try:
            target_attribute = meta_data[meta_data["Target"] ==
                                         "true"]["Attribute"].values[0]
            target_type = (
                meta_data[meta_data["Target"] == "true"]["DataType"].values[0])
        except IndexError:
            return "No target found", "No target found"

        # Feature importance bar plot

        from category_encoders.target_encoder import TargetEncoder
        x = df.drop(target_attribute, axis=1)
        y = df[target_attribute]

        te = TargetEncoder()
        if target_type == "nominal" or target_type == "string":
            y = pd.Categorical(y).codes
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)
        else:
            x = clean_dataset(x)
            x = te.fit_transform(x, y)
            rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
            rf.fit(x, y)
        fi = pd.DataFrame(rf.feature_importances_,
                          index=x.columns,
                          columns=['importance'])
        fi = fi.sort_values('importance', ascending=False).reset_index()
        trace = go.Bar(y=fi['index'],
                       x=fi['importance'],
                       name='fi',
                       orientation='h')
        layout = go.Layout(autosize=False,
                           margin=dict(l=100),
                           width=800,
                           height=500,
                           hovermode='closest')
        figure = go.Figure(data=[trace], layout=layout)

        fi.to_pickle('cache/fi' + str(data_id) + '.pkl')

        return html.Div(dcc.Graph(figure=figure)), "done"
Example #3
def target_encoder(params):
    train = params[0].astype('str')
    test = params[1].astype('str')
    target = params[2]
    te = TargetEncoder(return_df=False)
    train = te.fit_transform(train.reshape(-1, 1), target.reshape(-1, 1))
    test = te.transform(test.reshape(-1, 1))
    return train.flatten(), test.flatten()
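A usage sketch for the array-based helper above, assuming the three params are 1-D numpy arrays and that TargetEncoder is imported as in the surrounding snippets (the toy values here are invented):

import numpy as np

train_col = np.array(["a", "b", "a", "c"])
test_col = np.array(["b", "c"])
target = np.array([1.0, 0.0, 1.0, 1.0])

# Returns the encoded train and test columns as flat arrays.
train_enc, test_enc = target_encoder((train_col, test_col, target))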
Example #4
def target_encode(data, label, encoder=None):
    """

    :param data:
    :param label:
    :param encoder: if supplied the encoder will be used to predict onto data
    :return:
    """
    if encoder is None:
        encoder = TargetEncoder()
        data = encoder.fit_transform(data, label)
        return encoder, data
    else:
        return encoder, encoder.transform(data, label)
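A minimal sketch of the fit-then-reuse pattern this helper encodes (toy frames, invented names):

import pandas as pd

train = pd.DataFrame({"city": ["a", "b", "a", "c"]})
y_train = pd.Series([1, 0, 1, 1])
test = pd.DataFrame({"city": ["b", "c", "a"]})

# First call: no encoder yet, so one is fitted on the training data.
encoder, train_enc = target_encode(train, y_train)
# Second call: the fitted encoder is reused on unseen data.
_, test_enc = target_encode(test, None, encoder=encoder)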
Example #5
    def target_encoder(self, df, configger):
        """

        :param df: the train dataset.
        :param configger: the json str of configger setting, the params means:
            verbose: int
                integer indicating verbosity of the output. 0 for none.
            cols: list
                a list of columns to encode, if None, all string columns will be encoded.
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance.
            return_df: bool
                boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
            handle_missing: str
                options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
            handle_unknown: str
                options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
            min_samples_leaf: int
                minimum samples to take category average into account.
            smoothing: float
                smoothing effect to balance categorical average vs prior. Higher value means stronger regularization.
                The value must be strictly bigger than 0.

        :return: the transform result
        """
        X, y, encode_col = self.get_Xy(df, configger)

        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        handle_missing = set_default_vale("handle_missing", configger, "value")
        handle_unknown = set_default_vale("handle_unknown", configger, "value")
        min_samples_leaf = set_default_vale("min_samples_leaf", configger, 1)
        smoothing = set_default_vale("smoothing", configger, 1.0)

        encoder = TargetEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant, return_df=True,
                                handle_missing=handle_missing,
                                handle_unknown=handle_unknown, min_samples_leaf=min_samples_leaf, smoothing=smoothing)

        res = encoder.fit_transform(X, y)

        return res
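To illustrate the smoothing parameter documented above: low values keep encodings near the raw per-category target mean, while high values shrink them toward the global mean (a toy sketch, not from the source):

import pandas as pd
from category_encoders import TargetEncoder

X = pd.DataFrame({"cat": ["a", "a", "b", "b", "b", "c"]})
y = pd.Series([1, 1, 1, 0, 0, 1])

low = TargetEncoder(cols=["cat"], smoothing=0.1).fit_transform(X, y)
high = TargetEncoder(cols=["cat"], smoothing=100.0).fit_transform(X, y)
print(low["cat"].unique())   # close to the per-category means
print(high["cat"].unique())  # pulled toward y.mean()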
Example #6
def target_encode_Stores(df, enc=None):
    """Target encode the Store variable using the category_encoders module

    Args:
        df: Data
        enc: existing encoder; if None, fit a new one
    """

    target = df['Sales'].values
    stores = df['Store'].astype(str)

    if not enc:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)

    df.loc[:, 'Store'] = new_store

    return new_store, enc
Example #7
def target_encode_custom(df: pd.DataFrame, name: str, enc=None):
    """Target encode the Store variable using the category_encoders module

    Args:
        df: Data
        name (str): name of the column to encode
        enc: existing encoder; if None, fit a new one
    """

    target = df['Sales'].values
    stores = df[name].astype(str)

    if not enc:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)

    df.loc[:, name] = new_store

    return new_store, enc
Example #8
def target_encode(X, X_test, cols, y):
    te = TargetEncoder(cols=cols, return_df=True)
    X = te.fit_transform(X, y)
    X_test = te.transform(X_test)
    return (X, X_test)
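A hedged usage sketch for the train/test wrapper above (frames and column names invented):

import pandas as pd

X = pd.DataFrame({"city": ["a", "b", "a"], "n": [1, 2, 3]})
X_test = pd.DataFrame({"city": ["b", "c"], "n": [4, 5]})
y = pd.Series([0, 1, 1])

# "city" is encoded on the training frame; the same mapping is applied to test.
X_enc, X_test_enc = target_encode(X, X_test, cols=["city"], y=y)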
Example #9
                        ,colsample_bytree=.2
                        ,reg_alpha=.1
                        ,reg_lambda=.1
                        )
    return lgbr

# Local validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    tf = train.iloc[tidx]
    df = train.iloc[didx]
    tt = target.iloc[tidx]
    dt = target.iloc[didx]
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)
    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)
print(np.mean(devscore))

# # Retrain on the full train set, predict on test, and output the results
# lgbr = makelgb()
# te = TargetEncoder(cols=tecols)
# tf = te.fit_transform(train, target)
# df = te.transform(test)
# lgbr.fit(tf, target)
Example #10
def lin_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        Drops NaN from training data,
        Replaces NaN in test data with ffill, 
        target-encodes non-numeric fields, 
        scales values,
        80/20 splits data to help verify model, 
        selects features using RFECV, with a lasso mode, cv set to 5,
        uses KNeighborRegressor for 11 nearest neighbours weighted to distance
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal, but fillna with the mean freezes for some reason
    clean_unlabelled = clean_unlabelled.fillna(method="ffill")
    # clean_unlabelled = clean_unlabelled.fillna("None")

    # remove some columns
    # clean_labelled = drop_columns(clean_labelled)
    # clean_unlabelled = drop_columns(clean_unlabelled)

    # print("one hot encoding data...")
    # One hot encoding
    # ohe = OneHotEncoder(
    #     categories="auto", 
    #     handle_unknown="ignore",
    #     sparse=False
    # )
    # clean_labelled = encode_training(ohe, clean_labelled)
    # clean_unlabelled = encode_testing(ohe, clean_unlabelled)

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("target encoding data...")
    # Target encoding
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # feature selection
    lasso = lm.Lasso()
    selector = RFECV(lasso, cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    # fit model
    # lasso = lm.LassoCV(cv=5)
    # lasso.fit(train_data, train_target)
    neigh = KNeighborsRegressor(
        n_neighbors=11,
        weights="distance"
    )
    neigh.fit(train_data, train_target) 

    print("analysing test results...")
    # validate test
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    # predict and format
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
Example #11
                      np.corrcoef(X[self.targetName].values,
                                  encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X


display(df)
temp = []
nfolds = [2, 3, 4]
for n in nfolds:
    te = KFoldTargetEncoderTrain(colnames='horse',
                                 targetName='label',
                                 n_fold=n,
                                 verbosity=False)
    temp.append(te.fit_transform(df).copy().iloc[:, -1])
temp  # results differ widely as n_folds changes


# The code below seems to be incompatible with the code above (because I have revised the one above).
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # read the encoded training data first into a dictionary
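        # NOTE: the original transform body is truncated here. A plausible
        # completion (an assumption, not the original code) would map each
        # category to its mean encoded value from the training frame:
        mean_map = (self.train[[self.colNames, self.encodedName]]
                    .groupby(self.colNames)[self.encodedName]
                    .mean().to_dict())
        X[self.encodedName] = X[self.colNames].map(mean_map)
        return X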
Example #12
    def target_encoder(self):
        for i in self.categorical_features:
            te = TargetEncoder()
            self.data[f"{i}_te"] = te.fit_transform(self.data[i],
                                                    self.data[self.target])
            self.encoded_features.append(f"{i}_te")
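
For reference, a roughly equivalent form of the loop above fits one encoder across all categorical columns at once via the cols parameter (a sketch with invented data, appending new columns to mirror the loop's behavior):

import pandas as pd
from category_encoders import TargetEncoder

data = pd.DataFrame({"color": ["r", "g", "r", "g"],
                     "size": ["s", "m", "s", "l"],
                     "label": [1, 0, 1, 1]})
te = TargetEncoder(cols=["color", "size"])
encoded = te.fit_transform(data[["color", "size"]], data["label"])
data["color_te"] = encoded["color"]
data["size_te"] = encoded["size"]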

Example #13
#MEE_encoder = MEstimateEncoder()
#train_mee = MEE_encoder.fit_transform(train_set[feature_list], target)
#test_mee = MEE_encoder.transform(test_set[feature_list])
#print(train_mee.head())
X_train, X_val, y_train, y_val = train_test_split(train_set, target, test_size=0.2, random_state=97)
lr = LinearRegression()
rf = RandomForestRegressor()


TE_encoder = TargetEncoder()
train_te = TE_encoder.fit_transform(train_set[feature_list], target)
test_te = TE_encoder.transform(test_set[feature_list])
#print(train_te.head())
encoder_list = [ TargetEncoder(), MEstimateEncoder()]
X_train, X_val, y_train, y_val = train_test_split(train_set, target, test_size=0.2, random_state=97)
#X_train, X_val, y_train, y_val = dataset_test()
lr = LinearRegression()

for encoder in encoder_list:
#    print("Test {} : ".format(str(encoder).split('(')[0]), end=" ")

    train_enc = encoder.fit_transform(X_train[feature_list], y_train)
      # test_enc = encoder.transform(test[feature_list])
    val_enc = encoder.transform(X_val[feature_list])
    
    lr.fit(train_enc, y_train)
Example #14
# Inspect the data
train_data.head()

from sklearn.model_selection import train_test_split

# n2 and n3 are highly redundant, so drop n2
train_data.drop(['n2', 'issueDate'], axis=1, inplace=True)
test_data = test_data[train_data.columns]

# Get the non-numeric columns
s = train_data.dtypes
tecols = s[s == 'object'].index.tolist()

# Encode the non-numeric columns directly with TargetEncoder
te = TargetEncoder(cols=tecols)
tf = te.fit_transform(train_data, target)
df = te.transform(test_data)

# Split into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(tf,
                                                              target,
                                                              test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)
"""使用优化后的参数初始化模型(参数通过网格搜索法进行优化,这里没有相关代码)"""
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 14,
test_data["Hair Color"] = test_data["Hair Color"].fillna("N/A")
test_data["Profession"] = test_data["Profession"].fillna("N/A")
#test_numerics
test_data["Age"] = test_data["Age"].fillna(test_data['Age'].mean())
test_data["Year of Record"] = test_data["Year of Record"].fillna(
    test_data['Year of Record'].mean())

numerics_and_catagorics = list(train_data.columns)
X_train_data, X_val, y_train_data, y_val = train_test_split(train_data,
                                                            target,
                                                            test_size=0.2,
                                                            random_state=97)
random_forest = RandomForestRegressor()
#target_encoder
target_encoder = TargetEncoder()
train_data_encoded = target_encoder.fit_transform(
    X_train_data[numerics_and_catagorics], y_train_data)
test_data_encoded = target_encoder.transform(X_val[numerics_and_catagorics],
                                             y_val)
test_data = target_encoder.transform(test_data[numerics_and_catagorics])

print(train_data_encoded.head())
print(test_data_encoded.head())

#random_forest_regressor
r_train_data = random_forest.fit(train_data_encoded, y_train_data)
prediction = r_train_data.predict(test_data_encoded)


def rmse(prediction, target):
    #difference = prediction-target
    #square = **2
Example #16
                         colsample_bytree=.2,
                         reg_alpha=.1,
                         reg_lambda=.1)
    return lgbr


# Local validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    tf = train.iloc[tidx]
    df = train.iloc[didx]
    tt = target.iloc[tidx]
    dt = target.iloc[didx]
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)
    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)
print(np.mean(devscore))

# Retrain on the full train set, predict on test, and output the results
lgbr = makelgb()
te = TargetEncoder(cols=tecols)
tf = te.fit_transform(train, target)
df = te.transform(test)
lgbr.fit(tf, target)
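
A design point shared by Examples #9 and #16: the TargetEncoder is refit inside each fold on the training split only, so a fold's own targets never leak into its encoding. A compressed, self-contained restatement of that pattern (toy data, invented names):

import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold

X = pd.DataFrame({"cat": list("ababcbacab")})
y = pd.Series([1, 0, 1, 1, 0, 0, 1, 0, 1, 1])

for tr_idx, va_idx in KFold(n_splits=5, shuffle=True).split(X):
    enc = TargetEncoder(cols=["cat"])                         # fresh encoder per fold
    X_tr = enc.fit_transform(X.iloc[tr_idx], y.iloc[tr_idx])  # fit on train fold only
    X_va = enc.transform(X.iloc[va_idx])                      # encode the held-out fold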