Example #1
0
 def __init__(self, variables=None):
     """Remember the target columns and build a k-1 one-hot encoder for them."""
     self.variables = variables
     # drop_last=True -> k-1 dummy variables per column (False would give k).
     self.ohe_enc = OneHotCategoricalEncoder(
         variables=self.variables,  # which columns to encode (None = auto)
         drop_last=True,
     )
Example #2
0
def feature_engineering(df_input):
    """One-hot encode the 'State' column of *df_input* into k-1 dummies.

    The fitted encoder is stored in the module-level ``OHCE`` global so it
    can be reused later (e.g. to transform a held-out set with the same
    category levels).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Input frame; a deep copy is taken, the caller's frame is not mutated.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 'State' replaced by k-1 dummy columns.
    """
    # Bug fix: the original banner used "\*", an invalid escape sequence
    # (a markdown-escaping artifact) that raises a SyntaxWarning on modern
    # Python; the stray backslash is dropped.
    print("*****FUNCTION feature_engineering*****")

    df = df_input.copy(deep=True)
    global OHCE

    # Convert the categorical variable to k-1 dummy variables.
    OHCE = OneHotCategoricalEncoder(
        variables=['State'],  # we can select which variables to encode
        drop_last=True,       # True -> k-1 dummies, False -> k
    )

    OHCE.fit(df)
    df = OHCE.transform(df)
    print(OHCE.encoder_dict_)

    return df
Example #3
0
class OHEEncoder(BaseEstimator, TransformerMixin):
    """Sklearn-style wrapper around feature_engine's OneHotCategoricalEncoder.

    Encodes the configured categorical columns into k-1 dummy variables
    (``drop_last=True``).
    """

    def __init__(self, variables=None):
        # Columns to encode; None lets the underlying encoder select
        # categorical columns automatically (but see fit(), which indexes
        # X[self.variables] and therefore expects a list here).
        self.variables = variables
        self.ohe_enc = OneHotCategoricalEncoder(
            variables=self.variables,  # we can select which variables to encode
            drop_last=True,            # to return k-1, False to return k
        )

    def fit(self, X, y=None):
        """Learn the category levels of the configured columns from X.

        Bug fix: the original cast the selected columns to ``object``
        *in place*, silently mutating the caller's DataFrame; we now
        work on a copy so fit() has no side effects on X.
        """
        X = X.copy()
        for feature in X[self.variables]:
            X[feature] = X[feature].astype("object")
        self.ohe_enc.fit(X)
        return self

    def transform(self, X):
        """Return a copy of X with the fitted dummy columns appended."""
        X = X.copy()
        X = self.ohe_enc.transform(X)
        return X
Example #4
0
    def columnEncoding(self, X_train, X_test, y_train, testdata):
        """One-hot encode the fixed categorical columns into k-1 dummies.

        The encoder is fitted on the training data only, then applied to
        the train, test, and submission frames alike; y_train is passed
        through unchanged.
        """
        categorical_columns = [
            'Gender', 'University Degree', 'Hair Color',
            'Profession', 'Country',
        ]
        encoder = OneHotCategoricalEncoder(
            top_categories=None,
            variables=categorical_columns,
            drop_last=True,
        )
        encoder.fit(X_train, y_train)

        return (
            encoder.transform(X_train),
            encoder.transform(X_test),
            y_train,
            encoder.transform(testdata),
        )
def encode(labelled_data, unlabelled_data, columns):
    """One-hot encode *columns* (k-1 dummies) on both frames.

    The encoder is fitted on the labelled frame only, then applied to
    both the labelled and unlabelled frames, which are returned as a pair.
    """
    ohe = OneHotCategoricalEncoder(
        top_categories=None,
        variables=columns,  # we can select which variables to encode
        drop_last=True,
    )
    ohe.fit(labelled_data)
    return ohe.transform(labelled_data), ohe.transform(unlabelled_data)
def one_hot_encoder(trainData, predictionData):
    """One-hot encode four fixed categorical columns into k-1 dummies.

    Fits on the training frame only, then transforms both frames and
    returns them as ``(train, prediction)``.
    """
    columns_to_encode = [
        'housing_situation', 'satisfaction',
        'gender', 'hair_color',
    ]
    enc = OneHotCategoricalEncoder(
        top_categories=None,
        variables=columns_to_encode,
        drop_last=True,
    )
    enc.fit(trainData)

    return enc.transform(trainData), enc.transform(predictionData)
Example #7
0
def one_hot_Categ_encoder(train_set, test_set, features, drop_last=False):
    """One-hot encode *features* on both the train and test sets.

    Fits feature_engine's OneHotCategoricalEncoder on ``train_set`` only
    and returns the transformed ``(train_set, test_set)`` pair.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames to encode; the encoder learns its levels from ``train_set``.
    features : list
        Column names to one-hot encode.
    drop_last : bool, default False
        True -> k-1 dummy variables per feature; False -> k.

    Bug fix: the original only assigned ``encoder`` when ``drop_last`` was
    exactly ``True`` or exactly ``False`` (identity checks), so any other
    truthy/falsy value raised UnboundLocalError. Coercing with ``bool()``
    in a single constructor call handles every input and is identical for
    True/False, since the encoder's own default is drop_last=False.
    """
    from feature_engine.categorical_encoders import OneHotCategoricalEncoder

    encoder = OneHotCategoricalEncoder(
        top_categories=None, variables=features, drop_last=bool(drop_last)
    )
    encoder.fit(train_set)
    return encoder.transform(train_set), encoder.transform(test_set)
# Importing feature-engine - Categorical Variable
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

# OneHotCategoricalEncoder?

# Load the churn dataset and inspect it. NOTE(review): the bare
# expressions below (df.head(), df.shape, ...) only display output in an
# interactive/notebook session; as a script they are no-ops.
df = pd.read_csv("churn_modelling_data.csv")
df.head()

df.describe()
df.shape
df.columns
df.dtypes
df.index

# One-hot encode the two categorical columns; drop_last=False keeps all
# k dummy variables per column (no reference category dropped).
ohe = OneHotCategoricalEncoder(top_categories=None,
                               variables=["Geography", "Gender"],
                               drop_last=False)
df1 = ohe.fit_transform(df)
df1.shape
df1.head()
df1.columns

df1.columns
df1.shape

# Build the feature matrix: drop the identifier columns and the
# 'Exited' target; errors='raise' fails fast if a column is missing.
X = df1.drop(axis=1,
             columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'],
             errors='raise')
X.columns
X.shape
Example #9
0
def test_OneHotCategoricalEncoder(dataframe_enc_big, dataframe_enc_big_na):
    """Exercise OneHotCategoricalEncoder across its main configurations.

    Covers: k dummies with automatic variable selection, k-1 dummies on an
    explicit variable list, ``top_categories`` filtering, invalid init
    parameters, and ValueError on missing data in fit/transform.

    Fixtures (defined elsewhere): ``dataframe_enc_big`` is a 40-row frame
    with categorical columns var_A/var_B/var_C; ``dataframe_enc_big_na``
    is the same frame containing NA values.
    """
    # test case 1: encode all categories into k binary variables, select variables automatically
    encoder = OneHotCategoricalEncoder(top_categories=None,
                                       variables=None,
                                       drop_last=False)
    X = encoder.fit_transform(dataframe_enc_big)

    # init params
    assert encoder.top_categories is None
    assert encoder.variables == ['var_A', 'var_B', 'var_C']
    assert encoder.drop_last == False
    # fit params
    # Expected column sums of the dummy matrix: one entry per
    # (variable, category) pair, value = that category's row count.
    transf = {
        'var_A_A': 6,
        'var_A_B': 10,
        'var_A_C': 4,
        'var_A_D': 10,
        'var_A_E': 2,
        'var_A_F': 2,
        'var_A_G': 6,
        'var_B_A': 10,
        'var_B_B': 6,
        'var_B_C': 4,
        'var_B_D': 10,
        'var_B_E': 2,
        'var_B_F': 2,
        'var_B_G': 6,
        'var_C_A': 4,
        'var_C_B': 6,
        'var_C_C': 10,
        'var_C_D': 10,
        'var_C_E': 2,
        'var_C_F': 2,
        'var_C_G': 6
    }

    assert encoder.input_shape_ == (40, 3)
    # transform params
    assert X.sum().to_dict() == transf
    assert 'var_A' not in X.columns

    # test case 2: encode all categories into k-1 binary variables, pass list of variables
    encoder = OneHotCategoricalEncoder(top_categories=None,
                                       variables=['var_A', 'var_B'],
                                       drop_last=True)
    X = encoder.fit_transform(dataframe_enc_big)

    # init params
    assert encoder.top_categories is None
    assert encoder.variables == ['var_A', 'var_B']
    assert encoder.drop_last == True
    # fit params
    # With drop_last=True the last category (G) of each encoded
    # variable gets no dummy column.
    transf = {
        'var_A_A': 6,
        'var_A_B': 10,
        'var_A_C': 4,
        'var_A_D': 10,
        'var_A_E': 2,
        'var_A_F': 2,
        'var_B_A': 10,
        'var_B_B': 6,
        'var_B_C': 4,
        'var_B_D': 10,
        'var_B_E': 2,
        'var_B_F': 2
    }
    assert encoder.input_shape_ == (40, 3)
    # transform params
    for col in transf.keys():
        assert X[col].sum() == transf[col]
    # var_C was not in the variables list, so it stays untouched.
    assert 'var_B' not in X.columns
    assert 'var_B_G' not in X.columns
    assert 'var_C' in X.columns

    # test case 3: encode only the most popular categories
    encoder = OneHotCategoricalEncoder(top_categories=4,
                                       variables=None,
                                       drop_last=False)
    X = encoder.fit_transform(dataframe_enc_big)

    # init params
    assert encoder.top_categories == 4
    # fit params
    # Only the 4 most frequent categories of each variable get a dummy.
    transf = {
        'var_A_D': 10,
        'var_A_B': 10,
        'var_A_A': 6,
        'var_A_G': 6,
        'var_B_A': 10,
        'var_B_D': 10,
        'var_B_G': 6,
        'var_B_B': 6,
        'var_C_D': 10,
        'var_C_C': 10,
        'var_C_G': 6,
        'var_C_B': 6
    }

    assert encoder.input_shape_ == (40, 3)
    # transform params
    for col in transf.keys():
        assert X[col].sum() == transf[col]
    assert 'var_B' not in X.columns
    assert 'var_B_F' not in X.columns

    # Invalid init parameters: top_categories and drop_last must not be floats.
    with pytest.raises(ValueError):
        OneHotCategoricalEncoder(top_categories=0.5)

    with pytest.raises(ValueError):
        OneHotCategoricalEncoder(drop_last=0.5)

    # test case 4: when dataset contains na, fit method
    with pytest.raises(ValueError):
        encoder = OneHotCategoricalEncoder()
        encoder.fit(dataframe_enc_big_na)

    # test case 4: when dataset contains na, transform method
    with pytest.raises(ValueError):
        encoder = OneHotCategoricalEncoder()
        encoder.fit(dataframe_enc_big)
        encoder.transform(dataframe_enc_big_na)
Example #10
0
# Per-neighborhood mode (most frequent value) of the two categorical
# quality columns. NOTE(review): `neighborhoods`, `avg` and `hoods` are
# defined earlier in the file, outside this chunk.
cat_avg = neighborhoods.groupby('neighborhood_name')[[
    'SchoolQuality', 'CrimeIndex'
]].agg(lambda x: x.value_counts().index[0])

#Create a new column that has sale price written in proper form
avg['SalePrice'] = avg['SalePrice'].apply(lambda x: '${:,.2f}'.format(x))

#Merge the first dataframe with the dataframe that includes the geolocation attributes
new_avg = pd.merge(avg,
                   hoods[['neighborhood_name', 'latitude', 'longitude']],
                   on='neighborhood_name',
                   how='left')

#Merge the previously formed dataframe with the categorical dataframe
new_avg = pd.merge(new_avg, cat_avg, on='neighborhood_name', how='left')

#Remove observations with missing values
new_avg = new_avg.dropna()

#One Hot encode the categorical variables in order to prepare it for the app
# (default drop_last -> k dummies per category)
ohe_encoder = OneHotCategoricalEncoder(
    variables=['CrimeIndex', 'SchoolQuality'])
ohe_encoder.fit(new_avg)
new_avg = ohe_encoder.transform(new_avg)

#Accounting for a rare label
# If 'Very High' crime never occurred in the data, the encoder creates no
# column for it; add an all-zero column so downstream code can rely on it.
if 'CrimeIndex_Very High' not in new_avg.columns:
    new_avg['CrimeIndex_Very High'] = 0

#Save the Neighborhood DataFrame
new_avg.to_csv('Data/Neighborhoods_final.csv', index=False)
Example #11
0
        for alpha, tree in zip(self.alphas, self.models):
            Fx += alpha * tree.predict(X)
        return np.sign(Fx), Fx

    def score(self, X, Y):
        """Return (accuracy, exponential loss) of the ensemble on (X, Y)."""
        predictions, margin = self.predict(X)
        exp_loss = np.exp(-Y * margin).mean()
        accuracy = np.mean(predictions == Y)
        return accuracy, exp_loss


if __name__ == '__main__':
    # Load the mushroom dataset; column 0 is the class label
    # ('e' = edible -> 0, anything else -> 1).
    df = pd.read_csv('./data/mushroom.data', header=None)
    df[0] = df.apply(lambda v: 0 if v[0] == 'e' else 1, axis=1)
    X = df.drop(0, axis=1)
    Y = df[0].values
    # One-hot encode every categorical feature column.
    encoder = OneHotCategoricalEncoder()
    X = encoder.fit_transform(X)

    Y[Y == 0] = -1  # make the targets -1,+1
    # Sequential 80/20 train/test split (no shuffling).
    Ntrain = int(0.8 * len(X))
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    # Track train/test metrics as trees are added to the ensemble.
    T = 200
    train_errors = np.empty(T)
    test_losses = np.empty(T)
    test_errors = np.empty(T)
    # NOTE(review): this loop body is truncated at the chunk boundary —
    # only the num_trees == 0 sentinel branch is visible here.
    for num_trees in range(T):
        if num_trees == 0:
            train_errors[num_trees] = None
            test_errors[num_trees] = None
# Treat the literal string "unknown" as a missing value so pandas'
# NA machinery (isnull/dropna) can see it.
df = df.replace("unknown", np.NaN)

#drop rows where missing values present in Job column
df = df.dropna(subset=['job'])

print("Final null value count: \n", df.isnull().sum())

#Replace NaN values with "unknown" to do further procedure
df = df.replace(np.NaN, "unknown")
df.to_csv("E:\\Nayan\\Sesh\\df_wo_missing_values.csv", index = False)
print(df.isnull().sum())

#Make X  matrix and y vector

# All columns except the last are features (the last is the target).
X = df.iloc[:, :-1]


#extract only categorical columns
X_categorical = X.select_dtypes(include=[object])
print(X_categorical.head())

# One-hot encode every categorical column into k-1 dummies
# (list(X_categorical) yields its column names).
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
one_enc = OneHotCategoricalEncoder(top_categories=None, variables=list(X_categorical), drop_last=True)
one_enc.fit(X)
X_transformed = one_enc.transform(X)
print(X_transformed.columns)
print(len(X_transformed.columns))
print(X_transformed.head())

X_transformed.to_csv("E:\\Nayan\\Sesh\\Processed_data.csv", index = False)