Example #1
0
def do_cat_bin(X, X_test, cols):
    """Binary-encode the categorical columns of train and test frames.

    The encoder is fitted on the training columns only, then applied to
    both frames so train and test share one consistent encoding.

    Returns the encoded train frame, the encoded test frame, and the list
    of generated column names.
    """
    encoder = BinaryEncoder(cols=cols).fit(X[cols])
    encoded_train = encoder.transform(X[cols])
    encoded_test = encoder.transform(X_test[cols])
    new_cols = list(encoded_train.columns)
    print(f'do_cat_bin: Done. Added {len(new_cols)} new columns.')
    return encoded_train, encoded_test, new_cols
Example #2
0
def fit_binary(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """Fit a binary encoder on `input_df` and return the encoded frame.

    NaN values — and any occurrence of `na_value`, when given — are routed
    through the encoder's "unseen" code so missing data encodes
    consistently at transform time.

    Args:
        input_df: DataFrame used to fit the encoder.
        cols: categorical column names to encode.
        na_value: sentinel value to treat as missing, if any.

    Returns:
        result_df: the encoded copy of `input_df`.
        model: dict with the fitted encoder and its settings, suitable for
            passing to the `transform_binary` method.
    """
    frame = input_df.copy()

    if na_value is not None:
        for column in cols:
            frame[column] = frame[column].replace({na_value: np.nan})

    encoder = BinaryEncoder(cols=cols, drop_invariant=True).fit(frame)

    # Map NaN to the ordinal stage's "unseen" code (-2) so the binary stage
    # treats missing values like unknown categories.
    for column_mapping in encoder.base_n_encoder.ordinal_encoder.mapping:
        column_mapping["mapping"].loc[np.nan] = -2

    result_df = encoder.transform(frame)
    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
def encode_high_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode high cardinality categorical features using Binary Encoding and
    dropping invariant features.

    In Binary Encoding, features are converted to a binary representation
    and the binary digits are used as new features.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), high
            card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # BUG FIX: the original body mixed 3- and 4-space indentation (the
    # docstring sat at a different level than the statements), which is an
    # IndentationError in Python. Re-indented consistently; logic unchanged.
    # Train a new encoder and persist it, or load the previously saved one.
    if fit:
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
Example #4
0
def to_categorical(
    training_data: pd.DataFrame, test_data: pd.DataFrame
) -> dict:
    """Binary-encode every object-dtype column of the train/test frames.

    The encoder is fitted on the training data only and then applied to the
    test data so both frames share the same encoding.

    Returns:
        dict with keys 'train_data_categorical' and 'test_data_categorical'
        holding the encoded DataFrames. (The original annotation claimed a
        tuple of DataFrames; corrected to match the actual return value.)
    """
    categorical_columns_list = list(
        training_data.columns[training_data.dtypes == object])
    # BUG FIX: handle_unknown was misspelled "inpute"; the intended
    # category_encoders option is "impute" (encode unseen values as missing).
    ce_be = BinaryEncoder(cols=categorical_columns_list, handle_unknown="impute")
    training_data_ce_binary = ce_be.fit_transform(training_data)
    test_data_ce_binary = ce_be.transform(test_data)

    return dict(train_data_categorical=training_data_ce_binary,
                test_data_categorical=test_data_ce_binary)
class DFBinaryEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around category_encoders' BinaryEncoder.

    Replaces the selected columns with their int8 binary-digit encoding
    while leaving every other column of the frame untouched.
    """

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = BinaryEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        """Fit the underlying encoder on the selected columns of X."""
        if self.columns is None:
            self.columns = X.columns
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols])
        return self

    def _raise_not_fitted(self):
        # Shared guard so transform/inverse_transform raise the same error.
        raise NotFittedError(
            f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
        )

    def transform(self, X):
        """Swap the fitted columns for their int8 binary encoding."""
        if self.transform_cols is None:
            self._raise_not_fitted()

        encoded = self.model.transform(X[self.transform_cols])
        encoded = encoded.astype('int8')

        result = pd.concat([X, encoded], axis=1)
        result.drop(columns=self.transform_cols, inplace=True)
        return result

    def fit_transform(self, X, y=None):
        """Convenience: fit on X, then transform it."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Recover the original categorical columns from their binary digits."""
        if self.transform_cols is None:
            self._raise_not_fitted()

        encoded_columns = [
            col for col in X.columns
            if any(col.startswith(f'{src}_') for src in self.transform_cols)
        ]
        decoded = self.model.inverse_transform(X[encoded_columns])

        result = pd.concat([X, decoded], axis=1)
        result.drop(columns=encoded_columns, inplace=True)
        return result
class df_BinaryEncoder(TransformerMixin):
    """Scikit-style transformer that binary-encodes nominal features.

    Parameters
    ----------
    handle_unknown: str, default='ignore'
        Forwarded to category_encoders' BinaryEncoder.
    """

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        # A fresh encoder is created on every fit so refitting never reuses
        # stale state; BinaryEncoder.fit returns the encoder itself.
        self.enc = BinaryEncoder(handle_unknown=self.handle_unknown).fit(X)
        return self

    def transform(self, X):
        return self.enc.transform(X)
Example #7
0
def predict():
    '''
    For rendering results on HTML GUI.

    Reads six form fields (Name, Genre, Comments, Likes, Popularity,
    Followers), rebuilds and retrains a stacked ensemble from data.csv,
    and renders the predicted view count.

    NOTE(review): retraining three regressors on every request is very
    slow; the fitted pipeline should be trained once and persisted.
    '''
    # Collect the submitted form values, in form order, into a 1-row frame.
    features = [x for x in request.form.values()]
    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=[
                                'Name', 'Genre', 'Comments', 'Likes',
                                'Popularity', 'Followers'
                            ])
    df = pd.read_csv('data.csv')
    # Numeric fields arrive as strings (form values / CSV): cast them.
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)

    # Drop logically impossible rows: engagement cannot exceed views.
    df.drop(index=df[df['Views'] < df['Likes']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index,
            axis=1,
            inplace=True)

    # Remove extreme outliers: keep rows within 3 * IQR on every column.
    # (The original also computed the 1.5 * IQR mask as a discarded no-op
    # expression; removed.)
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    # Drop identifier/metadata columns that carry no predictive signal here.
    df = df.drop(
        columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp', 'index'])

    y = df['Views']
    df = df.drop(columns=['Views'])

    # Binary-encode the categorical features; apply the same fitted encoder
    # to the request row so its columns line up with the training matrix.
    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df.iloc[:, :]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    # Base learners for the stacked ensemble.
    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    rg2.fit(X_train, y_train)

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    rg3.fit(X_train, y_train)

    # Stack the two boosted models with the random forest as meta-regressor.
    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)

    f = f.iloc[:, :]
    y_pred = rg6.predict(f)
    y_pred = y_pred.astype(int)

    # BUG FIX: user-facing typo "Numberb" corrected to "Number".
    return render_template(
        'index.html', prediction_text='Number of Views is {}'.format(y_pred))
class BinaryEncoder():
    """Binary-encodes categorical columns and keeps feature metadata in sync.

    Wraps category_encoders' Binary encoder and rewrites the accompanying
    feature list so every encoded column is represented by a BinaryEnc
    primitive.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'binary'

    def __init__(self, cols=None):
        self.encoder = Binary(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.
        returns self.
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.
        returns encoded matrix (dataframe).
        """
        encoded = self.encoder.transform(X)
        # Rename the encoded columns from the flattened feature-name lists.
        encoded.columns = [
            fname
            for feature in self.features
            for fname in feature.get_feature_names()
        ]
        return encoded

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe).
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the binary encoder and underlying ordinal encoder.
        returns tuple (binary_encoder_mapping, ordinal_encoder_mapping).
        """
        def lookup(stage, key):
            # String keys match by column name; anything else is treated as
            # a positional index into the stage's mapping list.
            if isinstance(key, str):
                for entry in stage.mapping:
                    if entry['col'] == key:
                        return entry['mapping']
            return stage.mapping[key]['mapping']

        base = self.encoder.base_n_encoder
        return lookup(base, category), lookup(base.ordinal_encoder, category)

    def encode_features_list(self, X, features):
        # Wrap each encoded column's feature in a BinaryEnc primitive,
        # numbered by its position among the encoded columns.
        encoded_features = []
        position = 0
        for feature in features:
            if feature.get_name() in self.encoder.base_n_encoder.cols:
                feature = ft.Feature([feature], primitive=BinaryEnc(self, position))
                position += 1
            encoded_features.append(feature)
        return encoded_features

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
Example #9
0
def doCleanupEncode(X,
                    y=None,
                    cat=None,
                    oh=None,
                    binary=None,
                    loo=None,
                    woe=None,
                    lp_cols=None,
                    NoData=True):
    """Apply a configurable chain of categorical encodings to X.

    Args:
        X: feature DataFrame (modified in place and returned).
        y: target series, required by the supervised WOE and
            leave-one-out encoders.
        cat: columns whose null-ish values are zeroed when NoData is False.
        oh: columns to one-hot encode.
        binary: columns to binary encode.
        loo: columns to leave-one-out encode.
        woe: columns to weight-of-evidence encode.
        lp_cols: least-predictive columns to drop at the end.
        NoData: when True, encoders keep an explicit 'no data' indicator
            level; when False, null-ish values are collapsed to 0 first.

    Returns:
        The encoded DataFrame with its index reset.
    """
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        # BUG FIX: the original `cat is not None | oh is not None` let `|`
        # bind before `is not`, evaluating `None | oh` and raising a
        # TypeError. Use boolean `or`, and coalesce a missing list to []
        # so either argument may be omitted.
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, (cat or []) + (oh or []), [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator').fit(X)
            # the encoder is already fitted above; transform once instead of
            # refitting via fit_transform
            X = ec.transform(X)
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
            X = enc.transform(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
            X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns; fill NaN first so missing
        # data forms its own 'NoData' category
        for w in woe:
            X[w] = X[w].fillna('NoData')

        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for col in loo:
            X[col] = X[col].fillna('NoData')

        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    if lp_cols is not None:
        # drop least predictive
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
def main():
    """Train a random-forest flight-delay classifier on HDF5 airline data.

    Reads a subset of flight-record columns, cleans the rows, binary-encodes
    the features, grid-searches a RandomForest pipeline with time-series
    cross-validation, and pickles the best estimator with a timestamp.
    """
    import psutil

    # import matplotlib.pyplot as plt
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    # from sklearn.preprocessing import StandardScaler
    # from sklearn.naive_bayes import GaussianNB
    # NOTE(review): Imputer and sklearn.externals.joblib were removed in
    # modern scikit-learn (replaced by SimpleImputer and the standalone
    # joblib package) — this code targets an older sklearn; confirm the
    # pinned version before running.
    from sklearn.preprocessing import Imputer
    from sklearn.externals import joblib
    # from sklearn import metrics
    from category_encoders import BinaryEncoder
    from datetime import datetime

    from sklearn.model_selection import TimeSeriesSplit
    import os
    import numpy as np

    import sys
    sys.path.append("../")
    import serial_preprocess_data as preprocess
    import utils

    # Use a quarter of the logical cores minus two, leaving headroom for
    # other jobs on a shared machine.
    cpu_count = int(psutil.cpu_count() / 4) - 2
    print("Trying to use {} number of cpu".format(cpu_count))
    data_dir = "../../data/"
    hdf_files = sorted([data_dir + file for file in os.listdir(data_dir)
                        if '.h5' in file])

    # Columns to load from the HDF5 files; ArrDelay later becomes the label.
    columns = ['Year',
               'Cancelled',
               'Distance',
               'Diverted',
               'ArrTime',
               'Dest',
               'FlightNum',
               # 'DepDelay',  ## not using DepDelay
               'ActualElapsedTime',
               'ArrDelay',
               'DayofMonth',
               'UniqueCarrier',
               'Month',
               'DepTime',
               'Origin',
               'DayOfWeek'
               ]
    scoring = 'roc_auc'
    no_of_files = 12

    df = preprocess.readFilesToDf("h5", file_list=hdf_files[:no_of_files],
                                  cols=columns)

    print("Size of file read in is {0:.2f} GB".format(
          utils.getFileSizeInGB(hdf_files[:no_of_files])))
    print("Reading in {0} selected columns only".format(len(columns)))
    print("Columns are:", columns)
    print("Memory usage of the data frame is {0:.2f} GB".format(
          np.sum(df.memory_usage()) / 1e9))

    # preprocess data check the percentage of nans
    _ = preprocess.find_cardinality_of_categorical_variables(df)

    # NOTE(review): presumably `ix` holds the row positions to keep;
    # `reindex()` with no arguments returns an identically-indexed frame —
    # `reset_index()` may have been intended. Confirm against the helper.
    ix = preprocess.clean_data_minimally(df)
    # apply cleaning of the data
    df = df.iloc[ix].reindex()
    # Chronological order so TimeSeriesSplit folds respect time.
    df = df.sort_values(by=['DayofMonth', 'Month', 'Year', 'DepTime'])

    # Features exclude the label source (ArrDelay) and Cancelled.
    feature_cols = list(df.columns)
    feature_cols.remove('ArrDelay')
    feature_cols.remove('Cancelled')

    # Derive a 5-class and a binary delay label from the arrival delay.
    df['delayCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_multiple_categories)
    df['delayBinaryCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_two_categories)
    X = df[feature_cols]
    y = df['delayBinaryCat']

    # Binary-encode categorical features into numeric digit columns.
    encoder = BinaryEncoder()
    encoder.fit(X)
    transformed_X = encoder.transform(X)

    print("Transformed columns are ", transformed_X.columns)

    # Report the class balance of the multi-category delay label.
    df_gpby = df.groupby('delayCat')
    delay_percentage_breakdown = df_gpby.ArrDelay.count() / df.shape[0] * 100
    delay_percentage_breakdown.index = ['very early',
                                        'early',
                                        'on time',
                                        'late',
                                        'very late'
                                        ]
    print("Percentage breakdown of different categories " +
          "of the target variable is: \n",
          delay_percentage_breakdown)

    # the breakdown of delay is pretty balanced.
    # Although a careful study will also look at the correlation with other
    # other features

    tscv = TimeSeriesSplit()
    # cv_ixes = [(train_ix, test_ix)
    #            for train_ix, test_ix in tscv.split(transformed_X)]

    # only put grid search steps into pipeline
    rf_pipeline_steps = [
        # impute missing feature values with median values
        ("imputer", Imputer(strategy="median")),
        ('rf', RandomForestClassifier(n_jobs=cpu_count, oob_score=True)),
    ]

    gridsearch_parameters = dict([
        ("rf__n_estimators", [800]),
        ("rf__max_features", [None]),  # not many featuers to subset from
    ])

    rf_pipeline = Pipeline(rf_pipeline_steps)

    est = GridSearchCV(rf_pipeline,
                       param_grid=gridsearch_parameters,
                       n_jobs=1,
                       scoring=scoring,
                       cv=tscv.split(X),  # this does 3 fold cross-validation
                       )
    print("Fitting the values")
    print("Columns in the training data are ", X.columns)
    est.fit(transformed_X.values, y.values)
    print("Saving the model")
    print("Best score" + scoring + "is", est.best_score_)
    print("Best parameters are ", est.best_params_)

    # Timestamped filename, e.g. RF_CV_pipeline_MM_DD_YY_HH_MM_SS.pkl.
    datetime_stamp = datetime.now().strftime(
        "%D_%X").replace("/", "_").replace(":", "_")
    joblib.dump(est.best_estimator_,
                "./RF_CV_pipeline_" + datetime_stamp + ".pkl")
                        nrows=500)
    test = pd.read_csv(os.path.join(config["input_path"], "test.csv"),
                       na_values=-1,
                       nrows=500)

    train_feature, train_label = train.iloc[:,
                                            2:].copy(), train.iloc[:,
                                                                   1].copy()
    test_feature = test.iloc[:, 1:].copy()
    del train, test

    train_feature = train_feature[[
        col for col in train_feature.columns if not col.startswith("ps_calc_")
    ]]
    test_feature = test_feature[train_feature.columns]

    ncs = [
        col for col in train_feature.columns
        if not col.endswith(("_bin", "_cat"))
    ]
    ccs = [
        col for col in train_feature.columns if col.endswith(("_bin", "_cat"))
    ]

    eet = EntityEmbeddingTree(numeric_columns=ncs, categorical_columns=ccs)
    eet.fit(X=train_feature, y=train_label)

    encoder = BinaryEncoder()
    print(encoder.fit_transform(eet.transform(X=train_feature)).shape)
    print(encoder.transform(eet.transform(X=test_feature)).shape)