def test_binary(self):
        """

        :return:
        """

        cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)

        enc = encoders.BinaryEncoder(verbose=1, cols=cols)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.BinaryEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.BinaryEncoder(verbose=1, drop_invariant=True)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.BinaryEncoder(verbose=1, return_df=False)
        enc.fit(X, None)
        self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
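To make explicit what the assertions above verify, here is a minimal, hedged sketch (assuming category_encoders is importable as ce): BinaryEncoder ordinal-encodes each categorical column and then splits the ordinal value into binary-digit columns, so the transformed frame is entirely numeric.

# Minimal sketch (assumption: category_encoders installed); exact output
# column names vary slightly between library versions.
import pandas as pd
import category_encoders as ce

X = pd.DataFrame({'C1': ['a', 'b', 'c', 'a']})
enc = ce.BinaryEncoder(cols=['C1'])
X_enc = enc.fit_transform(X)
print(X_enc)         # 'C1' replaced by numeric C1_* binary-digit columns
print(X_enc.dtypes)  # all numeric, which is what verify_numeric() checks above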
Example n. 2
def encoder(data):

###### BINARY ENCODER ######
    B_encoder = category_encoders.BinaryEncoder(cols=['premis_var'])
    data = B_encoder.fit_transform(data)
    B_encoder = category_encoders.BinaryEncoder(cols=['incdt_time'])
    data = B_encoder.fit_transform(data)
    B_encoder = category_encoders.BinaryEncoder(cols=['incdt_date'])
    data = B_encoder.fit_transform(data)

###### DATA STANDARDISATION ######
#    sc = StandardScaler()
#    data['incdt_time'] = sc.fit_transform(data['incdt_time'].values.reshape(-1,1))
#    data['incdt_date'] = sc.fit_transform(data['incdt_date'].values.reshape(-1,1))

###### ONE HOT ENCODER ######
    OH_encoder = OneHotEncoder()
    hc1 = DataFrame(OH_encoder.fit_transform(data['BORO_NM'].values.reshape(-1,1)).toarray(),
                    columns = ['BORO1', 'BORO2','BORO3', 'BORO4', 'BORO5'])
    hc2 = DataFrame(OH_encoder.fit_transform(data['VIC_AGE_GROUP'].values.reshape(-1, 1)).toarray(),
                    columns=['VIC_AGE1', 'VIC_AGE2', 'VIC_AGE3', 'VIC_AGE4', 'VIC_AGE5'])
    hc3 = DataFrame(OH_encoder.fit_transform(data['LOC_OF_OCCUR_DESC'].values.reshape(-1, 1)).toarray(),
                    columns=['LOC_DESC1', 'LOC_DESC2', 'LOC_DESC3', 'LOC_DESC4'])
    hc4 = DataFrame(OH_encoder.fit_transform(data['VIC_RACE'].values.reshape(-1, 1)).toarray(),
                    columns=['VIC_RACE1', 'VIC_RACE2', 'VIC_RACE3', 'VIC_RACE4'])
    hc5 = DataFrame(OH_encoder.fit_transform(data['VIC_SEX'].values.reshape(-1, 1)).toarray(),
                    columns=['VICM_SEX1', 'VICM_SEX2'])
    hc6 = DataFrame(OH_encoder.fit_transform(data['SUSP_RACE'].values.reshape(-1, 1)).toarray(),
                    columns=['SUSP_RACE1', 'SUSP_RACE2', 'SUSP_RACE3', 'SUSP_RACE4'])
    hc7 = DataFrame(OH_encoder.fit_transform(data['SUSP_RACE'].values.reshape(-1, 1)).toarray() if False else OH_encoder.fit_transform(data['SUSP_SEX'].values.reshape(-1, 1)).toarray(),
                    columns=['SUSP_SEX1', 'SUSP_SEX2'])

    data = pd.concat([data,hc1,hc2,hc3,hc4,hc5,hc6,hc7], axis=1)
    return data
Example n. 3
def binary_encoding(df, cols, handle_nan=True):
    if handle_nan:
        encoder = ce.BinaryEncoder(cols=cols,
                                   handle_unknown='indicator',
                                   handle_missing='indicator')
    else:
        encoder = ce.BinaryEncoder(cols=cols,
                                   handle_unknown='return_nan',
                                   handle_missing='return_nan')
    df_new = encoder.fit_transform(df)
    return df_new
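A possible usage sketch for the helper above, assuming pandas is available as pd and ce is category_encoders; with handle_nan=True, missing and previously unseen values are routed to an extra indicator category rather than propagated as NaN.

df = pd.DataFrame({'color': ['red', 'blue', None, 'green']})
encoded = binary_encoding(df, cols=['color'], handle_nan=True)
print(encoded)  # 'color' replaced by binary columns; the None row maps to the indicator category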
Example n. 4
def process_data(data_df: pd.DataFrame) -> pd.DataFrame:
    encoder = ce.BinaryEncoder(cols=['b', 'm', 'd', 'p', 'l', 's'],
                               return_df=True)
    data_encoded_df = encoder.fit_transform(data_df)

    print(data_encoded_df)
    return data_encoded_df
def BinaryEncodingCalc(request, fName):
    df = get_df(fName)
    if request.method == 'POST':
        selected_cols = request.POST.getlist('binaryCol')
        for selected_col in selected_cols:
            encoder = ce.BinaryEncoder(cols=[selected_col])
            df = encoder.fit_transform(df)
            df.to_csv(os.path.join(settings.MEDIA_ROOT,
                                   'processed/' + fName + '.csv'),
                      index=False)
        df_new = get_df(fName)
        clm_list = list(df_new)
        NaN_percent = get_NaN_percent(fName)
        binary_list = []
        for clm in clm_list:
            dt = df_new[clm].dtype
            if dt == 'int64' or dt == 'float64':
                pass
            else:
                binary_list.append(clm)

        binaryProcessed_list = selected_cols

        context = {
            'fName': fName,
            'processing_list': binary_list,
            'processed_list': binaryProcessed_list,
            'NaN_percent': NaN_percent,
            'status': 'Success',
            'message': 'Binary Encoding was done on selected features.'
        }
        return render(request, 'BinaryEncoding.html', context)
Example n. 6
def create_binary(df, col):
    assert isinstance(col, str)

    uniques = pd.DataFrame(df[col].unique(), columns=[col])
    enc = ce.BinaryEncoder(verbose=1, cols=[col])
    uniques = pd.concat([uniques, enc.fit_transform(uniques)], axis=1)
    return df.merge(uniques, how='left', on=col)
Example n. 7
def apply_binary_encoding(df, categorical_columns):
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    encoder = ce.BinaryEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    return X_transformed
Example n. 8
def encode_categorical(given_dataset):
    """Street,CentralAir -> binary feature encoding"""
    
    """LotShape,LandContour,LandSlope,BldgType,HouseStyle,ExterQual,ExterCond,Foundation,BsmtQual
    ,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,Electrical
    ,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual
    ,GarageCond,PoolQC,Fence-> ordinal feature encoding """
    
    """MSZoning,Alley,Utilities,LotConfig,Neighborhood,Condition1,Condition2,RoofStyle,RoofMatl,Exterior1st
    ,Exterior2nd,MasVnrType,Heating,Functional,PavedDrive,MiscFeature,SaleType
    ,SaleCondition -> nominal feature"""
    
    encoder1 = ce.BinaryEncoder(cols=['Street','CentralAir'], return_df=True)
    encoder2 = ce.OrdinalEncoder(cols=['MSZoning','Alley','LandSlope','BldgType','HouseStyle','ExterQual'
                                       ,'ExterCond','Foundation','BsmtQual','BsmtCond'
                                       ,'BsmtExposure','BsmtFinType1','BsmtFinType2'
                                       ,'HeatingQC','Electrical','KitchenQual'
                                       ,'FireplaceQu','GarageType','GarageFinish'
                                       ,'GarageQual','GarageCond','PoolQC','Fence'], return_df=True)
    encoder3 = ce.OneHotEncoder(cols=['LotShape','LandContour','Utilities','LotConfig','Neighborhood'
                                      ,'Condition1','Condition2','RoofStyle','RoofMatl','Exterior1st'
                                      ,'Exterior2nd','MasVnrType','Heating','PavedDrive'
                                      ,'MiscFeature','SaleType','SaleCondition','Functional'], return_df=True)

    # Apply the three encoders to the given dataset in sequence
    given_dataset = encoder1.fit_transform(given_dataset)
    given_dataset = encoder2.fit_transform(given_dataset)
    given_dataset = encoder3.fit_transform(given_dataset)
    return given_dataset
Example n. 9
def general_imputer(df):
    """
    general imputer + normalizer and binarizer
    
    """

    nulls_per_column = df.isnull().sum()
    print(nulls_per_column.loc[nulls_per_column > 0])

    # Create a boolean mask for categorical columns
    categorical_feature_mask = df.dtypes == object

    # Get list of categorical column names
    feat_cat = df.columns[categorical_feature_mask].tolist()
    # Get list of non-categorical column names
    feat_cont = df.columns[~categorical_feature_mask].tolist()

    mapper = DataFrameMapper(
        [([nf],
          Pipeline(steps=[('imp', SimpleImputer(
              strategy="median")), ('std', StandardScaler())]))
         for nf in feat_cont] + [(cf,
                                  Pipeline(steps=[
                                      ('imp', CategoricalImputer()),
                                      ('label', ce.BinaryEncoder()),
                                  ])) for cf in feat_cat],
        input_df=True,
        df_out=True,
        default=False)

    return mapper
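A short usage sketch under the snippet's own assumptions (sklearn_pandas providing DataFrameMapper and CategoricalImputer, plus scikit-learn and category_encoders): the mapper returned above still has to be fitted before it transforms anything.

mapper = general_imputer(df)              # df: any DataFrame mixing numeric and object columns
df_prepared = mapper.fit_transform(df)    # numerics imputed + scaled, categoricals imputed + binary-encoded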
    def test_inverse_transform_11(self):
        """
        Test binary encoding
        """
        train = pd.DataFrame({
            'city': ['chicago', 'paris'],
            'state': ['US', 'FR'],
            'other': ['A', np.nan]
        })

        test = pd.DataFrame({
            'city': ['chicago', 'paris', 'monaco'],
            'state': ['US', 'FR', 'FR'],
            'other': ['A', np.nan, 'B']
        })

        expected = pd.DataFrame({
            'city': ['chicago', 'paris', np.nan],
            'state': ['US', 'FR', 'FR'],
            'other': ['A', np.nan, 'B']
        })

        enc = ce.BinaryEncoder(cols=['city', 'state']).fit(train)
        result = enc.transform(test)
        original = inverse_transform(result, enc)
        pd.testing.assert_frame_equal(original, expected)
def featureEncodingUsingBinaryEncoder(dataSetForFeatureEncoding):
    print("****** Start binary encoding on the categorical features in the given dataset *****")

    labelName = getLabelName()
    #Extract the categorical features, leave the label
    categoricalColumnsInTheDataSet = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object'])
    #Get the names of the categorical features
    categoricalColumnNames = categoricalColumnsInTheDataSet.columns.values
 
    print("****** Number of features before binary encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)

    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    label = dataSetForFeatureEncoding[[labelName]]  # keep only the label column
    for feature in categoricalColumnNames:
        uniq = np.unique(dataSetForFeatureEncoding[feature])
        print('\n{}: {} '.format(feature,len(uniq)))
        printList(dataSetForFeatureEncoding[feature].unique(),'distinct values')
        featureColumns = dataSetForFeatureEncoding[[feature]]  # keep only the current feature column
        binaryEncoder = ce.BinaryEncoder(cols = [feature])
        binaryEncodedFeature = binaryEncoder.fit_transform(featureColumns, label)
        dataSetForFeatureEncoding = dataSetForFeatureEncoding.join(binaryEncodedFeature)
        dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(feature, axis=1)

    dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(labelName, axis=1)
    dataSetForFeatureEncoding[labelName] = label
    print("****** Number of features after binary encoding: ",len(dataSetForFeatureEncoding.columns))    
    
    print("****** End binary encoding on the categorical features in the given dataset *****\n")
    return dataSetForFeatureEncoding
Example n. 12
def processingPreModelling(df, catVarsDict={}):

    # Data Processing before Modelling

    # Bools to int
    df.isMobile = df.isMobile.astype(int)

    # Date objects to datetime format
    df.date = pd.to_datetime(df.date, format='%Y-%m-%d')

    for var, value in catVarsDict.items():

        if (value == 'BinaryEncoder'):
            encoder = category_encoders.BinaryEncoder(cols=[var],
                                                      drop_invariant=True,
                                                      return_df=True)
            df = encoder.fit_transform(df)
        elif (value == 'LabelEncoder'):
            df[var] = LabelEncoder().fit_transform(df[var])
        elif (value == 'OneHot'):
            encoder = category_encoders.one_hot.OneHotEncoder(
                cols=[var],
                drop_invariant=True,
                return_df=True,
                use_cat_names=True)
            df = encoder.fit_transform(df)

    return df
Example n. 13
def binary_encoding():
    source_bin_df = data_df
    encoder = ce.BinaryEncoder(cols=['decision'], drop_invariant=True)
    dfb = encoder.fit_transform(source_bin_df['decision'])
    source_bin_df = pd.concat([source_bin_df, dfb], axis=1)

    print(source_bin_df)
Example n. 14
def encode_vars_via_lookup(fset, feature_lookup, dfX, dfy=None):
    # import category_encoders as ce

    for var in fset:

        encoder = None
        vtype = feature_lookup.get(var, 'numeric')  # default numeric

        if vtype == 'ord':
            encoder = ce.OrdinalEncoder(cols=[var])
        elif vtype == 'cat':
            encoder = ce.OneHotEncoder(cols=[var])
        elif vtype in ('str', 'high_card'):  # high_card: categorical but with high cardinality
            encoder = ce.BinaryEncoder(cols=[var])  # ... or use ce.HashingEncoder()
        else:
            # assuming that the var is numeric
            pass

        # data imputation [todo]

        if encoder is not None:
            dfX = encoder.fit_transform(dfX, dfy)
    return dfX
    def test_inv_transform_ct_9(self):
        """
        test inv_transform_ct with Binary Encoder and passthrough option
        """
        y = pd.DataFrame(data=[0, 1], columns=['y'])

        train = pd.DataFrame({'city': ['chicago', 'paris'],
                              'state': ['US', 'FR'],
                              'other': ['A', 'B']})

        enc = ColumnTransformer(
            transformers=[
                ('binary', ce.BinaryEncoder(), ['city', 'state'])
            ],
            remainder='passthrough')
        enc.fit(train, y)
        test = pd.DataFrame({'city': ['chicago', 'chicago', 'paris'],
                             'state': ['US', 'FR', 'FR'],
                             'other': ['A', 'B', 'C']})

        expected = pd.DataFrame({'binary_city': ['chicago', 'chicago', 'paris'],
                                 'binary_state': ['US', 'FR', 'FR'],
                                 'other': ['A', 'B', 'C']})

        result = pd.DataFrame(enc.transform(test))
        result.columns = ['col1_0', 'col1_1', 'col2_0', 'col2_1', 'other']
        original = inverse_transform(result, enc)
        pd.testing.assert_frame_equal(original, expected)
Example n. 16
def dicho_nominales(data, cols=['var1', 'var2']):
    '''
    Binary-encode (dichotomise) the nominal categorical variables.
    '''
    enc = ce.BinaryEncoder(cols=cols)
    data_drop = enc.fit_transform(data)

    return data_drop
Example n. 17
def binary():
    X, _, _ = get_mushroom_data()
    print(X.info())
    enc = ce.BinaryEncoder()
    enc.fit(X, None)
    out = enc.transform(X)
    print(out.info())
    del enc, _, X, out
Example n. 18
 def fit(self, X, y=None):
     # we need the fit statement to accommodate the sklearn pipeline
     des_nom_DF = X[self.descreteVars_Nominal]
     # Map Nominal Categorical data to Numerical
     cat_nom_DF = des_nom_DF.fillna('NULL').astype(str)
     self.ce_binary = ce.BinaryEncoder()
     self.ce_binary.fit(cat_nom_DF)
     return self
 def test_inverse_transform_ce_binary(self):
     """
     Unit test inverse transform ce binary
     """
     preprocessing = ce.BinaryEncoder(cols=['Age', 'Sex'], return_df=True)
     fitted_dataset = preprocessing.fit_transform(self.ds_titanic_clean)
     output = inverse_transform(fitted_dataset, preprocessing)
     pd.testing.assert_frame_equal(output, self.ds_titanic_clean)
Example n. 20
def encode_categorical_features(df, encoder=None):
    '''
        Encode the categorical features using BinaryEncoder.
    '''
    if encoder is None:
        encoder = ce.BinaryEncoder(
            cols=[location_key, class_key, subdepartment_key])
        encoder.fit(df)
    return encoder.transform(df), encoder
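One way this helper is presumably intended to be used (location_key, class_key and subdepartment_key are module-level column names not shown in the snippet): fit the encoder once on the training frame, then pass it back in so later frames share the same mapping.

train_encoded, fitted_encoder = encode_categorical_features(train_df)            # fits a new BinaryEncoder
test_encoded, _ = encode_categorical_features(test_df, encoder=fitted_encoder)   # reuses the fitted encoder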
Example n. 21
 def create_features(self, df_train, df_test):
     encoder = ce.BinaryEncoder(cols=self.columns)
     encoder.fit(df_train[self.columns],
                 df_train[self.target_column].values.tolist())
     encoded_train = encoder.transform(df_train[self.columns])
     encoded_test = encoder.transform(df_test[self.columns])
     for column in encoded_train.columns:
         self.train[column + '_BinaryEncoder'] = encoded_train[column]
         self.test[column + '_BinaryEncoder'] = encoded_test[column]
Example n. 22
def load_and_clean(non_categorical,
                   categorical,
                   data_path="../data/with_stock_data_webclicks_linkedin.csv",
                   normalize=False,
                   binary_encode=False,
                   trend_features=True,
                   filter=False):

    frame = pd.read_csv(data_path)
    all_non_categorical = get_all_non_categorical()
    all_categorical = get_all_categorical()

    for non_c in non_categorical:
        if non_c not in all_non_categorical:
            raise ValueError("Non categorical {} is not valid".format(non_c))

    for c in categorical:
        if c not in all_categorical:
            raise ValueError("Categorical {} is not valid".format(non_c))

    X = pd.DataFrame({key: frame[key] for key in non_categorical})

    if trend_features:
        trends_data = google_trends_features()
        X = pd.concat([X, trends_data], axis=1)

    y = frame[[Y]]
    for cat in categorical:
        new_cols = None
        if binary_encode:
            ce_binary = ce.BinaryEncoder(cols=[cat])
            new_cols = ce_binary.fit_transform(frame[cat])
        else:
            new_cols = pd.get_dummies(frame[cat], prefix='category')
        X = pd.concat([X, new_cols], axis=1)
    X = clean_data(X)
    remove_nan(X, non_categorical)

    if filter:
        indices = X["current employee estimate"] > 300
        X = X[indices]
        y = y[indices]

    if (normalize):
        X.loc[:, :] = preprocessing.StandardScaler().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=.5,
                                                    random_state=1)

    return (X_train, y_train, X_dev, y_dev, X_test, y_test)
Example n. 23
def encode_categorical(df,encoder_name='binary'):
    encoder_dic = {"one_hot":ce.OneHotEncoder(),
                   "feature_hashing":ce.HashingEncoder(n_components=32),
                   "binary":ce.BinaryEncoder(),
                   "ordinal":ce.OrdinalEncoder(),
                   "polynomial":ce.PolynomialEncoder()}
    encoder = encoder_dic.get(encoder_name)
    encoder.fit(df)  # verbosity is a constructor argument, not a fit() argument
    df = encoder.transform(df)
    return df
Example n. 24
def main(params, inputs, outputs):
    columns_param = params.columns
    data = inputs.data
    data_new = outputs.data_new
    
    data_0 = pd.read_pickle(data)
    
    encoder = ce.BinaryEncoder(cols=columns_param.split(","))
    data_1 = encoder.fit_transform(data_0)
    
    data_1.to_pickle(data_new)
    def test_binary_np(self):
        """

        :return:
        """

        X = self.create_array(n_rows=1000)
        X_t = self.create_array(n_rows=100)

        enc = encoders.BinaryEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))
Example n. 26
def categorical_cols_train(data):
    try:
        print('Categorical Encoding')
        encoder = ce.BinaryEncoder(cols=[
            'employment_type', 'required_experience', 'required_education',
            'country'
        ])
        newdata = encoder.fit_transform(data)
        pickle.dump(encoder, open("model/encoder.p", "wb"))
        return newdata
    except Exception as e:
        handle('categorical column handling')
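A hedged sketch of the matching inference-time step, mirroring the pickle path used above; at prediction time the saved encoder should be reloaded and applied with transform rather than refitted.

def categorical_cols_predict(data):
    # Reload the encoder fitted in categorical_cols_train and apply it without refitting
    with open("model/encoder.p", "rb") as f:
        encoder = pickle.load(f)
    return encoder.transform(data)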
Example n. 27
    def preprocess(self) -> DataFrame:
        X_train = self.transaction_train_df_raw.drop('isFraud',
                                                     axis=1)  # 506691

        # num_test = 0.20
        # X_all = transaction_train_df_raw.drop('isFraud', axis=1)
        # Y_all = transaction_train_df_raw['isFraud']
        # X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all, test_size=num_test)

        # Preprocessing for numerical data
        numerical_transformer = SimpleImputer(strategy='mean')
        # Preprocessing for categorical data
        # to implement addr2, divide into 2 categories, then one-hot encoding
        onehot_categorical_transformer = Pipeline(
            steps=[('imputer', SimpleImputer(
                strategy='most_frequent')), ('onehot', ce.OneHotEncoder())])

        binary_categorical_transformer = Pipeline(
            steps=[('imputer', SimpleImputer(strategy='most_frequent')
                    ), ('binary', ce.BinaryEncoder(drop_invariant=False))])

        ordinary_categorical_transformer = Pipeline(
            steps=[('imputer', SimpleImputer(
                strategy='mean')), ('ordinary', ce.OrdinalEncoder())])

        # Bundle preprocessing for numerical and categorical data
        preprocessor_pipeline = ColumnTransformer(transformers=[
            ('num', numerical_transformer, [
                'TransactionDT', 'TransactionAmt', 'C1', 'C2', 'C3', 'C4',
                'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                'C14', 'D1', 'D4', 'D10', 'D15'
            ]),
            ('onehot_cat', onehot_categorical_transformer,
             ['ProductCD', 'card4', 'M6']),
            #         ('onehot_cat', onehot_categorical_transformer, ['ProductCD', 'card4', 'card6', 'M6']),
            #        binary encoder did not work, should be re-implemented
            #        ('binary_cat', binary_categorical_transformer, ['card3', 'card5', 'addr1', 'addr2']),
            ('binary_cat', binary_categorical_transformer, ['P_emaildomain']),
            ('ordinary_cat', ordinary_categorical_transformer,
             ['card1', 'card2'])
        ])

        # dataFrameMapper = DataFrameMapper([
        #         (['TransactionDT', 'TransactionAmt', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D4', 'D10', 'D15'], numerical_transformer),
        #         (['ProductCD', 'card4', 'M6'], onehot_categorical_transformer),
        #         #         ('onehot_cat', onehot_categorical_transformer, ['ProductCD', 'card4', 'card6', 'M6']),
        #         #        binary encoder did not work, should re implement
        #         #        ('binary_cat', binary_categorical_transformer, ['card3', 'card5', 'addr1', 'addr2']),
        #         (['P_emaildomain'], binary_categorical_transformer ),
        #         (['card1', 'card2'], ordinary_categorical_transformer)
        #     ]
        # )
        return preprocessor_pipeline.fit_transform(X_train)
    def test_binary(self):
        """

        :return:
        """

        cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)

        enc = encoders.BinaryEncoder(verbose=1, cols=cols)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.BinaryEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.BinaryEncoder(verbose=1, drop_invariant=True)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.BinaryEncoder(verbose=1, return_df=False)
        enc.fit(X, None)
        self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))

        # test inverse_transform
        X = self.create_dataset(n_rows=1000, has_none=False)
        X_t = self.create_dataset(n_rows=100, has_none=False)
        X_t_extra = self.create_dataset(n_rows=100,
                                        extras=True,
                                        has_none=False)

        enc = encoders.BinaryEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))
        self.verify_inverse_transform(
            X_t, enc.inverse_transform(enc.transform(X_t)))
        with self.assertRaises(ValueError):
            out = enc.inverse_transform(enc.transform(X_t_extra))
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: name of the category_encoders encoder to use
        :param columns_name: list of feature (column) names to encode
        """
        if encoder_type == "BackwardDe":  # backward difference encoding
            self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)

        elif encoder_type == "BaseN":  # base-N encoding
            self.encoder = ce.BaseNEncoder(cols=columns_name)

        elif encoder_type == "Binary":  # binary encoding
            self.encoder = ce.BinaryEncoder(cols=columns_name)

        elif encoder_type == "Catboost":
            self.encoder = ce.CatBoostEncoder(cols=columns_name)

        elif encoder_type == "Hash":
            self.encoder = ce.HashingEncoder(cols=columns_name)

        elif encoder_type == "Helmert":
            self.encoder = ce.HelmertEncoder(cols=columns_name)

        elif encoder_type == "JamesStein":
            self.encoder = ce.JamesSteinEncoder(cols=columns_name)

        elif encoder_type == "LOO":  # leave-one-out encoding
            self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)

        elif encoder_type == "ME":
            self.encoder = ce.MEstimateEncoder(cols=columns_name)  # M-estimate encoder

        elif encoder_type == "OneHot":
            self.encoder = ce.OneHotEncoder(cols=columns_name)

        elif encoder_type == "OridinalEncoder":  # ordinal encoding
            self.encoder = ce.OrdinalEncoder(cols=columns_name)

        elif encoder_type == "Sum":  # sum encoding
            self.encoder = ce.SumEncoder(cols=columns_name)

        elif encoder_type == "Polynomial":  # polynomial encoding
            self.encoder = ce.PolynomialEncoder(cols=columns_name)

        elif encoder_type == "Target":  # target encoding
            self.encoder = ce.TargetEncoder(cols=columns_name)

        elif encoder_type == "WOE":  # weight-of-evidence (WOE) encoder
            self.encoder = ce.WOEEncoder(cols=columns_name)

        else:
            raise ValueError("Please choose a valid encoder type")
Example n. 30
def main():
    cat_columns = [GENDER, HAIR_COLOR, WEARS_GLASSES, PROFESSION, COUNTRY]
    training = read_data(
        "tcd ml 2019-20 income prediction training (with labels).csv")
    training = preprocess_data(training, True)
    # print(training.head())

    testing = read_data(
        "tcd ml 2019-20 income prediction test (without labels).csv")
    testing = preprocess_data(testing, False)

    training = pd.get_dummies(training,
                              columns=[GENDER, HAIR_COLOR, WEARS_GLASSES])

    ce_bin = ce.BinaryEncoder(cols=[PROFESSION, COUNTRY])
    training = ce_bin.fit_transform(training)

    cat_dummies = [
        col for col in training
        if "_" in col and col.split("_")[0] in cat_columns
    ]
    # print(training.dtypes)

    # print(cat_dummies)

    processed_cols = list(training.columns[:])

    # testing = pd.get_dummies(testing, columns=cat_columns)
    testing = pd.get_dummies(testing,
                             columns=[GENDER, HAIR_COLOR, WEARS_GLASSES])

    # Reuse the encoder fitted on the training data; refitting on the test set
    # would produce a different category-to-binary mapping.
    testing = ce_bin.transform(testing)

    for col in testing.columns:
        if ("_" in col) and (col.split("_")[0]
                             in cat_columns) and col not in cat_dummies:
            print("Removing additional feature {}".format(col))
            testing.drop(col, axis=1, inplace=True)

    for col in cat_dummies:
        if col not in testing.columns:
            print("Adding missing feature {}".format(col))
            testing[col] = 0

    X = training.loc[:, training.columns != INCOME]
    X = X.loc[:, X.columns != "Instance"]

    y = training[INCOME]

    process_data(testing, training)