Example #1
    def process(self):
        df = pd.read_csv(self.filename,
                         names=self.headers,
                         na_values=["?"],
                         quotechar="'")
        obj_df = df.copy()
        # Process age {numeric}
        obj_df["age"] = obj_df["age"].fillna(0)
        # Process gender {f,m}
        obj_df = pd.get_dummies(obj_df, columns=["gender"], prefix=["is"])

        # Process ethnicity {White-European,Latino,Others,Black,Asian,'Middle Eastern ',Pasifika,'South Asian',Hispanic,Turkish,others}
        obj_df["ethnicity"] = obj_df["ethnicity"].fillna('')
        hee = HashingEncoder(cols=["ethnicity"])
        hee.fit(obj_df)
        obj_df = hee.transform(obj_df)

        # Process jundice {no,yes}
        # Process austim {no,yes}
        # Process used_app_before {no,yes}
        # Class/ASD {NO,YES}
        replace_bool = {
            "jundice": {
                "no": 0,
                "yes": 1
            },
            "austim": {
                "no": 0,
                "yes": 1
            },
            "used_app_before": {
                "no": 0,
                "yes": 1
            },
            "class": {
                "NO": 0,
                "YES": 1
            },
        }
        obj_df.replace(replace_bool, inplace=True)
        # Process contry_of_res {'United States',Brazil,Spain,Egypt,'New Zealand',Bahamas,Burundi,Austria,Argentina,Jordan,Ireland,'United Arab Emirates',Afghanistan,Lebanon,'United Kingdom','South Africa',Italy,Pakistan,Bangladesh,Chile,France,China,Australia,Canada,'Saudi Arabia',Netherlands,Romania,Sweden,Tonga,Oman,India,Philippines,'Sri Lanka','Sierra Leone',Ethiopia,'Viet Nam',Iran,'Costa Rica',Germany,Mexico,Russia,Armenia,Iceland,Nicaragua,'Hong Kong',Japan,Ukraine,Kazakhstan,AmericanSamoa,Uruguay,Serbia,Portugal,Malaysia,Ecuador,Niger,Belgium,Bolivia,Aruba,Finland,Turkey,Nepal,Indonesia,Angola,Azerbaijan,Iraq,'Czech Republic',Cyprus}
        obj_df["contry_of_res"] = obj_df["contry_of_res"].fillna('')
        hec = HashingEncoder(cols=["contry_of_res"])
        hec.fit(obj_df)
        obj_df = hec.transform(obj_df)

        # Process age_desc {'18 and more'}
        obj_df.drop(columns=["age_desc"], inplace=True)

        # Process relation {Self,Parent,'Health care professional',Relative,Others}
        obj_df["relation"] = obj_df["relation"].fillna('')
        lb_relation = LabelEncoder()
        obj_df["relation"] = lb_relation.fit_transform(obj_df["relation"])

        self.processed.target = np.array(obj_df["class"])
        self.processed.target_names = np.array(df["class"].unique())
        # Keep the encoded target column out of the feature matrix
        self.processed.data = obj_df.drop(columns=["class"]).values
        return self.processed
Example #2
class _HashingEncoderImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = SkHashingEncoder(**self._hyperparams)

    def fit(self, X, y=None):
        self._wrapped_model.fit(X, y)
        if isinstance(X, pd.DataFrame):
            self._X_columns = X.columns
        return self

    def transform(self, X):
        result = self._wrapped_model.transform(X)
        return result
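# Usage sketch (hypothetical data): the wrapper above simply forwards its
# hyperparameters to SkHashingEncoder, assumed here to be an alias of
# category_encoders.hashing.HashingEncoder.
import pandas as pd

X_demo = pd.DataFrame({"color": ["red", "blue", "red", "green"]})
enc = _HashingEncoderImpl(cols=["color"], n_components=4)
print(enc.fit(X_demo).transform(X_demo).head())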
Example #3
    def hash_encoder(self, df, configger):
        """

        :param df: the train dataset.
        :param configger: the json str of configger setting, the params means:
            verbose: int
                integer indicating verbosity of the output. 0 for none.
            cols: list
                a list of columns to encode, if None, all string columns will be encoded.
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance.
            return_df: bool
                boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
            hash_method: str
                which hashing method to use. Any method from hashlib works.
            max_process: int
                how many processes to use in transform(). Limited in range(1, 64).
                By default, it uses half of the logical CPUs.
                For example, 4C4T makes max_process=2, 4C8T makes max_process=4.
                Set it larger if you have a strong CPU.
                It is not recommended to set it larger than is the count of the
                logical CPUs as it will actually slow down the encoding.
            max_sample: int
                how many samples to encode by each process at a time.
                This setting is useful on low memory machines.
                By default, max_sample=(all samples num)/(max_process).
                For example, 4C8T CPU with 100,000 samples makes max_sample=25,000,
                6C12T CPU with 100,000 samples makes max_sample=16,666.
                It is not recommended to set it larger than the default value.

        :return: the transform result
        """
        X, y, encode_col = self.get_Xy(df, configger)

        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        max_process = set_default_vale("max_process", configger, 0)
        max_sample = set_default_vale("max_sample", configger, 0)
        n_components = set_default_vale("n_components", configger, 8)
        hash_method = set_default_vale("hash_method", configger, "md5")

        encoder = HashingEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant, return_df=True,
                                 max_process=max_process, max_sample=max_sample, n_components=n_components,
                                 hash_method=hash_method)

        res = encoder.fit_transform(X, y)

        return res
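# Minimal standalone sketch (made-up data) of the parameters documented in the
# docstring above, passed directly to category_encoders' HashingEncoder instead
# of through the configger JSON:
import pandas as pd
from category_encoders.hashing import HashingEncoder

df_demo = pd.DataFrame({"city": ["tokyo", "paris", "lima", "oslo", "tokyo"],
                        "y": [0, 1, 0, 1, 0]})
enc = HashingEncoder(cols=["city"], n_components=8, max_process=1,
                     max_sample=0, hash_method="md5", drop_invariant=False,
                     return_df=True)
print(enc.fit_transform(df_demo[["city"]], df_demo["y"]).head())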
Example #4
def train_test_fh():

    # Load the data
    df = pd.read_csv('data/dac_sample.txt', sep='\t', header=None)
    df.columns = ground_truth_column + integer_columns + categorical_columns

    df_train, df_test = data_handler.train_test_split(df, test_rate)

    # Under-sampling
    # We only need the indices that survive sampling, so pass dummy data for everything except the labels
    # This is dramatically faster
    sampled_indicies = data_handler.under_sampling(
        X=np.zeros((len(df_train), 1), dtype=np.uint8),
        y=df_train[ground_truth_column].values.astype(int))
    df_train = df_train.query('index in @sampled_indicies')

    # Handle NULL values
    df_train = data_handler.fillna_integer_feature(df_train, integer_columns)
    df_train = data_handler.fillna_categorical_feature(df_train,
                                                       categorical_columns)

    # Hashing
    hasher = HashingEncoder(cols=categorical_columns, n_components=n_hash_dims)
    df_train = hasher.fit_transform(df_train)

    # Training
    X_train = np.array(df_train.drop(ground_truth_column, axis=1).values)
    y_train = np.array(df_train[ground_truth_column].values)
    model = LogisticRegression(random_state=42, solver='lbfgs')
    model.fit(X_train, y_train)

    # Process the test data
    df_test = data_handler.fillna_integer_feature(df_test, integer_columns)
    df_test = data_handler.fillna_categorical_feature(df_test,
                                                      categorical_columns)
    df_test = hasher.transform(df_test)

    # Prediction
    X_test = np.array(df_test.drop(ground_truth_column, axis=1).values)
    y_test = np.array(df_test[ground_truth_column].values)
    y_proba = model.predict_proba(X_test)

    # Evaluation
    logloss = evaluator.logloss(y_test, y_proba[:, 1])
    print(logloss)
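# Sketch of the index-only under-sampling trick described in the comments above.
# data_handler is project-internal, so this assumes imbalanced-learn's
# RandomUnderSampler as a stand-in.
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

df_demo = pd.DataFrame({'label': [0] * 90 + [1] * 10, 'feat': range(100)})
sampler = RandomUnderSampler(random_state=42)
# Dummy features: only the surviving row indices matter, which keeps this fast.
sampler.fit_resample(np.zeros((len(df_demo), 1), dtype=np.uint8), df_demo['label'])
df_balanced = df_demo.iloc[sampler.sample_indices_]
print(df_balanced['label'].value_counts())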
Example #5
File: utils.py Project: pbr142/xai
def default_pipeline(X: pd.DataFrame,
                     high_cardinality_threshold: int = 11,
                     numeric_types=[int, float],
                     categorical_types='object') -> ColumnTransformer:
    """Create a pipeline to process a DataFrame for a scikit-learn model.
    
    * For numeric features, standardization is applied.
    * For low cardinality categorical features, one-hot encoding is applied.
    * For high cardinality categorical features, hashing encoding is applied.

    Args:
        X (pd.DataFrame): DataFrame with features
        high_cardinality_threshold (int, optional): Thresholds for number of categories to distinguish high cardinality and low cardinality categorical features. Defaults to 11.
        numeric_types (list, optional): Types to identify numeric features. Defaults to [int, float].
        categorical_types (str, optional): Types to identify categorical features. Defaults to 'object'.

    Returns:
        ColumnTransformer: transformer that applies the appropriate pipeline to each group of columns.
    """

    # define columns
    numeric_columns = X.select_dtypes(numeric_types).columns
    categorical_columns = X.select_dtypes(categorical_types).columns
    idx_high_cardinality = np.array([
        len(X[col].unique()) >= high_cardinality_threshold
        for col in categorical_columns
    ])
    high_cardinality_columns = categorical_columns[idx_high_cardinality]
    low_cardinality_columns = categorical_columns[~idx_high_cardinality]

    # define pipelines
    numeric_pipeline = make_pipeline(StandardScaler())
    low_cardinality_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore'))
    high_cardinality_pipeline = make_pipeline(HashingEncoder(return_df=False))
    feature_pipeline = ColumnTransformer([
        ('numeric', numeric_pipeline, numeric_columns),
        ('low_cardinality', low_cardinality_pipeline, low_cardinality_columns),
        ('high_cardinality', high_cardinality_pipeline,
         high_cardinality_columns)
    ],
                                         remainder='passthrough')

    return feature_pipeline
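# Usage sketch (toy data, hypothetical column names) of the transformer built by
# default_pipeline; assumes the same imports the function itself relies on.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

X_demo = pd.DataFrame({
    "age": [23, 45, 31, 52],
    "city": ["a", "b", "a", "b"],          # low cardinality -> one-hot
    "user_id": ["u1", "u2", "u3", "u4"],   # high cardinality -> hashing
})
y_demo = [0, 1, 0, 1]

features = default_pipeline(X_demo, high_cardinality_threshold=3)
clf = make_pipeline(features, LogisticRegression())
clf.fit(X_demo, y_demo)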
Example #6
    def encode(self, incoming_data):
        print('\nencode')
        print('incoming_data.shape: {}'.format(incoming_data.shape))

        data = pd.DataFrame(incoming_data)
        result = HashingEncoder.hashing_trick(data, N=self._N,
                                              cols=self._cols).values
        print('result.shape: {}\n'.format(result.shape))
        return result


# def hash_encoding(categorical_or_mvc_data, labels=None, encoder=None):
#     print('\nhash')
#     result = np.array([])
#     if labels is None and encoder is not None:  # predict
#         result = encoder.transform(categorical_or_mvc_data)
#     else:  # fit
#         encoder = HashingEncoder(cols=list(range(categorical_or_mvc_data.shape[1])), n_components=10)
#         result = encoder.fit_transform(categorical_or_mvc_data, labels)
#
#     print('result.shape: {}\n'.format(result.shape))
#     return result
Example #7
    def create_tabular_dataset(self, train, prc=10, verbose=False):
        def pairit(data):
            p1 = pf.pair(data["col_0"], data["col_1"])
            p2 = pf.pair(p1, data["col_2"])
            return pf.pair(p1, p2)

        def code_by_freq(data):
            code = np.where(top_val_count[:, 1] == data['col'])[0][0]
            return code

        ce_hash = HashingEncoder(cols=list(train.columns[12:]),
                                 n_components=3,
                                 verbose=1,
                                 drop_invariant=True,
                                 hash_method='md5')
        tmp_data = ce_hash.fit_transform(train[train.columns[2:]])
        tmp_data["Label"] = train["Label"]
        train = tmp_data

        train["col"] = train.apply(pairit, axis=1)

        bc = np.bincount(train["col"].values.astype(int))
        nz = np.nonzero(bc)[0]
        val_count = np.sort(np.array([list(a) for a in zip(nz, bc[nz])]))
        val_count = val_count[val_count[:, 0].argsort()]
        vc_mean = val_count[:, 0].mean()
        top_val_count = (val_count[val_count[:, 0] > vc_mean][::-1])

        train = train.drop(columns=["col_0", "col_1", "col_2"])
        train = train[train["col"].isin(top_val_count[:, 1][:prc])]

        train["prc"] = train.apply(code_by_freq, axis=1)
        train = train.drop(columns=["col"])
        if verbose:
            print(train["prc"].value_counts())

        tmp = list(train.columns[:-2])
        tmp.extend(["prc", "Label"])
        train = train.reindex(columns=tmp)
        train = train.rename(index=str,
                             columns={
                                 "I2": "f1",
                                 "I3": "f2",
                                 "I4": "f3",
                                 "I5": "f4",
                                 "I6": "f5",
                                 "I7": "f6",
                                 "I8": "f7",
                                 "I9": "f8",
                                 "I11": "f9",
                                 "I13": "f10"
                             })

        if verbose:
            print(train.head())

        train = train.drop(columns=["Label"])
        train = train.reset_index(drop=True)
        train.head()

        X = train[train.columns[:-1]]
        y = train[train.columns[-1]]

        return X, y
Example #8
#OrdinalEncoding for categories which have an order (example: low/medium/high)
map_dict = {'low': 0, 'medium': 1, 'high': 2}
df['var_oe'] = df['var'].apply(lambda x: map_dict[x])
#We can also do it with sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['var_oe'] = le.fit_transform(df['var'])

#BinaryEncoder: when a variable has many categories, OHE would create many columns.
#Binary encoding conveys the same information with far fewer columns by using binary numbers.
#Use it only when the categorical variable has high cardinality.
from category_encoders.binary import BinaryEncoder
be = BinaryEncoder(cols=['var'])
df = be.fit_transform(df)

#HashingEncoder
from category_encoders.hashing import HashingEncoder
he = HashingEncoder(cols=['var'])
df = he.fit_transform(df)

#Feature selection: Drop attributes that provide no useful information for the task
#Unsupervised Feature selection before training a model
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X, Y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(df.columns)

featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns

print(featureScores.nlargest(5, 'Score'))
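#Sketch (made-up data): compare the output width of one-hot, binary, and hashing
#encodings for a single high-cardinality column.
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.hashing import HashingEncoder

demo = pd.DataFrame({'var': ['cat_%d' % i for i in range(100)]})
print(pd.get_dummies(demo, columns=['var']).shape[1])                             # 100 one-hot columns
print(BinaryEncoder(cols=['var']).fit_transform(demo).shape[1])                   # ~7 binary columns
print(HashingEncoder(cols=['var'], n_components=8).fit_transform(demo).shape[1])  # 8 hashed columns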
Example #9
    def doPreProcessing(self):
        # Correction of labels

        samples = self.data.copy()

        traffic_labels = samples['Label'].unique()
        traffic_type_labels = samples['Label.1'].unique()

        # Assign via .loc with a row mask and column label to avoid chained assignment
        samples.loc[samples['Label.1'] == 'AUDIO-STREAMING', 'Label.1'] = 'Audio-Streaming'
        samples.loc[samples['Label.1'] == 'File-transfer', 'Label.1'] = 'File-Transfer'
        samples.loc[samples['Label.1'] == 'Video-streaming', 'Label.1'] = 'Video-Streaming'

        traffic_type_labels = samples['Label.1'].unique()

        samples.loc[(samples['Label'] == 'Non-Tor') |
                    (samples['Label'] == 'NonVPN'), 'Label'] = 'Benign'
        samples.loc[(samples['Label'] == 'Tor') |
                    (samples['Label'] == 'VPN'), 'Label'] = 'Darknet'

        traffic_type_labels = samples['Label'].unique()

        hours = []
        for timestamp in samples['Timestamp']:
            hour = int(timestamp.split()[1].split(':')[0])
            hours.append(hour)
        samples['hour'] = hours

        ips_grams = {
            'src': {
                'one': [],
                'two': [],
                'three': []
            },
            'dst': {
                'one': [],
                'two': [],
                'three': []
            },
        }

        for src_ip, dst_ip in zip(samples['Src IP'], samples['Dst IP']):
            src_one, src_two, src_three = createGrams(src_ip)
            ips_grams['src']['one'].append(src_one)
            ips_grams['src']['two'].append(src_two)
            ips_grams['src']['three'].append(src_three)

            dst_one, dst_two, dst_three = createGrams(dst_ip)
            ips_grams['dst']['one'].append(dst_one)
            ips_grams['dst']['two'].append(dst_two)
            ips_grams['dst']['three'].append(dst_three)

        samples['src_ip_1gram'] = ips_grams['src']['one']
        samples['src_ip_2gram'] = ips_grams['src']['two']
        samples['src_ip_3gram'] = ips_grams['src']['three']

        samples['dst_ip_1gram'] = ips_grams['dst']['one']
        samples['dst_ip_2gram'] = ips_grams['dst']['two']
        samples['dst_ip_3gram'] = ips_grams['dst']['three']
        print(
            samples[["Src IP", "src_ip_1gram", "src_ip_2gram",
                     "src_ip_3gram"]][200:205])
        print(
            samples[["Dst IP", "dst_ip_1gram", "dst_ip_2gram",
                     "dst_ip_3gram"]][:5])

        ips = np.concatenate(
            (samples['Src IP'].unique(), samples['Dst IP'].unique()))
        cat_ip_info = CatIPInformation("de30fe3213f197", ips)
        ips_dict = cat_ip_info.getIpsDict()

        ips_tuple = zip(samples['Src IP'], samples['Dst IP'])

        dst_ip_country = []
        src_ip_country = []
        src_bogon = []
        dst_bogon = []

        for src_ip, dst_ip in tqdm(ips_tuple, total=len(samples['Src IP'])):
            if 'country' in ips_dict[dst_ip].keys():
                dst_ip_country.append(ips_dict[dst_ip]['country'])
            else:
                dst_ip_country.append('')

            if 'country' in ips_dict[src_ip].keys():
                src_ip_country.append(ips_dict[src_ip]['country'])
            else:
                src_ip_country.append('')

            if 'bogon' in ips_dict[dst_ip].keys():
                dst_bogon.append(ips_dict[dst_ip]['bogon'])
            else:
                dst_bogon.append(False)

            if 'bogon' in ips_dict[src_ip].keys():
                src_bogon.append(ips_dict[src_ip]['bogon'])
            else:
                src_bogon.append(False)

        samples['dst_ip_country'] = dst_ip_country
        samples['src_ip_country'] = src_ip_country
        samples['dst_bogon'] = dst_bogon
        samples['src_bogon'] = src_bogon

        real_columns = [
            'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
            'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
            'Fwd Packet Length Max', 'Fwd Packet Length Min',
            'Fwd Packet Length Mean', 'Fwd Packet Length Std',
            'Bwd Packet Length Max', 'Bwd Packet Length Min',
            'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
            'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
            'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
            'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
            'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
            'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
            'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
            'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max',
            'Packet Length Mean', 'Packet Length Std',
            'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
            'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
            'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count',
            'Down/Up Ratio', 'Average Packet Size', 'Fwd Segment Size Avg',
            'Bwd Segment Size Avg', 'Fwd Bytes/Bulk Avg',
            'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg', 'Bwd Bytes/Bulk Avg',
            'Bwd Packet/Bulk Avg', 'Bwd Bulk Rate Avg', 'Subflow Fwd Packets',
            'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
            'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Fwd Act Data Pkts',
            'Fwd Seg Size Min'
        ]
        is_na_cols = samples.columns[samples.isna().sum() > 0]
        print(samples.isna().sum()[is_na_cols])

        samples = samples.dropna()
        print(samples.isna().sum()[is_na_cols])

        samples[real_columns] = samples[real_columns].astype(np.float64)
        samples[real_columns] = samples[real_columns].replace(
            [np.inf, -np.inf], np.nan)
        # Drop rows where the numeric columns became NaN after the inf replacement
        samples = samples.dropna(subset=real_columns)

        model_samples = samples.copy()

        del model_samples['Flow ID']
        del model_samples['Timestamp']
        del model_samples['Src IP']
        del model_samples['Dst IP']

        cols = np.concatenate(
            (model_samples.columns[81:], model_samples.columns[:81]))
        model_samples = model_samples[cols]

        hash_enc_cols = [
            'src_ip_1gram', 'src_ip_2gram', 'src_ip_3gram', 'dst_ip_1gram',
            'dst_ip_2gram', 'dst_ip_3gram'
        ]
        ord_enc_cols = ['src_ip_country', 'dst_ip_country']

        print("[!] - Encoding Data. May take a while to process")
        hash_enc = HashingEncoder(cols=hash_enc_cols,
                                  n_components=100).fit(model_samples)
        model_samples = hash_enc.transform(model_samples)
        print(model_samples.head())

        ord_enc = OrdinalEncoder()
        ord_enc.fit(model_samples[ord_enc_cols])
        model_samples[ord_enc_cols] = ord_enc.transform(
            model_samples[ord_enc_cols])
        model_samples[ord_enc_cols] = model_samples[ord_enc_cols].astype(int)

        # scaler = StandardScaler().fit(model_samples[real_columns])
        # model_samples[real_columns] = scaler.transform(model_samples[real_columns])
        # print(model_samples[real_columns].head())

        model_samples['src_bogon'] = np.where(model_samples['src_bogon'], 1, 0)
        model_samples['dst_bogon'] = np.where(model_samples['dst_bogon'], 1, 0)

        self.samples = samples.dropna()
        self.model_samples = model_samples.dropna()

        self.model_samples.columns = self.model_samples.columns.str.replace(
            ' ', '_')

        print(samples[samples.columns[samples.isna().sum() > 0]].isna().sum())
Example #10
import datetime
#import xlearn as xl
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
import pandas as pd
from category_encoders.hashing import HashingEncoder

df_X = pd.DataFrame([
    1, 2, 3, 4, 1, 2, 4, 5, 8, 7, 66, 2, 24, 5, 4, 1, 2, 111, 1, 31, 3, 23, 13,
    24
],
                    columns=list("A"))

he = HashingEncoder(cols=["A"], return_df=True)
df_X = he.fit_transform(df_X)
print(df_X.head())
Example #11
dfReduced = df.iloc[0:1000, :]
del df
dfReduced.columns = dfReduced.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')
# Removing the "Infinity" strings from the columns that contain them
#df.drop(df.loc[(df['flow_bytes/s']=="Infinity")| (df['flow_packets/s']=="Infinity")].index,inplace=True)
listOfPositions = getIndexes(pd.DataFrame(dfReduced['flow_bytes/s']),
                             "Infinity")
dfReduced = dfReduced.drop(listOfPositions)
# Removing missing values
dfReduced = dfReduced.dropna()

dfReduced['destination_port'] = dfReduced['destination_port'].astype(
    'category')
# converting categorical to numerical
dfReduced['destination_port'] = dfReduced['destination_port'].cat.codes
destPorts = dfReduced['destination_port'].value_counts()

dfReduced['destination_port'] = pd.DataFrame(
    dfReduced['destination_port']).applymap(str)
h = FeatureHasher(n_features=20, input_type="string")
f = h.transform(dfReduced['destination_port'])
a = f.toarray()

X = dfReduced.iloc[:, 0:78]
y = dfReduced.iloc[:, -1]

he = HashingEncoder(cols=["destination_port"]).fit(X, y)
data = he.transform(X)
print(data.info())
Example #12
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')
accuracy score is 0.359550
Precision score:  0.35954967968848134
Recall score:  0.35954967968848134
'''

# Code 2 with hashing encoder method and logistic regression & lgbm
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from category_encoders.hashing import HashingEncoder
X_train1, X_test1, y_train1, y_test1 = train_test_split(train[feature1], train['Stay'], test_size=0.20, shuffle=True)
he = HashingEncoder(cols=['Ward_Type', 'Type of Admission', 'Available Extra Rooms in Hospital', 'Visitors with Patient']).fit(X_train1, y_train1)
data = he.transform(X_train1)
data_test = he.transform(X_test1)
print(data.head(20))
'''
#output

        col_0  col_1  col_2  col_3  col_4  col_5  col_6  col_7
225917      1      0      0      0      2      0      1      0
204389      0      0      0      2      1      0      1      0
60523       0      0      0      1      1      1      1      0
32187       0      0      0      1      2      0      1      0
103972      0      0      0      1      2      0      1      0
211224      1      0      0      0      2      0      1      0
88155       0      0      0      3      0      0      1      0
104466      0      0      0      1      2      0      1      0
Example #13
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = SkHashingEncoder(**self._hyperparams)