def process(self):
    df = pd.read_csv(self.filename, names=self.headers, na_values=["?"], quotechar="'")
    obj_df = df.copy()

    # Process age {numeric}
    obj_df["age"] = obj_df["age"].fillna(0)

    # Process gender {f,m}
    obj_df = pd.get_dummies(obj_df, columns=["gender"], prefix=["is"])

    # Process ethnicity {White-European,Latino,Others,Black,Asian,'Middle Eastern ',Pasifika,'South Asian',Hispanic,Turkish,others}
    obj_df["ethnicity"] = obj_df["ethnicity"].fillna('')
    hee = HashingEncoder(cols=["ethnicity"])
    hee.fit(obj_df)
    obj_df = hee.transform(obj_df)

    # Process jundice {no,yes}
    # Process austim {no,yes}
    # Process used_app_before {no,yes}
    # Class/ASD {NO,YES}
    replace_bool = {
        "jundice": {"no": 0, "yes": 1},
        "austim": {"no": 0, "yes": 1},
        "used_app_before": {"no": 0, "yes": 1},
        "class": {"NO": 0, "YES": 1},
    }
    obj_df.replace(replace_bool, inplace=True)

    # Process contry_of_res {'United States',Brazil,Spain,Egypt,'New Zealand',Bahamas,Burundi,Austria,Argentina,Jordan,Ireland,'United Arab Emirates',Afghanistan,Lebanon,'United Kingdom','South Africa',Italy,Pakistan,Bangladesh,Chile,France,China,Australia,Canada,'Saudi Arabia',Netherlands,Romania,Sweden,Tonga,Oman,India,Philippines,'Sri Lanka','Sierra Leone',Ethiopia,'Viet Nam',Iran,'Costa Rica',Germany,Mexico,Russia,Armenia,Iceland,Nicaragua,'Hong Kong',Japan,Ukraine,Kazakhstan,AmericanSamoa,Uruguay,Serbia,Portugal,Malaysia,Ecuador,Niger,Belgium,Bolivia,Aruba,Finland,Turkey,Nepal,Indonesia,Angola,Azerbaijan,Iraq,'Czech Republic',Cyprus}
    obj_df["contry_of_res"] = obj_df["contry_of_res"].fillna('')
    hec = HashingEncoder(cols=["contry_of_res"])
    hec.fit(obj_df)
    obj_df = hec.transform(obj_df)

    # Process age_desc {'18 and more'}
    obj_df.drop(columns=["age_desc"], inplace=True)

    # Process relation {Self,Parent,'Health care professional',Relative,Others}
    obj_df["relation"] = obj_df["relation"].fillna('')
    lb_relation = LabelEncoder()
    obj_df["relation"] = lb_relation.fit_transform(obj_df["relation"])

    self.processed.data = obj_df.values
    self.processed.target = np.array(obj_df["class"])
    self.processed.target_names = np.array(df["class"].unique())
    return self.processed
class _HashingEncoderImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = SkHashingEncoder(**self._hyperparams)

    def fit(self, X, y=None):
        self._wrapped_model.fit(X, y)
        if isinstance(X, pd.DataFrame):
            self._X_columns = X.columns
        return self

    def transform(self, X):
        result = self._wrapped_model.transform(X)
        return result
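A minimal usage sketch, assuming SkHashingEncoder is category_encoders' HashingEncoder imported under that alias (the toy DataFrame below is made up for illustration):

# Illustrative only: assumes SkHashingEncoder == category_encoders.hashing.HashingEncoder
import pandas as pd

toy = pd.DataFrame({"color": ["red", "green", "blue", "red"]})
enc = _HashingEncoderImpl(cols=["color"], n_components=4)
hashed = enc.fit(toy).transform(toy)  # DataFrame with hashed columns col_0..col_3
print(hashed.head())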
def hash_encoder(self, df, configger):
    """
    :param df: the train dataset.
    :param configger: the JSON string of the config settings; the params mean:
        verbose: int
            integer indicating verbosity of the output. 0 for none.
        cols: list
            a list of columns to encode; if None, all string columns will be encoded.
        drop_invariant: bool
            boolean for whether or not to drop columns with 0 variance.
        return_df: bool
            boolean for whether to return a pandas DataFrame from transform
            (otherwise it will be a numpy array).
        hash_method: str
            which hashing method to use. Any method from hashlib works.
        max_process: int
            how many processes to use in transform(). Limited to range(1, 64).
            By default, it uses half of the logical CPUs. For example, 4C4T makes
            max_process=2 and 4C8T makes max_process=4. Set it larger if you have a
            strong CPU. It is not recommended to set it larger than the count of the
            logical CPUs, as that will actually slow down the encoding.
        max_sample: int
            how many samples each process encodes at a time. This setting is useful
            on low-memory machines. By default, max_sample=(number of samples)/(max_process).
            For example, a 4C8T CPU with 100,000 samples makes max_sample=25,000, and a
            6C12T CPU with 100,000 samples makes max_sample=16,666. It is not recommended
            to set it larger than the default value.
    :return: the transform result
    """
    X, y, encode_col = self.get_Xy(df, configger)

    drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
    max_process = set_default_vale("max_process", configger, 0)
    max_sample = set_default_vale("max_sample", configger, 0)
    n_components = set_default_vale("n_components", configger, 8)
    hash_method = set_default_vale("hash_method", configger, "md5")

    encoder = HashingEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant,
                             return_df=True, max_process=max_process, max_sample=max_sample,
                             n_components=n_components, hash_method=hash_method)
    res = encoder.fit_transform(X, y)
    return res
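For reference, a minimal direct-use sketch outside this project's configger/get_Xy machinery, showing how the knobs described in the docstring map onto category_encoders' HashingEncoder (the toy data is made up):

# Illustrative only: direct call to category_encoders, not the project's config path
import pandas as pd
from category_encoders.hashing import HashingEncoder

toy = pd.DataFrame({"city": ["tokyo", "paris", "lima", "tokyo"]})
enc = HashingEncoder(cols=["city"], n_components=4, hash_method="md5", max_process=1)
print(enc.fit_transform(toy))  # four hashed columns col_0..col_3, one "hit" per row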
def train_test_fh():
    # Load the data
    df = pd.read_csv('data/dac_sample.txt', sep='\t', header=None)
    df.columns = ground_truth_column + integer_columns + categorical_columns
    df_train, df_test = data_handler.train_test_split(df, test_rate)

    # Under-sampling
    # We only need the post-sampling indices, so pass dummy data for everything
    # except the labels -- this is dramatically faster
    sampled_indicies = data_handler.under_sampling(
        X=np.zeros((len(df_train), 1), dtype=np.uint8),
        y=df_train[ground_truth_column].values.astype(int))
    df_train = df_train.query('index in @sampled_indicies')

    # Handle NULL values
    df_train = data_handler.fillna_integer_feature(df_train, integer_columns)
    df_train = data_handler.fillna_categorical_feature(df_train, categorical_columns)

    # Hashing
    hasher = HashingEncoder(cols=categorical_columns, n_components=n_hash_dims)
    df_train = hasher.fit_transform(df_train)

    # Train
    X_train = np.array(df_train.drop(ground_truth_column, axis=1).values)
    y_train = np.array(df_train[ground_truth_column].values)
    model = LogisticRegression(random_state=42, solver='lbfgs')
    model.fit(X_train, y_train)

    # Process the test data with the encoder fitted on the training data
    df_test = data_handler.fillna_integer_feature(df_test, integer_columns)
    df_test = data_handler.fillna_categorical_feature(df_test, categorical_columns)
    df_test = hasher.transform(df_test)

    # Predict
    X_test = np.array(df_test.drop(ground_truth_column, axis=1).values)
    y_test = np.array(df_test[ground_truth_column].values)
    y_proba = model.predict_proba(X_test)

    # Evaluate
    logloss = evaluator.logloss(y_test, y_proba[:, 1])
    print(logloss)
def default_pipeline(X: pd.DataFrame,
                     high_cardinality_threshold: int = 11,
                     numeric_types=[int, float],
                     categorical_types='object') -> ColumnTransformer:
    """Create a pipeline to process a DataFrame for a scikit-learn model.

    * For numeric features, standardization is applied.
    * For low cardinality categorical features, one-hot encoding is applied.
    * For high cardinality categorical features, hashing encoding is applied.

    Args:
        X (pd.DataFrame): DataFrame with features.
        high_cardinality_threshold (int, optional): Threshold on the number of
            categories used to distinguish high-cardinality from low-cardinality
            categorical features. Defaults to 11.
        numeric_types (list, optional): Types used to identify numeric features.
            Defaults to [int, float].
        categorical_types (str, optional): Types used to identify categorical
            features. Defaults to 'object'.

    Returns:
        ColumnTransformer: the assembled feature-preprocessing transformer.
    """
    # define columns
    numeric_columns = X.select_dtypes(numeric_types).columns
    categorical_columns = X.select_dtypes(categorical_types).columns
    idx_high_cardinality = np.array([
        len(X[col].unique()) >= high_cardinality_threshold
        for col in categorical_columns
    ])
    high_cardinality_columns = categorical_columns[idx_high_cardinality]
    low_cardinality_columns = categorical_columns[~idx_high_cardinality]

    # define pipelines
    numeric_pipeline = make_pipeline(StandardScaler())
    low_cardinality_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
    high_cardinality_pipeline = make_pipeline(HashingEncoder(return_df=False))
    feature_pipeline = ColumnTransformer([
        ('numeric', numeric_pipeline, numeric_columns),
        ('low_cardinality', low_cardinality_pipeline, low_cardinality_columns),
        ('high_cardinality', high_cardinality_pipeline, high_cardinality_columns)
    ], remainder='passthrough')
    return feature_pipeline
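A hypothetical usage sketch of the returned ColumnTransformer composed with an estimator; the toy data, column names, and the LogisticRegression placeholder are made up for illustration:

# Illustrative only: toy data, placeholder estimator
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

X = pd.DataFrame({
    "age": [25, 32, 47, 51, 38],                               # numeric -> standardized
    "city": ["Tokyo", "Paris", "Tokyo", "Paris", "Tokyo"],     # 2 categories -> one-hot
    "user_id": ["u1", "u2", "u3", "u4", "u5"],                 # 5 categories -> hashing
})
y = [0, 1, 0, 1, 0]

clf = make_pipeline(default_pipeline(X, high_cardinality_threshold=5), LogisticRegression())
clf.fit(X, y)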
def encode(self, incoming_data):
    print('\nencode')
    print('incoming_data.shape: {}'.format(incoming_data.shape))
    data = pd.DataFrame(incoming_data)
    result = HashingEncoder.hashing_trick(data, N=self._N, cols=self._cols).values
    print('result.shape: {}\n'.format(result.shape))
    return result

# def hash_encoding(categorical_or_mvc_data, labels=None, encoder=None):
#     print('\nhash')
#     result = np.array([])
#     if labels is None and encoder is not None:  # predict
#         result = encoder.transform(categorical_or_mvc_data)
#     else:  # fit
#         encoder = HashingEncoder(cols=list(range(categorical_or_mvc_data.shape[1])), n_components=10)
#         result = encoder.fit_transform(categorical_or_mvc_data, labels)
#     print('result.shape: {}\n'.format(result.shape))
#     return result
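A small sketch of why the commented-out fit/predict split above is optional for hashing: the hashing trick is stateless, so category values never seen during fit are still mapped to a bucket at transform time (the toy data is made up for illustration):

# Illustrative only: unseen categories are still hashed, no refit needed
import pandas as pd
from category_encoders.hashing import HashingEncoder

enc = HashingEncoder(cols=["dev"], n_components=4)
enc.fit(pd.DataFrame({"dev": ["ios", "android"]}))
print(enc.transform(pd.DataFrame({"dev": ["windows"]})))  # unseen value lands in a bucket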
def create_tabular_dataset(self, train, prc=10, verbose=False):
    def pairit(data):
        p1 = pf.pair(data["col_0"], data["col_1"])
        p2 = pf.pair(p1, data["col_2"])
        return pf.pair(p1, p2)

    def code_by_freq(data):
        code = np.where(top_val_count[:, 1] == data['col'])[0][0]
        return code

    ce_hash = HashingEncoder(cols=list(train.columns[12:]), n_components=3,
                             verbose=1, drop_invariant=True, hash_method='md5')
    tmp_data = ce_hash.fit_transform(train[train.columns[2:]])
    tmp_data["Label"] = train["Label"]
    train = tmp_data

    train["col"] = train.apply(pairit, axis=1)
    bc = np.bincount(train["col"].values.astype(int))
    nz = np.nonzero(bc)[0]
    val_count = np.sort(np.array([list(a) for a in zip(nz, bc[nz])]))
    val_count = val_count[val_count[:, 0].argsort()]
    vc_mean = val_count[:, 0].mean()
    top_val_count = (val_count[val_count[:, 0] > vc_mean][::-1])

    train = train.drop(columns=["col_0", "col_1", "col_2"])
    train = train[train["col"].isin(top_val_count[:, 1][:prc])]
    train["prc"] = train.apply(code_by_freq, axis=1)
    train = train.drop(columns=["col"])
    if verbose:
        print(train["prc"].value_counts())

    tmp = list(train.columns[:-2])
    tmp.extend(["prc", "Label"])
    train = train.reindex(columns=tmp)
    train = train.rename(index=str, columns={
        "I2": "f1", "I3": "f2", "I4": "f3", "I5": "f4", "I6": "f5",
        "I7": "f6", "I8": "f7", "I9": "f8", "I11": "f9", "I13": "f10"
    })
    if verbose:
        print(train.head())

    train = train.drop(columns=["Label"])
    train = train.reset_index(drop=True)

    X = train[train.columns[:-1]]
    y = train[train.columns[-1]]
    return X, y
# OrdinalEncoding for categories which have an order (example: low/medium/high)
map_dict = {'low': 0, 'medium': 1, 'high': 2}
df['var_oe'] = df['var'].apply(lambda x: map_dict[x])

# We can also do it with sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['var_oe'] = le.fit_transform(df['var'])

# BinaryEncoder: when a variable has many categories, OHE would create many columns.
# Binary encoding achieves the same with far fewer columns by using binary numbers.
# Use it only when there is high cardinality in the categorical variable.
from category_encoders.binary import BinaryEncoder
be = BinaryEncoder(cols=['var'])
df = be.fit_transform(df)

# HashingEncoder
from category_encoders.hashing import HashingEncoder
he = HashingEncoder(cols=['var'])
df = he.fit_transform(df)

# Feature selection: drop attributes that provide no useful information for the task
# Univariate (filter) feature selection before training a model
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X, Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(df.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(5, 'Score'))
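A quick illustration of the cardinality trade-off (toy data, not from the original source): with ~1,000 distinct categories, one-hot would add ~1,000 columns, BinaryEncoder roughly 10 (about log2 of the category count), and HashingEncoder a fixed n_components (default 8), at the cost of possible hash collisions:

# Illustrative only: made-up toy column with 1,000 distinct categories
import pandas as pd
from category_encoders.hashing import HashingEncoder

toy = pd.DataFrame({'var': [f'cat_{i}' for i in range(1000)]})
print(HashingEncoder(cols=['var'], n_components=8).fit_transform(toy).shape)  # (1000, 8)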
def doPreProcessing(self):
    # Correction of labels
    samples = self.data.copy()
    traffic_labels = samples['Label'].unique()
    traffic_type_labels = samples['Label.1'].unique()

    samples['Label.1'].loc[samples['Label.1'] == 'AUDIO-STREAMING'] = 'Audio-Streaming'
    samples['Label.1'].loc[samples['Label.1'] == 'File-transfer'] = 'File-Transfer'
    samples['Label.1'].loc[samples['Label.1'] == 'Video-streaming'] = 'Video-Streaming'
    traffic_type_labels = samples['Label.1'].unique()

    samples['Label'].loc[(samples['Label'] == 'Non-Tor') |
                         (samples['Label'] == 'NonVPN')] = 'Benign'
    samples['Label'].loc[(samples['Label'] == 'Tor') |
                         (samples['Label'] == 'VPN')] = 'Darknet'
    traffic_type_labels = samples['Label'].unique()

    # Extract the hour from the timestamp
    hours = []
    for timestamp in samples['Timestamp']:
        hour = int(timestamp.split()[1].split(':')[0])
        hours.append(hour)
    samples['hour'] = hours

    # Build 1/2/3-gram features from the source and destination IPs
    ips_grams = {
        'src': {'one': [], 'two': [], 'three': []},
        'dst': {'one': [], 'two': [], 'three': []},
    }
    for src_ip, dst_ip in zip(samples['Src IP'], samples['Dst IP']):
        src_one, src_two, src_three = createGrams(src_ip)
        ips_grams['src']['one'].append(src_one)
        ips_grams['src']['two'].append(src_two)
        ips_grams['src']['three'].append(src_three)
        dst_one, dst_two, dst_three = createGrams(dst_ip)
        ips_grams['dst']['one'].append(dst_one)
        ips_grams['dst']['two'].append(dst_two)
        ips_grams['dst']['three'].append(dst_three)

    samples['src_ip_1gram'] = ips_grams['src']['one']
    samples['src_ip_2gram'] = ips_grams['src']['two']
    samples['src_ip_3gram'] = ips_grams['src']['three']
    samples['dst_ip_1gram'] = ips_grams['dst']['one']
    samples['dst_ip_2gram'] = ips_grams['dst']['two']
    samples['dst_ip_3gram'] = ips_grams['dst']['three']

    print(samples[["Src IP", "src_ip_1gram", "src_ip_2gram", "src_ip_3gram"]][200:205])
    print(samples[["Dst IP", "dst_ip_1gram", "dst_ip_2gram", "dst_ip_3gram"]][:5])

    # Enrich IPs with country / bogon information
    ips = np.concatenate((samples['Src IP'].unique(), samples['Dst IP'].unique()))
    cat_ip_info = CatIPInformation("de30fe3213f197", ips)
    ips_dict = cat_ip_info.getIpsDict()

    ips_tuple = zip(samples['Src IP'], samples['Dst IP'])
    dst_ip_country = []
    src_ip_country = []
    src_bogon = []
    dst_bogon = []
    for src_ip, dst_ip in tqdm(ips_tuple, total=len(samples['Src IP'])):
        if 'country' in ips_dict[dst_ip].keys():
            dst_ip_country.append(ips_dict[dst_ip]['country'])
        else:
            dst_ip_country.append('')
        if 'country' in ips_dict[src_ip].keys():
            src_ip_country.append(ips_dict[src_ip]['country'])
        else:
            src_ip_country.append('')
        if 'bogon' in ips_dict[dst_ip].keys():
            dst_bogon.append(ips_dict[dst_ip]['bogon'])
        else:
            dst_bogon.append(False)
        if 'bogon' in ips_dict[src_ip].keys():
            src_bogon.append(ips_dict[src_ip]['bogon'])
        else:
            src_bogon.append(False)

    samples['dst_ip_country'] = dst_ip_country
    samples['src_ip_country'] = src_ip_country
    samples['dst_bogon'] = dst_bogon
    samples['src_bogon'] = src_bogon

    real_columns = [
        'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
        'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
        'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
        'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min',
        'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
        'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
        'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
        'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
        'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
        'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
        'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min',
        'Packet Length Max', 'Packet Length Mean', 'Packet Length Std',
        'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
        'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
        'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size',
        'Fwd Segment Size Avg', 'Bwd Segment Size Avg', 'Fwd Bytes/Bulk Avg',
        'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg', 'Bwd Bytes/Bulk Avg',
        'Bwd Packet/Bulk Avg', 'Bwd Bulk Rate Avg', 'Subflow Fwd Packets',
        'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
        'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Fwd Act Data Pkts',
        'Fwd Seg Size Min'
    ]

    # Drop rows with missing or infinite values
    is_na_cols = samples.columns[samples.isna().sum() > 0]
    print(samples.isna().sum()[is_na_cols])
    samples = samples.dropna()
    print(samples.isna().sum()[is_na_cols])

    samples[real_columns] = samples[real_columns].astype(np.float64)
    samples[real_columns] = samples[real_columns].replace([np.inf, -np.inf], np.nan)
    samples[real_columns] = samples[real_columns].dropna()

    model_samples = samples.copy()
    del model_samples['Flow ID']
    del model_samples['Timestamp']
    del model_samples['Src IP']
    del model_samples['Dst IP']
    cols = np.concatenate((model_samples.columns[81:], model_samples.columns[:81]))
    model_samples = model_samples[cols]

    hash_enc_cols = [
        'src_ip_1gram', 'src_ip_2gram', 'src_ip_3gram',
        'dst_ip_1gram', 'dst_ip_2gram', 'dst_ip_3gram'
    ]
    ord_enc_cols = ['src_ip_country', 'dst_ip_country']

    print("[!] - Encoding Data. May take a while to process")
    hash_enc = HashingEncoder(cols=hash_enc_cols, n_components=100).fit(model_samples)
    model_samples = hash_enc.transform(model_samples)
    print(model_samples.head())

    ord_enc = OrdinalEncoder()
    ord_enc.fit(model_samples[ord_enc_cols])
    model_samples[ord_enc_cols] = ord_enc.transform(model_samples[ord_enc_cols])
    model_samples[ord_enc_cols] = model_samples[ord_enc_cols].astype(int)

    # scaler = StandardScaler().fit(model_samples[real_columns])
    # model_samples[real_columns] = scaler.transform(model_samples[real_columns])
    # print(model_samples[real_columns].head())

    model_samples['src_bogon'] = np.where(model_samples['src_bogon'], 1, 0)
    model_samples['dst_bogon'] = np.where(model_samples['dst_bogon'], 1, 0)

    self.samples = samples.dropna()
    self.model_samples = model_samples.dropna()
    self.model_samples.columns = self.model_samples.columns.str.replace(' ', '_')
    print(samples[samples.columns[samples.isna().sum() > 0]].isna().sum())
import datetime
# import xlearn as xl
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
import pandas as pd
from category_encoders.hashing import HashingEncoder

df_X = pd.DataFrame([
    1, 2, 3, 4, 1, 2, 4, 5, 8, 7, 66, 2,
    24, 5, 4, 1, 2, 111, 1, 31, 3, 23, 13, 24
], columns=list("A"))

he = HashingEncoder(cols=["A"], return_df=True)
df_X = he.fit_transform(df_X)
print(df_X.head())
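An expectation check on the snippet above, assuming the library's default n_components=8: the single column "A" should be replaced by eight hashed columns, with exactly one non-zero bucket per row since only one input column is hashed.

# Illustrative sanity check (assumes default n_components=8)
assert set(df_X.columns) == {f"col_{i}" for i in range(8)}
assert (df_X.sum(axis=1) == 1).all()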
dfReduced = df.iloc[0:1000, :]
del df

dfReduced.columns = dfReduced.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')

# Remove the "Infinity" strings from the columns that contain them
# df.drop(df.loc[(df['flow_bytes/s'] == "Infinity") | (df['flow_packets/s'] == "Infinity")].index, inplace=True)
listOfPositions = getIndexes(pd.DataFrame(dfReduced['flow_bytes/s']), "Infinity")
dfReduced = dfReduced.drop(listOfPositions)

# Remove missing values
dfReduced = dfReduced.dropna()

# Convert the categorical port column to numerical codes
dfReduced['destination_port'] = dfReduced['destination_port'].astype('category')
dfReduced['destination_port'] = dfReduced['destination_port'].cat.codes
destPorts = dfReduced['destination_port'].value_counts()
dfReduced['destination_port'] = pd.DataFrame(dfReduced['destination_port']).applymap(str)

# Feature hashing with scikit-learn's FeatureHasher
h = FeatureHasher(n_features=20, input_type="string")
f = h.transform(dfReduced['destination_port'])
a = f.toarray()

# Feature hashing with category_encoders' HashingEncoder
X = dfReduced.iloc[:, 0:78]
y = dfReduced.iloc[:, -1]
he = HashingEncoder(cols=["destination_port"]).fit(X, y)
data = he.transform(X)
print(data.info())
       'Severity of Illness', 'Visitors with Patient', 'Age', 'Admission_Deposit',
       'Stay'],
      dtype='object')
accuracy score is 0.359550
Precision score: 0.35954967968848134
Recall score: 0.35954967968848134
'''

# Code 2 with hashing encoder method and logistic regression & lgbm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from category_encoders.hashing import HashingEncoder

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train[feature1], train['Stay'], test_size=0.20, shuffle=True)

he = HashingEncoder(cols=['Ward_Type', 'Type of Admission',
                          'Available Extra Rooms in Hospital',
                          'Visitors with Patient']).fit(X_train1, y_train1)
data = he.transform(X_train1)
data_test = he.transform(X_test1)
print(data.head(20))
'''
# output
        col_0  col_1  col_2  col_3  col_4  col_5  col_6  col_7
225917      1      0      0      0      2      0      1      0
204389      0      0      0      2      1      0      1      0
60523       0      0      0      1      1      1      1      0
32187       0      0      0      1      2      0      1      0
103972      0      0      0      1      2      0      1      0
211224      1      0      0      0      2      0      1      0
88155       0      0      0      3      0      0      1      0
104466      0      0      0      1      2      0      1      0
def __init__(self, **hyperparams):
    self._hyperparams = hyperparams
    self._wrapped_model = SkHashingEncoder(**self._hyperparams)