Example #1
    def process(self):
        df = pd.read_csv(self.filename,
                         names=self.headers,
                         na_values=["?"],
                         quotechar="'")
        obj_df = df.copy()
        # Process age {numeric}
        obj_df["age"] = obj_df["age"].fillna(0)
        # Process gender {f,m}
        obj_df = pd.get_dummies(obj_df, columns=["gender"], prefix=["is"])

        # Process ethnicity {White-European,Latino,Others,Black,Asian,'Middle Eastern ',Pasifika,'South Asian',Hispanic,Turkish,others}
        obj_df["ethnicity"] = obj_df["ethnicity"].fillna('')
        hee = HashingEncoder(cols=["ethnicity"])
        hee.fit(obj_df)
        obj_df = hee.transform(obj_df)

        # Process jundice {no,yes}
        # Process austim {no,yes}
        # Process used_app_before {no,yes}
        # Class/ASD {NO,YES}
        replace_bool = {
            "jundice": {
                "no": 0,
                "yes": 1
            },
            "austim": {
                "no": 0,
                "yes": 1
            },
            "used_app_before": {
                "no": 0,
                "yes": 1
            },
            "class": {
                "NO": 0,
                "YES": 1
            },
        }
        obj_df.replace(replace_bool, inplace=True)
        # Process contry_of_res {'United States',Brazil,Spain,Egypt,'New Zealand',Bahamas,Burundi,Austria,Argentina,Jordan,Ireland,'United Arab Emirates',Afghanistan,Lebanon,'United Kingdom','South Africa',Italy,Pakistan,Bangladesh,Chile,France,China,Australia,Canada,'Saudi Arabia',Netherlands,Romania,Sweden,Tonga,Oman,India,Philippines,'Sri Lanka','Sierra Leone',Ethiopia,'Viet Nam',Iran,'Costa Rica',Germany,Mexico,Russia,Armenia,Iceland,Nicaragua,'Hong Kong',Japan,Ukraine,Kazakhstan,AmericanSamoa,Uruguay,Serbia,Portugal,Malaysia,Ecuador,Niger,Belgium,Bolivia,Aruba,Finland,Turkey,Nepal,Indonesia,Angola,Azerbaijan,Iraq,'Czech Republic',Cyprus}
        obj_df["contry_of_res"] = obj_df["contry_of_res"].fillna('')
        hec = HashingEncoder(cols=["contry_of_res"])
        hec.fit(obj_df)
        obj_df = hec.transform(obj_df)

        # Process age_desc {'18 and more'}
        obj_df.drop(columns=["age_desc"], inplace=True)

        # Process relation {Self,Parent,'Health care professional',Relative,Others}
        obj_df["relation"] = obj_df["relation"].fillna('')
        lb_relation = LabelEncoder()
        obj_df["relation"] = lb_relation.fit_transform(obj_df["relation"])

        # Keep the encoded class column out of the feature matrix
        self.processed.data = obj_df.drop(columns=["class"]).values
        self.processed.target = np.array(obj_df["class"])
        self.processed.target_names = np.array(df["class"].unique())
        return self.processed
Example #2
class _HashingEncoderImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = SkHashingEncoder(**self._hyperparams)

    def fit(self, X, y=None):
        self._wrapped_model.fit(X, y)
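        # Remember the input column names when fitting on a DataFrame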
        if isinstance(X, pd.DataFrame):
            self._X_columns = X.columns
        return self

    def transform(self, X):
        result = self._wrapped_model.transform(X)
        return result
Example #3
def train_test_fh():

    # Load the data
    df = pd.read_csv('data/dac_sample.txt', sep='\t', header=None)
    df.columns = ground_truth_column + integer_columns + categorical_columns

    df_train, df_test = data_handler.train_test_split(df, test_rate)

    # Sampling
    # We only need the sampled indices, so pass dummy data for everything except the labels
    # (this is much faster)
    sampled_indices = data_handler.under_sampling(
        X=np.zeros((len(df_train), 1), dtype=np.uint8),
        y=df_train[ground_truth_column].values.astype(int))
    df_train = df_train.query('index in @sampled_indices')

    # Handle NULL values
    df_train = data_handler.fillna_integer_feature(df_train, integer_columns)
    df_train = data_handler.fillna_categorical_feature(df_train,
                                                       categorical_columns)

    # Hashing
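    # Hash the categorical columns into n_hash_dims numeric columns; the fitted encoder is reused on the test split below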
    hasher = HashingEncoder(cols=categorical_columns, n_components=n_hash_dims)
    df_train = hasher.fit_transform(df_train)

    # Training
    X_train = np.array(df_train.drop(ground_truth_column, axis=1).values)
    y_train = np.array(df_train[ground_truth_column].values)
    model = LogisticRegression(random_state=42, solver='lbfgs')
    model.fit(X_train, y_train)

    # Process the test data
    df_test = data_handler.fillna_integer_feature(df_test, integer_columns)
    df_test = data_handler.fillna_categorical_feature(df_test,
                                                      categorical_columns)
    df_test = hasher.transform(df_test)

    # Prediction
    X_test = np.array(df_test.drop(ground_truth_column, axis=1).values)
    y_test = np.array(df_test[ground_truth_column].values)
    y_proba = model.predict_proba(X_test)

    # Evaluation
    logloss = evaluator.logloss(y_test, y_proba[:, 1])
    print(logloss)
Example #4
    def doPreProcessing(self):
        # Correct the labels

        samples = self.data.copy()

        traffic_labels = samples['Label'].unique()
        traffic_type_labels = samples['Label.1'].unique()

        # Normalise inconsistent spellings of the traffic-type labels
        samples.loc[samples['Label.1'] == 'AUDIO-STREAMING',
                    'Label.1'] = 'Audio-Streaming'
        samples.loc[samples['Label.1'] == 'File-transfer',
                    'Label.1'] = 'File-Transfer'
        samples.loc[samples['Label.1'] == 'Video-streaming',
                    'Label.1'] = 'Video-Streaming'

        traffic_type_labels = samples['Label.1'].unique()

        # Collapse the capture labels into a binary Benign/Darknet target
        samples.loc[(samples['Label'] == 'Non-Tor') |
                    (samples['Label'] == 'NonVPN'), 'Label'] = 'Benign'
        samples.loc[(samples['Label'] == 'Tor') |
                    (samples['Label'] == 'VPN'), 'Label'] = 'Darknet'

        traffic_type_labels = samples['Label'].unique()

        hours = []
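        # Derive an hour-of-day feature from the time portion of each timestamp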
        for timestamp in samples['Timestamp']:
            hour = int(timestamp.split()[1].split(':')[0])
            hours.append(hour)
        samples['hour'] = hours

        ips_grams = {
            'src': {
                'one': [],
                'two': [],
                'three': []
            },
            'dst': {
                'one': [],
                'two': [],
                'three': []
            },
        }

        for src_ip, dst_ip in zip(samples['Src IP'], samples['Dst IP']):
            src_one, src_two, src_three = createGrams(src_ip)
            ips_grams['src']['one'].append(src_one)
            ips_grams['src']['two'].append(src_two)
            ips_grams['src']['three'].append(src_three)

            dst_one, dst_two, dst_three = createGrams(dst_ip)
            ips_grams['dst']['one'].append(dst_one)
            ips_grams['dst']['two'].append(dst_two)
            ips_grams['dst']['three'].append(dst_three)

        samples['src_ip_1gram'] = ips_grams['src']['one']
        samples['src_ip_2gram'] = ips_grams['src']['two']
        samples['src_ip_3gram'] = ips_grams['src']['three']

        samples['dst_ip_1gram'] = ips_grams['dst']['one']
        samples['dst_ip_2gram'] = ips_grams['dst']['two']
        samples['dst_ip_3gram'] = ips_grams['dst']['three']
        print(
            samples[["Src IP", "src_ip_1gram", "src_ip_2gram",
                     "src_ip_3gram"]][200:205])
        print(
            samples[["Dst IP", "dst_ip_1gram", "dst_ip_2gram",
                     "dst_ip_3gram"]][:5])

        ips = np.concatenate(
            (samples['Src IP'].unique(), samples['Dst IP'].unique()))
        cat_ip_info = CatIPInformation("de30fe3213f197", ips)
        ips_dict = cat_ip_info.getIpsDict()

        ips_tuple = zip(samples['Src IP'], samples['Dst IP'])

        dst_ip_country = []
        src_ip_country = []
        src_bogon = []
        dst_bogon = []

        for src_ip, dst_ip in tqdm(ips_tuple, total=len(samples['Src IP'])):
            if 'country' in ips_dict[dst_ip].keys():
                dst_ip_country.append(ips_dict[dst_ip]['country'])
            else:
                dst_ip_country.append('')

            if 'country' in ips_dict[src_ip].keys():
                src_ip_country.append(ips_dict[src_ip]['country'])
            else:
                src_ip_country.append('')

            if 'bogon' in ips_dict[dst_ip].keys():
                dst_bogon.append(ips_dict[dst_ip]['bogon'])
            else:
                dst_bogon.append(False)

            if 'bogon' in ips_dict[src_ip].keys():
                src_bogon.append(ips_dict[src_ip]['bogon'])
            else:
                src_bogon.append(False)

        samples['dst_ip_country'] = dst_ip_country
        samples['src_ip_country'] = src_ip_country
        samples['dst_bogon'] = dst_bogon
        samples['src_bogon'] = src_bogon

        real_columns = [
            'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
            'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
            'Fwd Packet Length Max', 'Fwd Packet Length Min',
            'Fwd Packet Length Mean', 'Fwd Packet Length Std',
            'Bwd Packet Length Max', 'Bwd Packet Length Min',
            'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
            'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
            'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
            'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
            'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
            'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
            'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
            'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max',
            'Packet Length Mean', 'Packet Length Std',
            'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
            'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
            'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count',
            'Down/Up Ratio', 'Average Packet Size', 'Fwd Segment Size Avg',
            'Bwd Segment Size Avg', 'Fwd Bytes/Bulk Avg',
            'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg', 'Bwd Bytes/Bulk Avg',
            'Bwd Packet/Bulk Avg', 'Bwd Bulk Rate Avg', 'Subflow Fwd Packets',
            'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
            'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Fwd Act Data Pkts',
            'Fwd Seg Size Min'
        ]
        is_na_cols = samples.columns[samples.isna().sum() > 0]
        print(samples.isna().sum()[is_na_cols])

        samples = samples.dropna()
        print(samples.isna().sum()[is_na_cols])

        # Cast numeric columns to float, convert +/-inf to NaN, and drop those rows
        samples[real_columns] = samples[real_columns].astype(np.float64)
        samples[real_columns] = samples[real_columns].replace(
            [np.inf, -np.inf], np.nan)
        samples = samples.dropna(subset=real_columns)

        model_samples = samples.copy()

        del model_samples['Flow ID']
        del model_samples['Timestamp']
        del model_samples['Src IP']
        del model_samples['Dst IP']

        cols = np.concatenate(
            (model_samples.columns[81:], model_samples.columns[:81]))
        model_samples = model_samples[cols]

        hash_enc_cols = [
            'src_ip_1gram', 'src_ip_2gram', 'src_ip_3gram', 'dst_ip_1gram',
            'dst_ip_2gram', 'dst_ip_3gram'
        ]
        ord_enc_cols = ['src_ip_country', 'dst_ip_country']

        print("[!] - Encoding Data. May take a while to process")
        hash_enc = HashingEncoder(cols=hash_enc_cols,
                                  n_components=100).fit(model_samples)
        model_samples = hash_enc.transform(model_samples)
        print(model_samples.head())

        ord_enc = OrdinalEncoder()
        ord_enc.fit(model_samples[ord_enc_cols])
        model_samples[ord_enc_cols] = ord_enc.transform(
            model_samples[ord_enc_cols])
        model_samples[ord_enc_cols] = model_samples[ord_enc_cols].astype(int)

        # scaler = StandardScaler().fit(model_samples[real_columns])
        # model_samples[real_columns] = scaler.transform(model_samples[real_columns])
        # print(model_samples[real_columns].head())

        model_samples['src_bogon'] = np.where(model_samples['src_bogon'], 1, 0)
        model_samples['dst_bogon'] = np.where(model_samples['dst_bogon'], 1, 0)

        self.samples = samples.dropna()
        self.model_samples = model_samples.dropna()

        self.model_samples.columns = self.model_samples.columns.str.replace(
            ' ', '_')

        print(samples[samples.columns[samples.isna().sum() > 0]].isna().sum())
Example #5
dfReduced = df.iloc[0:1000, :]
del df
dfReduced.columns = dfReduced.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')
# Remove "Infinity" strings from the columns that contain them
#df.drop(df.loc[(df['flow_bytes/s']=="Infinity")| (df['flow_packets/s']=="Infinity")].index,inplace=True)
listOfPositions = getIndexes(pd.DataFrame(dfReduced['flow_bytes/s']),
                             "Infinity")
dfReduced = dfReduced.drop(listOfPositions)
# Remove missing values
dfReduced = dfReduced.dropna()

dfReduced['destination_port'] = dfReduced['destination_port'].astype(
    'category')
# convert the categorical values to numeric codes
dfReduced['destination_port'] = dfReduced['destination_port'].cat.codes
destPorts = dfReduced['destination_port'].value_counts()

dfReduced['destination_port'] = dfReduced['destination_port'].astype(str)
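# Hash the string-typed ports into a 20-column sparse feature matrix with sklearn's FeatureHasher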
h = FeatureHasher(n_features=20, input_type="string")
f = h.transform(dfReduced['destination_port'])
a = f.toarray()

X = dfReduced.iloc[:, 0:78]
y = dfReduced.iloc[:, -1]

he = HashingEncoder(cols=["destination_port"]).fit(X, y)
data = he.transform(X)
print(data.info())
Example #6
       'Admission_Deposit', 'Stay'],
      dtype='object')
accuracy score is 0.359550
Precision score:  0.35954967968848134
Recall score:  0.35954967968848134
'''

# Code 2 with hashing encoder method and logistic regression & lgbm
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from category_encoders.hashing import HashingEncoder
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train[feature1], train['Stay'], test_size=0.20, shuffle=True)
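# Fit the hashing encoder on the training split only, then apply the same mapping to both splits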
he = HashingEncoder(cols=['Ward_Type', 'Type of Admission',
                          'Available Extra Rooms in Hospital',
                          'Visitors with Patient']).fit(X_train1, y_train1)
data = he.transform(X_train1)
data_test = he.transform(X_test1)
print(data.head(20))
'''
#output

        col_0  col_1  col_2  col_3  col_4  col_5  col_6  col_7
225917      1      0      0      0      2      0      1      0
204389      0      0      0      2      1      0      1      0
60523       0      0      0      1      1      1      1      0
32187       0      0      0      1      2      0      1      0
103972      0      0      0      1      2      0      1      0
211224      1      0      0      0      2      0      1      0
88155       0      0      0      3      0      0      1      0
104466      0      0      0      1      2      0      1      0
135541      1      0      0      0      1      1      1      0