# Imports assumed for this standalone snippet (exact keras/sklearn paths may differ by version);
# read_data_file, read_label_file and the MLP model builder come from project-local modules.
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from data import read_data_file, read_label_file
from model import MLP  # assumed module name for the project-local MLP builder


def truncNgramMLP():
    data_file = "./Data/train/real_train_data.csv"
    label_file = "./Data/train/real_train_label.csv"

    # data_file = "./Data/train/train.csv"
    # label_file = "./Data/train/train_label.csv"

    X = read_data_file(data_file)
    X = pad_sequences(X, maxlen=328, dtype='int32', padding='post', truncating='post')
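    # X is now a fixed-length (328) int array: shorter sequences are zero-padded at the
    # end, longer ones are truncated at the end.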
    y = read_label_file(label_file)

    # print ("Shape of train data(m):\n", X.shape)
    # print ("Data:\n", X[0:5], "\n")
    # print ("Shape of train label:", y.shape)
    # print ("Label:\n", y[0:5], "\n")

    str_X = []
    for i in range(X.shape[0]):
        str_X.append(','.join([str(k) for k in X[i]]))

    df = pd.DataFrame(str_X, index=range(X.shape[0]), columns=['data'])
    # Apply word n-grams (2-4) and TF-IDF to the comma-joined byte strings
    tfidf = TfidfVectorizer(analyzer="word", max_features=5000, ngram_range=(2, 4))
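    # Illustration (assumed behavior of the default tokenizer): a row such as "77,90,144,..."
    # is split on the commas into byte tokens, and runs of 2-4 consecutive tokens like
    # "77 90" or "77 90 144" become the features; only the 5000 most frequent n-grams are
    # kept, and single-digit byte values are dropped by the default token pattern.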

    # print(tfidf)
    X_transformed = tfidf.fit_transform(df.data)
    # test_transformed = tfidf.fit_transform()
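    # Note: a held-out set should be transformed with tfidf.transform(...) rather than
    # fit_transform, so the vocabulary fitted on the training data is reused.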

    X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Success
    print("Training and testing split was successful.")
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    mlp_model = MLP(X_train.shape[1])

    print(mlp_model.summary())

    tensorBoardCallback = TensorBoard(log_dir='./logs/trunc_ngram_mlp', write_graph=True)  # write_graph expects a boolean

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-4, amsgrad=False)
    # optimizer = SGD(lr=0.01, momentum=0.9, decay=1e-6, nesterov=False)

    mlp_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    mlp_model.fit(X_train, y_train, callbacks=[tensorBoardCallback], epochs=20, batch_size=128)

    score, acc = mlp_model.evaluate(X_test, y_test, verbose=2, batch_size=128)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))
Example n. 2
from evaluate import evaluate_model

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from data import read_data_file
from reduce_skewness import ReduceSkewness
from Encoder import One_Hot_Encoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Read file

train_df = read_data_file('adult.data')
test_df = read_data_file('adult.test')

# Drop the fnlwgt column, which is useless for the later analysis

train_df = train_df.drop('fnlwgt', axis=1)
test_df = test_df.drop('fnlwgt', axis=1)

# Get the list of categorical variables

object_col = train_df.select_dtypes(include=object).columns.tolist()
for col in object_col:
    print(train_df[col].value_counts(dropna=False)/train_df.shape[0],'\n')


# Convert '?' to NANs
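# Hedged sketch (added here; not part of the original snippet): replace the '?' placeholders
# in the Adult data with NaN so later steps can treat them as missing values. Depending on
# how read_data_file parses the file, the values may carry a leading space (' ?').
import numpy as np

train_df = train_df.replace(['?', ' ?'], np.nan)
test_df = test_df.replace(['?', ' ?'], np.nan)
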
def extract_header(in_file_path, out_file_path, skip_flag=True):
    BASE = 0
    DOS_HEADER_LEN = 64
    DOS_STUB_LEN = 14

    DOS_header = []
    PE_header = []
    Sec_header = []

    PE_header_len_list = []
    Sec_header_len_list = []

    raw_data = read_data_file(in_file_path)

    short_index = []
    index = -1
    print('data processing started ...')
    for i in raw_data:
        index += 1
        # Skip samples shorter than 97 bytes; too short to be a valid PE file
        if (skip_flag and len(i) < 97):
            print('Too short data with length: ', len(i), ' index is: ', index,
                  '  .... skipping......')
            short_index.append(index)
            continue
        # Bytes 0-77: fixed-length DOS header (64 bytes) plus DOS stub (14 bytes)
        temp_DOS = i[BASE:DOS_HEADER_LEN + DOS_STUB_LEN]
        DOS_header.append(temp_DOS)

        # Locate PE Pointer in DOS header (60, 61, 62, 63)
        PE_pointer = temp_DOS[DOS_HEADER_LEN - 4:DOS_HEADER_LEN]
        # print('PE_pointer ', PE_pointer)
        PE_header_offset = PE_pointer[0] + PE_pointer[1] * 256 + PE_pointer[
            2] * (256**2) + PE_pointer[3] * (256**3)
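        # e_lfanew is stored little-endian, hence byte0 + byte1*256 + byte2*256^2 + byte3*256^3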
        # print('PE_header_offset ', PE_header_offset)

        # Locate PE Header section by PE Header offset
        PE_signature = i[PE_header_offset:PE_header_offset + 4]
        PE_file_header = i[PE_header_offset + 4:PE_header_offset + 24]

        # Locate Section Header number in PE file header and calculate section header length
        Sec_num_pointer = PE_file_header[2:4]
        # print(Sec_num_pointer)
        Sec_num = Sec_num_pointer[0] + Sec_num_pointer[1] * 256  # little-endian, like the other header fields
        Sec_header_len = Sec_num * 40

        # Locate Optional header length in PE file header
        Opt_header_hex = PE_file_header[16:18]
        Opt_header_len = Opt_header_hex[0] + Opt_header_hex[1] * 256

        # Calculate Section Header offset by PE header length
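        # 24 = 4-byte "PE\0\0" signature + 20-byte COFF file header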
        Sec_header_offset = PE_header_offset + 24 + Opt_header_len

        # Get PE optional header by PE_header offset
        PE_opt_header = i[PE_header_offset + 24:Sec_header_offset]

        # Concatenate PE signature, PE file header and PE optional header
        temp_PE = list()
        temp_PE.extend(PE_signature)
        temp_PE.extend(PE_file_header)
        temp_PE.extend(PE_opt_header)
        # print('temp_PE ', temp_PE)
        PE_header.append(temp_PE)

        PE_header_len_list.append(len(temp_PE))

        temp_Sec_header = i[Sec_header_offset:Sec_header_offset +
                            Sec_header_len]

        # Truncate each 40-byte section header entry, keeping only its first 12 bytes
        trunc_sec_header = []
        for j in range(Sec_num):
            trunc_sec_header.extend(temp_Sec_header[j * 40:j * 40 + 12])
        Sec_header.append(trunc_sec_header)

        Sec_header_len_list.append(len(trunc_sec_header))

    print('PE_header_max_len: ', max(PE_header_len_list))
    print('Sec_header_max_len: ', max(Sec_header_len_list))

    return PE_header_len_list, Sec_header_len_list
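
# Hypothetical usage sketch (the paths are illustrative assumptions, not from the original):
# PE_header_lens, Sec_header_lens = extract_header("./Data/train/train.csv",
#                                                  "./Data/train/headers_out.csv")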
# %% [markdown]
# ## Plot PE Header length distribution - Training data
# Credit: [@MengdanCode](https://github.com/MengdanCode)
# %%
from data import read_data_file, read_label_file
import numpy as np
from matplotlib import pyplot as plt

data_file = "./Data/train/train.csv"
label_file = "./Data/train/train_label.csv"

X = read_data_file(data_file)
y = read_label_file(label_file)

# %%
X_len = []
for i in X:
    X_len.append(len(i))

print('X_len generated')

X_len = np.array(X_len)

print(X_len.min())
print(X_len.max())

fig_per_hour = plt.figure()
per_hour = fig_per_hour.add_subplot(111)
counts, bins, patches = per_hour.hist(X_len,
                                      bins=100,
                                      density=False,