filename = os.path.basename(fullname)
        df = utils.load_h5(dir, filename)
        dataframes.append(df)
        num_examples = len(df.values)
    # create one large dataframe
# Merge every frame collected in `dataframes` into a single frame.
data = pd.concat(dataframes)
# DataFrame.sample returns a NEW frame; it does not shuffle in place. The
# result must be assigned back, otherwise the shuffle and the index reset are
# silently thrown away. reset_index(drop=True) then replaces the row labels
# carried over from the individual frames with one contiguous index.
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
num_rows = data.shape[0]
columns = data.columns
print(columns)

# step 3: build the feature matrix (x) and standardize it
# The width handed to da.getbytes is num_headers * 54; an earlier variant of
# this script used a fixed width of 1460 instead.
# NOTE(review): 54 is presumably a per-header byte count - confirm.
x = da.getbytes(data, num_headers * 54)
standard_scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row before the split in step 5,
# so the held-out rows contribute to the scaling statistics - confirm that
# this is acceptable for how x_test is used.
x_std = standard_scaler.fit_transform(x)

# step 4: pull the class labels (y) and encode them as numbers
y = data['label'].values
label_encoder = LabelEncoder()
# keep the distinct raw labels before y is overwritten with its encoding
class_labels = np.unique(y)
y = label_encoder.fit_transform(y)

# step 5: split the scaled features and encoded labels into train / test sets
x_tests, y_tests = [], []
test_percentage = 0.1
x_train, x_test, y_train, y_test = train_test_split(
    x_std,
    y,
    test_size=test_percentage,
    random_state=seed,
)

# base name for the t-SNE (t-distributed Stochastic Neighbor Embedding)
# visualization output
plot_savename = "t-sne_16headers_windows_linux_perplexity"
num_examples = 0
# Load every .h5 file matched under each entry of `dirs` and collect the
# resulting frames in `dataframes`.
for dir in dirs:
    # The glob pattern is built by plain string concatenation, so each entry
    # presumably ends with a path separator - verify against where `dirs` is
    # defined.
    for fullname in glob.iglob(dir + '*.h5'):
        filename = os.path.basename(fullname)
        df = utils.load_h5(dir, filename)
        dataframes.append(df)
        # NOTE(review): this overwrites num_examples for every file, so after
        # the loops it holds the row count of the last file loaded rather than
        # a running total - confirm whether `+=` was intended.
        num_examples = len(df.values)

# create one large dataframe
data = pd.concat(dataframes)
# DataFrame.sample returns a NEW frame; it does not shuffle in place. The
# result must be assigned back, otherwise the shuffle and the index reset are
# silently thrown away. reset_index(drop=True) then replaces the row labels
# carried over from the individual frames with one contiguous index.
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
num_rows = data.shape[0]
columns = data.columns
print(columns)

# step 2: build the feature matrix (x); da.getbytes is asked for data_len
# values per row
x = da.getbytes(data, data_len)

# step 3: pull the class labels (y) and encode them as numbers
y = data['label'].values
label_encoder = LabelEncoder()
# keep the distinct raw labels before y is overwritten with its encoding
class_labels = np.unique(y)
y = label_encoder.fit_transform(y)

# step 4: split features and encoded labels into train / test sets
test_percentage = 0.5
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_percentage,
    random_state=seed,
)

# base name for the saved plot
plot_savename = "histogram_payload"