Example #1
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import np_utils
from malware_classification import common_process_data as read_data
from keras.layers import Input, LSTM, Bidirectional, Conv2D, Reshape

batch_size = 64
TIME_STEPS = 25
INPUT_DIM = 25
lstm_units = 128
num_classes = 15
epochs = 40

# data pre-processing
# (X_train, y_train), (X_test, y_test) = mnist.load_data('mnist.npz')
(X_train, y_train), (X_test, y_test) = read_data.load_npz_data(
    "F:/数据集/Kim2016/malware_dataset/malware_dataset/attention_train_test_data.npz"
)
X_train = X_train.reshape(-1, 25, 25, 1) / 255.
X_test = X_test.reshape(-1, 25, 25, 1) / 255.
y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
y_test = np_utils.to_categorical(y_test, num_classes=num_classes)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# build RNN model with attention
inputs = Input(shape=(25, 25, 1))

# build the CNN front end whose outputs feed the RNN
x = Conv2D(filters=128,
           kernel_size=(5, 5),
           padding='same')(inputs)  # the source cuts off mid-call; closed minimally here
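The snippet breaks off inside the Conv2D call (closed minimally above). A sketch of how the otherwise unused imports (Reshape, Bidirectional, LSTM, Adam) suggest the model might continue; the layer wiring and training head below are assumptions, not the original code:

from keras.layers import Dense  # not in the original import list

# sketch only: flatten the (25, 25, 128) feature map into a 25-step sequence
x = Reshape((TIME_STEPS, -1))(x)
# bidirectional LSTM over the 25 rows, then a softmax over the 15 classes
x = Bidirectional(LSTM(lstm_units))(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
          validation_data=(X_test, y_test))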
Example #2
import time
from keras.utils import np_utils
from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR
from keras.models import Model
from keras.layers import *

max_features = GLVAR.TOTAL_OPERATIONS_COUNT + 1  # must be one greater than the number of distinct operations
epochs = 25
batch_size = 32

print('Loading data...')

(x_train, y_train), (x_test, y_test) = read_data.load_npz_data(
    GLVAR.TRAIN_AND_TEST_DATA)
X_train = x_train.reshape(-1, GLVAR.pic_pow_size *
                          GLVAR.pic_pow_size)  # why / 255?
X_test = x_test.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)
# NB: the reshaped arrays are bound to X_train/X_test; x_train/x_test keep their original shape
# convert labels to one-hot vectors
y_train = np_utils.to_categorical(y_train, num_classes=GLVAR.NUM_CLASSES)
y_test = np_utils.to_categorical(y_test, num_classes=GLVAR.NUM_CLASSES)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# %% data normalization

maxlen = GLVAR.pic_pow_size * GLVAR.pic_pow_size

print('x_train shape:', x_train.shape)
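The example ends before the model is built. Self_Attention_Layer is a project-local custom layer whose source is not shown on this page; below is a minimal sketch of a scaled dot-product self-attention layer with the same call signature (output_dim passed to the constructor, applied to a 3-D embedding tensor). This is one plausible implementation, not the repo's actual code:

from keras import backend as K
from keras.layers import Layer

class Self_Attention_Layer(Layer):
    """Sketch: scaled dot-product self-attention,
    (batch, seq, in_dim) -> (batch, seq, output_dim). Not the repo's code."""

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(Self_Attention_Layer, self).__init__(**kwargs)

    def build(self, input_shape):
        # one projection matrix each for queries, keys and values
        self.kernel = self.add_weight(name='qkv_kernel',
                                      shape=(3, int(input_shape[2]), self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        super(Self_Attention_Layer, self).build(input_shape)

    def call(self, x):
        WQ = K.dot(x, self.kernel[0])  # queries
        WK = K.dot(x, self.kernel[1])  # keys
        WV = K.dot(x, self.kernel[2])  # values
        scores = K.batch_dot(WQ, K.permute_dimensions(WK, (0, 2, 1)))
        scores = scores / (self.output_dim ** 0.5)  # scale by sqrt(d_k)
        return K.batch_dot(K.softmax(scores), WV)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1], self.output_dim)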
Example #3
# module-level imports assumed by this snippet
import numpy as np
from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR
    print("cnn_x_test length is:", len(cnn_x_test.shape))

cnn_x_train = cnn_x_train.reshape(-1,64,32,1)
cnn_x_test = cnn_x_test.reshape(-1,64,32,1)

print(cnn_x_train.shape)
print(cnn_x_test.shape)

np.savez(GLVAR.MULTY_BINARY_CNN_TRAIN_TEST_DATA, x_train=cnn_x_train, x_test=cnn_x_test, y_train=y_train,y_test=y_test)
'''

max_features = 1000  # must be one greater than the number of distinct operations

maxlen = 2048

(x_train, y_train), (x_test, y_test) = read_data.load_npz_data(
    GLVAR.MULTY_BINARY_CNN_TRAIN_TEST_DATA)

x_train = x_train.reshape(-1, 2048)
x_test = x_test.reshape(-1, 2048)

S_inputs = Input(shape=(maxlen, ), dtype='int32')  # Embedding expects integer token indices

embeddings = Embedding(max_features, 256)(S_inputs)

O_seq = Self_Attention_Layer(256)(embeddings)

O_seq = GlobalAveragePooling1D()(O_seq)

# O_seq = Dropout(0.5)(O_seq)

outputs = Dense(GLVAR.NUM_CLASSES, activation='softmax')(O_seq)
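The example is cut off after the output layer. A minimal sketch of how a model like this is typically finished, mirroring Example #4 (the optimizer, loss, and training settings are assumptions, and the labels stored in the .npz are assumed to be one-hot already):

model = Model(inputs=S_inputs, outputs=outputs)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=25,
          validation_data=(x_test, y_test))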
Example #4
# module-level imports assumed by this snippet; show_train_history is a
# project-local plotting helper whose source is not shown here
import os
import keras
import keras_metrics
import tensorflow as tf
from matplotlib import pyplot as plt
from keras.models import Model
from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dropout, Dense
from keras.utils import np_utils
from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR
def train_self_attention(lable_name, epochs, batch_size, score_filename):
    print("================training ", lable_name, ".......================")

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    max_features = GLVAR.TOTAL_OPERATIONS_COUNT + 1  # must be one greater than the number of distinct operations

    print('Loading data...')

    current_lable_data = GLVAR.MULTY_BINARY_TRAIN_AND_TEST_DATA_DIR + lable_name + '.npz'
    print("Current lable train and test data dir is : %s" %
          (current_lable_data))
    (x_train, y_train), (x_test, y_test) = read_data.load_npz_data(
        current_lable_data)
    x_train = x_train.reshape(-1, GLVAR.pic_pow_size *
                              GLVAR.pic_pow_size)  # why / 255?
    x_test = x_test.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)
    # convert labels to one-hot vectors
    y_train = np_utils.to_categorical(
        y_train, num_classes=GLVAR.NUM_CLASSES_OF_MULTY_BINARY)
    y_test = np_utils.to_categorical(
        y_test, num_classes=GLVAR.NUM_CLASSES_OF_MULTY_BINARY)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    # %% data normalization

    maxlen = GLVAR.pic_pow_size * GLVAR.pic_pow_size

    print('x_train shape:', x_train.shape)

    print('x_test shape:', x_test.shape)

    S_inputs = Input(shape=(maxlen, ), dtype='int32')

    embeddings = Embedding(max_features, 256)(S_inputs)  # vocabulary size is max_features, not maxlen (the sequence length)

    O_seq = Self_Attention_Layer(256)(embeddings)

    O_seq = GlobalAveragePooling1D()(O_seq)

    O_seq = Dropout(0.5)(O_seq)

    O_seq = Dense(16, activation='softmax')(O_seq)

    outputs = Dense(GLVAR.NUM_CLASSES_OF_MULTY_BINARY,
                    activation='softmax')(O_seq)

    model = Model(inputs=S_inputs, outputs=outputs)

    print(model.summary())

    # try using different optimizers and different optimizer configs
    recall = keras_metrics.binary_recall(label=0)
    # use binary_crossentropy, the loss suited to the per-label binary task
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', recall])

    # %%
    print('Training')

    h = model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test))

    model_filename = GLVAR.MULTY_BINARY_SELF_ATTENTION_MODEL_DIR + lable_name + ".h5"
    model.save(model_filename)

    plt.plot(h.history["loss"], label="train_loss")
    plt.plot(h.history["val_loss"], label="val_loss")
    plt.plot(h.history["acc"], label="train_acc")
    plt.plot(h.history["val_acc"], label="val_acc")
    plt.legend()
    plt.show()

    print("-----------------------DY Add------------------------")

    show_train_history(h, 'acc', 'val_acc', epochs)
    show_train_history(h, 'loss', 'val_loss', epochs)

    print('Testing--------------')
    loss, accuracy, recall = model.evaluate(x_test,
                                            y_test,
                                            batch_size=batch_size)

    print('test loss:', loss)
    print('test accuracy:', accuracy)
    print('test recall:', recall)

    print("\t[Info] Accuracy of testing data = {:2.1f}%".format(accuracy *
                                                                100.0))

    score = "----------" + lable_name + "----------\n" + "test loss:" + str(
        format(loss, '.2f')) + "%\ntest accuracy:" + str(
            format(accuracy, '.2f')) + "%\ntest recall:" + str(
                format(recall, '.2f')) + "%\n"

    with open(score_filename, "a") as f:
        f.write(score)
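A hedged usage sketch: the function is presumably called once per malware family to train the per-label binary classifiers. The family names and score file below are hypothetical placeholders, not values from the repo:

# hypothetical driver loop; the label names and score file are placeholders
for lable_name in ["family_a", "family_b"]:
    train_self_attention(lable_name,
                         epochs=25,
                         batch_size=32,
                         score_filename="self_attention_scores.txt")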
Example #5
from matplotlib import pyplot as plt
import time
from keras.utils import np_utils
from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR
from keras.models import Model
from keras.layers import *


max_features = GLVAR.TOTAL_OPERATIONS_COUNT + 1  # must be one greater than the number of distinct operations
epochs = 30
batch_size = 32

print('Loading data...')

(x_train, y_train), (x_test, y_test) = read_data.load_npz_data(GLVAR.TRAIN_AND_TEST_DATA)
X_train = x_train.reshape(-1,GLVAR.pic_pow_size * GLVAR.pic_pow_size)  # why / 255?
X_test = x_test.reshape(-1,GLVAR.pic_pow_size * GLVAR.pic_pow_size)
# convert labels to one-hot vectors
y_train = np_utils.to_categorical(y_train, num_classes=GLVAR.NUM_CLASSES)
y_test = np_utils.to_categorical(y_test, num_classes=GLVAR.NUM_CLASSES)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# %% data normalization

maxlen = GLVAR.pic_pow_size * GLVAR.pic_pow_size

print('x_train shape:', x_train.shape)

print('x_test shape:', x_test.shape)
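Every example loads its data through read_data.load_npz_data, whose source is not shown on this page. A minimal sketch of what a helper with this return shape typically looks like, inferred from the call sites and the np.savez call in Example #3 (an assumption, not the repo's code):

import numpy as np

def load_npz_data(path):
    # assumed layout: the archive stores x_train / y_train / x_test / y_test
    # under those keys, matching np.savez(..., x_train=..., y_train=..., ...)
    data = np.load(path)
    return (data['x_train'], data['y_train']), (data['x_test'], data['y_test'])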