import tensorflow as tf

# Config and Autoencoder are provided by the surrounding project.


def main(_):
    from tensorflow.examples.tutorials.mnist import input_data

    tf.reset_default_graph()
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

    config = Config()
    sess = tf.Session()
    model = Autoencoder(config, sess)
    model.fit(mnist.train.images)
    print("[*] Finished Training")
    return model, mnist
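# Hedged usage sketch (an assumption, not part of the original snippet): the
# `def main(_)` signature is the TF1 convention for entry points launched via
# tf.app.run(), which parses flags and forwards control to main.
if __name__ == '__main__':
    tf.app.run(main=main)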
import csv
import os
import random
import time
from copy import deepcopy
from math import inf

import joblib
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.svm import OneClassSVM

# DetectorBase and Autoencoder are provided by the surrounding project.


class Detector(DetectorBase):
    def __init__(self, key, seq_length=10):
        super(Detector, self).__init__(key, seq_length)
        self.key = str(key)
        self.packet_length = 1500
        self.mini_batch = 30
        self.epochs = 50
        self.train_buffer = []
        self.exec_buffer = []
        self.set_buffer = []
        self.max_round = inf
        self.train_round = 0
        self.model = Autoencoder(self.packet_length, seq_length, self.epochs)
        self.clf = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.05)
        self.model_path = os.path.join('model_{}'.format(seq_length), self.key)
        self.stats_path = os.path.join('stats_{}'.format(seq_length), self.key + '.pkl')
        self.eval_path = os.path.join('evaluation_{}'.format(seq_length), self.key + '.csv')
        self.loss_path = os.path.join('evaluation_{}'.format(seq_length), self.key + '_loss.csv')
        if self.model.exist(self.model_path):
            print('Using existing model: {}'.format(self.key))
            self.model.load(self.model_path)
        if os.path.exists(self.stats_path):
            print('Using existing stats')
            self.clf = joblib.load(self.stats_path)

    def update_buffer(self, seq, mode, info=None):
        seq = deepcopy(seq)
        if mode == 'T' and self.train_round <= self.max_round:
            # Training mode: accumulate sequences and train on full mini-batches.
            self.train_buffer.append(seq)
            if len(self.train_buffer) == self.mini_batch:
                random.shuffle(self.train_buffer)
                X = np.array(self.train_buffer)
                self.train(X)
                self.train_buffer = []
                self.train_round += 1
        elif mode == 'E':
            # Execution mode: score each sequence as soon as it arrives.
            self.exec_buffer.append(seq)
            if len(self.exec_buffer) == 1:
                X = np.array(self.exec_buffer)
                self.execute(X, info)
                self.exec_buffer = []
        else:
            # Any other mode: collect reconstruction errors for threshold fitting.
            X = np.array(seq)
            X = X.reshape((1, X.shape[0], X.shape[1]))
            self.eval(X)

    def train(self, X):
        if self.train_round < self.max_round:
            history = self.model.fit(X)
            with open(self.loss_path, 'a') as f_loss:
                writer_loss = csv.writer(f_loss)
                if self.train_round == 0:
                    writer_loss.writerow([history.history['loss'][0]])
                writer_loss.writerow([history.history['loss'][-1]])
            print('Detector {} saved'.format(self.key))

    def eval(self, X):
        Y = self.model.predict(X)
        mse = mean_squared_error(X[0], Y[0])
        print('Calculating mse of {}: {}'.format(self.key, mse))
        self.set_buffer.append(mse)

    def set_threshold(self):
        # Fit a one-class SVM on the collected reconstruction errors.
        self.clf = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.05)
        self.clf.fit(np.array(self.set_buffer).reshape(-1, 1))
        joblib.dump(self.clf, self.stats_path)

    def execute(self, X, info=None):
        start = time.time()
        Y = self.model.predict(X)
        dur = time.time() - start
        with open(self.eval_path, 'a') as f:
            writer = csv.writer(f)
            for x, y in zip(X, Y):
                mse = mean_squared_error(x, y)
                print('Execute on {}: {}'.format(self.key, mse))
                label = self.clf.predict(np.array(mse).reshape(-1, 1))
                result = 'Normal' if label == 1 else 'Malicious'
                if info:
                    writer.writerow([str(mse), result, str(info)])
                else:
                    writer.writerow([str(mse), result])

    def wrap_up(self, mode):
        if mode == 'T':
            self.model.save(self.model_path)
        elif mode == 'S':
            self.set_threshold()
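# Hedged usage sketch (the key, the sequence sources and their shapes are
# assumptions): a Detector is driven by pushing fixed-length packet sequences
# through its three modes: 'T' trains the autoencoder in mini-batches, any
# other mode collects reconstruction MSEs for threshold fitting, and 'E'
# classifies live traffic against the fitted one-class SVM.
detector = Detector(key='192.168.1.10', seq_length=10)

for seq in training_sequences:      # hypothetical iterable of (seq_length, 1500) arrays
    detector.update_buffer(seq, mode='T')
detector.wrap_up('T')               # persist the trained autoencoder

for seq in calibration_sequences:   # hypothetical held-out normal traffic
    detector.update_buffer(seq, mode='V')   # any mode other than 'T'/'E' collects MSEs
detector.wrap_up('S')               # fit the OneClassSVM threshold on collected MSEs

for seq in live_sequences:          # hypothetical traffic to score
    detector.update_buffer(seq, mode='E', info='capture-1')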
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

# Autoencoder, IMAGE_HEIGHT, IMAGE_WIDTH, EPOCHS, BS, PLOT_PATH and MODEL are
# defined elsewhere in the script.

# noise: add Gaussian noise to the images and clip back into [0, 1]
trainNoise = np.random.normal(loc=0.5, scale=0.5, size=trainX.shape)
testNoise = np.random.normal(loc=0.5, scale=0.5, size=testX.shape)
trainXNoisy = np.clip(trainX + trainNoise, 0, 1)
testXNoisy = np.clip(testX + testNoise, 0, 1)

print("[INFO] building autoencoder...")
opt = Adam(lr=1e-4)
autoencoder = Autoencoder().build(IMAGE_HEIGHT, IMAGE_WIDTH, 3)
autoencoder.compile(loss="mse", optimizer=opt)

H = autoencoder.fit(
    trainXNoisy, trainX,
    validation_data=(testXNoisy, testX),
    epochs=EPOCHS,
    batch_size=BS)

# plot the training and validation loss curves
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.title("Training and Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(loc="lower left")
plt.savefig(PLOT_PATH)

autoencoder.save(MODEL)
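# Hedged follow-up sketch (reuses MODEL and testXNoisy from above; everything
# else is an assumption): once the denoising autoencoder is saved, it can be
# reloaded with the standard Keras loader and applied to noisy inputs.
from tensorflow.keras.models import load_model

autoencoder = load_model(MODEL)
decoded = autoencoder.predict(testXNoisy)   # reconstructed (denoised) images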
import json
import pickle
from collections import defaultdict

import numpy as np

# Autoencoder, VariationalAutoencoder and clean_str are provided by the
# surrounding project.


class DataLoader:
    def __init__(self, tokenizer, max_len, use_vae=False, batch_size=64, ae_epochs=20):
        self._train_set = []
        self._dev_set = []
        self._test_set = []
        self.use_vae = use_vae
        self.batch_size = batch_size
        self.ae_latent_dim = max_len  # latent dim equal to max len
        self.ae_epochs = ae_epochs
        self.train_steps = 0
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.tcol_info = defaultdict(dict)
        self.tcol = {}
        self.label2idx = {}
        self.token2cnt = defaultdict(int)
        self.pad = '<pad>'
        self.unk = '<unk>'
        self.autoencoder = None

    def init_autoencoder(self):
        if self.autoencoder is None:
            if self.use_vae:
                self.autoencoder = VariationalAutoencoder(
                    latent_dim=self.ae_latent_dim,
                    epochs=self.ae_epochs,
                    batch_size=self.batch_size)
            else:
                self.autoencoder = Autoencoder(
                    latent_dim=self.ae_latent_dim,
                    epochs=self.ae_epochs,
                    batch_size=self.batch_size)
            self.autoencoder._compile(self.label_size * self.max_len)

    def save_vocab(self, save_path):
        with open(save_path, 'wb') as writer:
            pickle.dump(
                {
                    'tcol_info': self.tcol_info,
                    'tcol': self.tcol,
                    'label2idx': self.label2idx,
                    'token2cnt': self.token2cnt
                }, writer)

    def load_vocab(self, save_path):
        with open(save_path, 'rb') as reader:
            obj = pickle.load(reader)
            for key, val in obj.items():
                setattr(self, key, val)

    def save_autoencoder(self, save_path):
        self.autoencoder.autoencoder.save_weights(save_path)

    def load_autoencoder(self, save_path):
        self.init_autoencoder()
        self.autoencoder.autoencoder.load_weights(save_path)

    def set_train(self, train_path):
        """set train dataset"""
        self._train_set = self._read_data(train_path, build_vocab=True)

    def set_dev(self, dev_path):
        """set dev dataset"""
        self._dev_set = self._read_data(dev_path)

    def set_test(self, test_path):
        """set test dataset"""
        self._test_set = self._read_data(test_path)

    @property
    def train_set(self):
        return self._train_set

    @property
    def dev_set(self):
        return self._dev_set

    @property
    def test_set(self):
        return self._test_set

    @property
    def label_size(self):
        return len(self.label2idx)

    def save_dataset(self, setname, fpath):
        if setname == 'train':
            dataset = self.train_set
        elif setname == 'dev':
            dataset = self.dev_set
        elif setname == 'test':
            dataset = self.test_set
        else:
            raise ValueError(f'not support set {setname}')
        with open(fpath, 'w') as writer:
            for data in dataset:
                writer.writelines(json.dumps(data, ensure_ascii=False) + "\n")

    def load_dataset(self, setname, fpath):
        if setname not in ['train', 'dev', 'test']:
            raise ValueError(f'not support set {setname}')
        dataset = []
        with open(fpath, 'r') as reader:
            for line in reader:
                dataset.append(json.loads(line.strip()))
        if setname == 'train':
            self._train_set = dataset
        elif setname == 'dev':
            self._dev_set = dataset
        elif setname == 'test':
            self._test_set = dataset

    def add_tcol_info(self, token, label):
        """ add TCoL """
        if label not in self.tcol_info[token]:
            self.tcol_info[token][label] = 1
        else:
            self.tcol_info[token][label] += 1

    def set_tcol(self):
        """ set TCoL """
        self.tcol[0] = np.array([0] * self.label_size)  # pad
        self.tcol[1] = np.array([0] * self.label_size)  # unk
        self.tcol[0] = np.reshape(self.tcol[0], (1, -1))
        self.tcol[1] = np.reshape(self.tcol[1], (1, -1))
        for token, label_dict in self.tcol_info.items():
            vector = [0] * self.label_size
            for label_id, cnt in label_dict.items():
                vector[label_id] = cnt / self.token2cnt[token]
            vector = np.array(vector)
            self.tcol[token] = np.reshape(vector, (1, -1))

    def parse_tcol_ids(self, data, build_vocab=False):
        if self.use_vae:
            # keep a whole number of batches so the VAE only sees full batches
            print("batch alignment...")
            print("previous data size:", len(data))
            keep_size = len(data) // self.batch_size
            data = data[:keep_size * self.batch_size]
            print("alignment data size:", len(data))
        if build_vocab:
            print("set tcol....")
            self.set_tcol()
            print("token size:", len(self.tcol))
            print("done to set tcol...")
        tcol_vectors = []
        for obj in data:
            padded = [0] * (self.max_len - len(obj['token_ids']))
            token_ids = obj['token_ids'] + padded
            tcol_vector = np.concatenate([
                self.tcol.get(token, self.tcol[1])
                for token in token_ids[:self.max_len]
            ])
            tcol_vector = np.reshape(tcol_vector, (1, -1))
            tcol_vectors.append(tcol_vector)
        print("train vae...")
        if len(tcol_vectors) > 1:
            X = np.concatenate(tcol_vectors)
        else:
            X = tcol_vectors[0]
        if build_vocab:
            self.init_autoencoder()
            self.autoencoder.fit(X)
        X = self.autoencoder.encoder.predict(X, batch_size=self.batch_size)  # decomposite
        assert len(X) == len(data)
        for x, obj in zip(X, data):
            obj['tcol_ids'] = x.tolist()
        return data

    def _read_data(self, fpath, build_vocab=False):
        data = []
        with open(fpath, "r", encoding="utf-8") as reader:
            for line in reader:
                obj = json.loads(line)
                obj['text'] = clean_str(obj['text'])
                if build_vocab:
                    if obj['label'] not in self.label2idx:
                        self.label2idx[obj['label']] = len(self.label2idx)
                tokenized = self.tokenizer.encode(obj['text'])
                token_ids, segment_ids = tokenized.ids, tokenized.segment_ids
                for token in token_ids:
                    self.token2cnt[token] += 1
                    self.add_tcol_info(token, self.label2idx[obj['label']])
                data.append({
                    'token_ids': token_ids,
                    'segment_ids': segment_ids,
                    'label_id': self.label2idx[obj['label']]
                })
        data = self.parse_tcol_ids(data, build_vocab=build_vocab)
        return data
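# Hedged usage sketch (the tokenizer, file paths and hyper-parameters are
# assumptions): DataLoader only requires a tokenizer whose encode() returns an
# object exposing .ids and .segment_ids, plus JSONL files whose lines carry
# 'text' and 'label' fields. A minimal stand-in tokenizer is sketched below.
from types import SimpleNamespace


class WhitespaceTokenizer:
    """Minimal stand-in tokenizer; not the project's tokenizer."""

    def __init__(self):
        self.vocab = {'<pad>': 0, '<unk>': 1}

    def encode(self, text):
        ids = [self.vocab.setdefault(tok, len(self.vocab)) for tok in text.split()]
        return SimpleNamespace(ids=ids, segment_ids=[0] * len(ids))


loader = DataLoader(WhitespaceTokenizer(), max_len=128, use_vae=True, batch_size=64)
loader.set_train('train.jsonl')        # builds label2idx, TCoL and fits the (V)AE
loader.set_dev('dev.jsonl')            # reuses the fitted encoder
loader.save_vocab('vocab.pkl')         # persist TCoL statistics
loader.save_autoencoder('ae_weights.h5')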
train_ds = dataloader.load_and_patch(
    files[0], "fit", args.patch_shape, args.n_patches, args.batch_size,
    args.prefetch, args.num_parallel_calls, shuffle=None, repeat=True)
valid_ds = dataloader.load_and_patch(
    files[1], "fit", args.patch_shape, args.n_patches, args.batch_size,
    args.prefetch, args.num_parallel_calls, shuffle=None, repeat=True)
test_ds, test_gt = dataloader.load_and_patch(
    test_files, "inf", num_parallel_calls=args.num_parallel_calls, batch_size=8)

input_shape = (None, None, 3)
model = Autoencoder(input_shape=input_shape, num_filters=num_filters)
model = model.build()
print(model.summary())

if args.train_continue:
    model.load_weights(args.weights_path)

# Train the model
model.compile(optimizer=optimizer, loss="MSE", metrics=['accuracy'])
history = model.fit(
    train_ds,
    steps_per_epoch=500,
    epochs=args.n_epochs,
    validation_data=valid_ds,
    validation_steps=250,
    callbacks=callbacks(model_path, test_ds, test_gt),
    verbose=1)
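# Hedged follow-up sketch (an assumption, not part of the original script):
# persist the trained weights and run the autoencoder over the held-out
# patches; model, model_path and test_ds are reused from the code above.
model.save_weights(model_path)
reconstructions = model.predict(test_ds)
print(reconstructions.shape)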