Code Example #1
import tensorflow as tf  # TF 1.x API (Session, reset_default_graph)


def main(_):
    # MNIST loader from the TF 1.x tutorials package (removed in TF 2.x)
    from tensorflow.examples.tutorials.mnist import input_data
    tf.reset_default_graph()
    
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
    config = Config()  # Config and Autoencoder are defined elsewhere in the project
    
    sess = tf.Session()
    model = Autoencoder(config, sess)
    model.fit(mnist.train.images)
    
    print("[*] Finished Training")
    return model, mnist
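
After training, the returned model and dataset can be used to sanity-check reconstructions. A minimal sketch, assuming the project's Autoencoder exposes a reconstruct method (hypothetical name; adapt to whatever the class actually provides):

# Hypothetical usage: `reconstruct` is an assumed method name.
model, mnist = main(None)
batch = mnist.test.images[:16]        # a few held-out digits
recon = model.reconstruct(batch)      # assumed API
print("reconstruction MSE:", ((batch - recon) ** 2).mean())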
Code Example #2
import csv
import os
import random
import time
from copy import deepcopy
from math import inf

import joblib
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.svm import OneClassSVM

# DetectorBase and the Autoencoder wrapper are defined elsewhere in the project.


class Detector(DetectorBase):
    def __init__(self, key, seq_length=10):
        super(Detector, self).__init__(key, seq_length)
        self.key = str(key)
        self.packet_length = 1500
        self.mini_batch = 30
        self.epochs = 50
        self.train_buffer = []
        self.exec_buffer = []
        self.set_buffer = []
        self.max_round = inf
        self.train_round = 0
        self.model = Autoencoder(self.packet_length, seq_length, self.epochs)
        self.clf = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.05)

        self.model_path = os.path.join('model_{}'.format(seq_length), self.key)
        self.stats_path = os.path.join('stats_{}'.format(seq_length),
                                       self.key + '.pkl')
        self.eval_path = os.path.join('evaluation_{}'.format(seq_length),
                                      self.key + '.csv')
        self.loss_path = os.path.join('evaluation_{}'.format(seq_length),
                                      self.key + '_loss.csv')
        if self.model.exist(self.model_path):
            print('Using existing model: {}'.format(self.key))
            self.model.load(self.model_path)
        if os.path.exists(self.stats_path):
            print('Using existing stats')
            self.clf = joblib.load(self.stats_path)

    def update_buffer(self, seq, mode, info=None):
        # mode 'T': accumulate mini-batches for training; mode 'E': run detection;
        # any other mode: collect reconstruction MSEs for threshold fitting
        seq = deepcopy(seq)
        if mode == 'T' and self.train_round <= self.max_round:
            self.train_buffer.append(seq)
            if len(self.train_buffer) == self.mini_batch:
                random.shuffle(self.train_buffer)
                X = np.array(self.train_buffer)
                self.train(X)
                self.train_buffer = []
                self.train_round += 1
        elif mode == 'E':
            self.exec_buffer.append(seq)
            if len(self.exec_buffer) == 1:
                X = np.array(self.exec_buffer)
                self.execute(X, info)
                self.exec_buffer = []
        else:
            X = np.array(seq)
            X = X.reshape((1, X.shape[0], X.shape[1]))
            self.eval(X)

    def train(self, X):
        if self.train_round < self.max_round:
            history = self.model.fit(X)
            # log the initial loss once, then the final loss of every round
            with open(self.loss_path, 'a') as f_loss:
                writer_loss = csv.writer(f_loss)
                if self.train_round == 0:
                    writer_loss.writerow([history.history['loss'][0]])
                writer_loss.writerow([history.history['loss'][-1]])
            print('Detector {} trained'.format(self.key))

    def eval(self, X):
        Y = self.model.predict(X)
        mse = mean_squared_error(X[0], Y[0])
        print('Calculating mse of {}: {}'.format(self.key, mse))
        self.set_buffer.append(mse)

    def set_threshold(self):
        self.clf = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.05)
        self.clf.fit(np.array(self.set_buffer).reshape(-1, 1))
        joblib.dump(self.clf, self.stats_path)

    def execute(self, X, info=None):
        start = time.time()
        Y = self.model.predict(X)
        dur = time.time() - start
        with open(self.eval_path, 'a') as f:
            writer = csv.writer(f)
            for x, y in zip(X, Y):
                mse = mean_squared_error(x, y)
                print('Execute on {}: {}'.format(self.key, mse))
                label = self.clf.predict(np.array(mse).reshape(-1, 1))
                result = 'Normal' if label == 1 else 'Malicious'
                if info:
                    writer.writerow([str(mse), result, str(info)])
                else:
                    writer.writerow([str(mse), result])

    def wrap_up(self, mode):
        if mode == 'T':
            self.model.save(self.model_path)
        elif mode == 'S':
            self.set_threshold()
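
The class above is driven entirely through update_buffer and wrap_up. A minimal driver sketch, assuming train_seqs, calib_seqs and live_seqs are iterables of (seq_length, 1500) numpy arrays produced by the surrounding capture pipeline (hypothetical names):

# Hypothetical driver: the three sequence iterables are assumptions;
# every Detector call below exists in the class definition above.
det = Detector('192.168.0.10', seq_length=10)

for seq in train_seqs:            # 'T': batched autoencoder training
    det.update_buffer(seq, 'T')
det.wrap_up('T')                  # persist the trained model

for seq in calib_seqs:            # non-'T'/'E' mode: collect MSEs...
    det.update_buffer(seq, 'C')
det.wrap_up('S')                  # ...then fit the one-class SVM threshold

for seq in live_seqs:             # 'E': classify live traffic
    det.update_buffer(seq, 'E', info='flow-id')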
Code Example #3
File: train.py  Project: Goganych/OiRS
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

# trainX/testX, Autoencoder, IMAGE_* and the other constants come from the surrounding script

# additive Gaussian noise for the denoising autoencoder
trainNoise = np.random.normal(loc=0.5, scale=0.5, size=trainX.shape)
testNoise = np.random.normal(loc=0.5, scale=0.5, size=testX.shape)
trainXNoisy = np.clip(trainX + trainNoise, 0, 1)
testXNoisy = np.clip(testX + testNoise, 0, 1)

print("[INFO] building autoencoder...")
opt = Adam(learning_rate=1e-4)

autoencoder = Autoencoder().build(IMAGE_HEIGHT, IMAGE_WIDTH, 3)
autoencoder.compile(loss="mse", optimizer=opt)

H = autoencoder.fit(trainXNoisy,
                    trainX,
                    validation_data=(testXNoisy, testX),
                    epochs=EPOCHS,
                    batch_size=BS)

N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.title("Training and Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(loc="lower left")
plt.savefig(PLOT_PATH)

autoencoder.save(MODEL)
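
Once saved, the model can be reloaded with the standard Keras API to denoise unseen images; a minimal sketch (nothing project-specific beyond the MODEL path):

from tensorflow.keras.models import load_model

# Reload the trained denoiser and clean a few noisy test images.
denoiser = load_model(MODEL)
cleaned = denoiser.predict(testXNoisy[:8])   # outputs stay in [0, 1]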
Code Example #4
import json
import pickle
from collections import defaultdict

import numpy as np

# Autoencoder, VariationalAutoencoder and clean_str come from the project.


class DataLoader:
    def __init__(self,
                 tokenizer,
                 max_len,
                 use_vae=False,
                 batch_size=64,
                 ae_epochs=20):
        self._train_set = []
        self._dev_set = []
        self._test_set = []

        self.use_vae = use_vae
        self.batch_size = batch_size
        self.ae_latent_dim = max_len  # latent dim equal to max len
        self.ae_epochs = ae_epochs
        self.train_steps = 0
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.tcol_info = defaultdict(dict)
        self.tcol = {}
        self.label2idx = {}
        self.token2cnt = defaultdict(int)

        self.pad = '<pad>'
        self.unk = '<unk>'
        self.autoencoder = None

    def init_autoencoder(self):
        if self.autoencoder is None:
            if self.use_vae:
                self.autoencoder = VariationalAutoencoder(
                    latent_dim=self.ae_latent_dim,
                    epochs=self.ae_epochs,
                    batch_size=self.batch_size)
            else:
                self.autoencoder = Autoencoder(latent_dim=self.ae_latent_dim,
                                               epochs=self.ae_epochs,
                                               batch_size=self.batch_size)
            self.autoencoder._compile(self.label_size * self.max_len)

    def save_vocab(self, save_path):
        with open(save_path, 'wb') as writer:
            pickle.dump(
                {
                    'tcol_info': self.tcol_info,
                    'tcol': self.tcol,
                    'label2idx': self.label2idx,
                    'token2cnt': self.token2cnt
                }, writer)

    def load_vocab(self, save_path):
        with open(save_path, 'rb') as reader:
            obj = pickle.load(reader)
            for key, val in obj.items():
                setattr(self, key, val)

    def save_autoencoder(self, save_path):
        self.autoencoder.autoencoder.save_weights(save_path)

    def load_autoencoder(self, save_path):
        self.init_autoencoder()
        self.autoencoder.autoencoder.load_weights(save_path)

    def set_train(self, train_path):
        """set train dataset"""
        self._train_set = self._read_data(train_path, build_vocab=True)

    def set_dev(self, dev_path):
        """set dev dataset"""
        self._dev_set = self._read_data(dev_path)

    def set_test(self, test_path):
        """set test dataset"""
        self._test_set = self._read_data(test_path)

    @property
    def train_set(self):
        return self._train_set

    @property
    def dev_set(self):
        return self._dev_set

    @property
    def test_set(self):
        return self._test_set

    @property
    def label_size(self):
        return len(self.label2idx)

    def save_dataset(self, setname, fpath):
        if setname == 'train':
            dataset = self.train_set
        elif setname == 'dev':
            dataset = self.dev_set
        elif setname == 'test':
            dataset = self.test_set
        else:
            raise ValueError(f'not support set {setname}')
        with open(fpath, 'w') as writer:
            for data in dataset:
                writer.writelines(json.dumps(data, ensure_ascii=False) + "\n")

    def load_dataset(self, setname, fpath):
        if setname not in ['train', 'dev', 'test']:
            raise ValueError(f'not support set {setname}')
        dataset = []
        with open(fpath, 'r') as reader:
            for line in reader:
                dataset.append(json.loads(line.strip()))
        if setname == 'train':
            self._train_set = dataset
        elif setname == 'dev':
            self._dev_set = dataset
        elif setname == 'test':
            self._test_set = dataset

    def add_tcol_info(self, token, label):
        """ add TCoL
        """
        if label not in self.tcol_info[token]:
            self.tcol_info[token][label] = 1
        else:
            self.tcol_info[token][label] += 1

    def set_tcol(self):
        """ set TCoL
        """
        self.tcol[0] = np.array([0] * self.label_size)  # pad
        self.tcol[1] = np.array([0] * self.label_size)  # unk
        self.tcol[0] = np.reshape(self.tcol[0], (1, -1))
        self.tcol[1] = np.reshape(self.tcol[1], (1, -1))
        for token, label_dict in self.tcol_info.items():
            vector = [0] * self.label_size
            for label_id, cnt in label_dict.items():
                vector[label_id] = cnt / self.token2cnt[token]
            vector = np.array(vector)
            self.tcol[token] = np.reshape(vector, (1, -1))

    def parse_tcol_ids(self, data, build_vocab=False):
        if self.use_vae:
            print("batch alignment...")
            print("previous data size:", len(data))
            keep_size = len(data) // self.batch_size
            data = data[:keep_size * self.batch_size]
            print("alignment data size:", len(data))
        if build_vocab:
            print("set tcol....")
            self.set_tcol()
            print("token size:", len(self.tcol))
            print("done to set tcol...")
        tcol_vectors = []
        for obj in data:
            padded = [0] * (self.max_len - len(obj['token_ids']))
            token_ids = obj['token_ids'] + padded
            tcol_vector = np.concatenate([
                self.tcol.get(token, self.tcol[1])
                for token in token_ids[:self.max_len]
            ])
            tcol_vector = np.reshape(tcol_vector, (1, -1))
            tcol_vectors.append(tcol_vector)
        print("train vae...")
        if len(tcol_vectors) > 1:
            X = np.concatenate(tcol_vectors)
        else:
            X = tcol_vectors[0]
        if build_vocab:
            self.init_autoencoder()
            self.autoencoder.fit(X)
        X = self.autoencoder.encoder.predict(X, batch_size=self.batch_size)
        # write each encoded vector back onto its example
        assert len(X) == len(data)
        for x, obj in zip(X, data):
            obj['tcol_ids'] = x.tolist()
        return data

    def _read_data(self, fpath, build_vocab=False):
        data = []
        with open(fpath, "r", encoding="utf-8") as reader:
            for line in reader:
                obj = json.loads(line)
                obj['text'] = clean_str(obj['text'])
                if build_vocab:
                    if obj['label'] not in self.label2idx:
                        self.label2idx[obj['label']] = len(self.label2idx)
                tokenized = self.tokenizer.encode(obj['text'])
                token_ids, segment_ids = tokenized.ids, tokenized.segment_ids
                for token in token_ids:
                    self.token2cnt[token] += 1
                    self.add_tcol_info(token, self.label2idx[obj['label']])
                data.append({
                    'token_ids': token_ids,
                    'segment_ids': segment_ids,
                    'label_id': self.label2idx[obj['label']]
                })
            data = self.parse_tcol_ids(data, build_vocab=build_vocab)
        return data
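
A typical end-to-end use of this loader, assuming a tokenizers-style tokenizer (the encode(...).ids / .segment_ids access above matches that library) and hypothetical file paths:

# Hypothetical paths; every method called here is defined in the class above.
loader = DataLoader(tokenizer, max_len=128, use_vae=True, batch_size=64)

loader.set_train('data/train.jsonl')     # builds vocab and TCoL stats, trains the AE
loader.set_dev('data/dev.jsonl')
loader.set_test('data/test.jsonl')

loader.save_vocab('vocab.pkl')           # TCoL tables and label map
loader.save_autoencoder('ae_weights.h5')
loader.save_dataset('train', 'cache/train.jsonl')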
Code Example #5
    train_ds = dataloader.load_and_patch(files[0], "fit", args.patch_shape,
                                         args.n_patches, args.batch_size,
                                         args.prefetch, args.num_parallel_calls,
                                         shuffle=None, repeat=True)

    valid_ds = dataloader.load_and_patch(files[1], "fit", args.patch_shape,
                                         args.n_patches, args.batch_size,
                                         args.prefetch, args.num_parallel_calls,
                                         shuffle=None, repeat=True)

    test_ds, test_gt = dataloader.load_and_patch(test_files, "inf",
                                                 num_parallel_calls=args.num_parallel_calls,
                                                 batch_size=8)

    # height/width left unspecified so the network accepts any patch size
    input_shape = (None, None, 3)

    model = Autoencoder(input_shape=input_shape, num_filters=num_filters)
    model = model.build()

    model.summary()  # summary() prints itself; wrapping it in print() outputs "None"

    if args.train_continue:
        model.load_weights(args.weights_path)

    # Train the model
    model.compile(optimizer=optimizer, loss="MSE", metrics=['accuracy'])
    history = model.fit(train_ds,
                        steps_per_epoch=500,
                        epochs=args.n_epochs,
                        validation_data=valid_ds,
                        validation_steps=250,
                        callbacks=callbacks(model_path, test_ds, test_gt),
                        verbose=1)
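
callbacks(...) is a project helper that is not shown. One plausible shape, sketched with standard tf.keras callbacks (the PSNR evaluation on test_ds/test_gt is an assumption about what the helper does):

import tensorflow as tf

def callbacks(model_path, test_ds, test_gt):
    """Hypothetical reconstruction of the unshown helper."""
    class TestEval(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            preds = self.model.predict(test_ds)
            psnr = tf.reduce_mean(tf.image.psnr(test_gt, preds, max_val=1.0))
            print('epoch {}: test PSNR {:.2f} dB'.format(epoch, float(psnr)))

    return [
        tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
        TestEval(),
    ]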