def test(self, testX=None, testy=None):
    """Evaluate the detector on a test set.

    When no data is supplied, loads the pristine and adversarial test
    splits from the configured dataset files, evaluates on their
    concatenation, and additionally reports per-split accuracy/F1 and
    their harmonic-mean F1.

    :param testX: test features, or None to load from disk
    :param testy: test labels, or None to load from disk
    :return: overall accuracy, or None when there is no data or no checkpoint
    """
    prist_testX = None
    adv_testX = None
    if testX is None and testy is None:
        # dataX/datay files hold (train, pristine-test, adversarial-test) triples
        _, prist_testX, adv_testX = utils.read_joblib(
            config.get('dataset', 'dataX'))
        _, prist_testy, adv_testy = utils.read_joblib(
            config.get('dataset', 'datay'))
        testX = np.concatenate((prist_testX, adv_testX))
        testy = np.concatenate((prist_testy, adv_testy))
    if len(testX) == 0:
        print("No test data.")
        return

    self.mode = 'test'
    # rebuild the graph in test mode
    tf.reset_default_graph()
    self.model_graph()

    cur_checkpoint = tf.train.latest_checkpoint(self.save_dir)
    if cur_checkpoint is None:
        print("No saved parameters")
        return

    saver = tf.train.Saver()
    eval_dir = os.path.join(self.save_dir, 'eval')
    # the context manager closes the session; the former trailing
    # sess.close() was redundant and has been dropped
    with tf.Session() as sess:
        saver.restore(sess, cur_checkpoint)
        accuracy, macro_f1_score = tester(sess, testX, testy, self, eval_dir)
        MSG = "The accuracy on the test dataset is {:.5f}%"
        print(MSG.format(accuracy * 100))
        logger.info(MSG.format(accuracy * 100))

        if prist_testX is not None and adv_testX is not None:
            print("Other evaluation metrics we may need:")
            # fixed local-variable typo: prist_f1_socre -> prist_f1_score
            prist_acc, prist_f1_score = tester(sess, prist_testX, prist_testy,
                                               self, eval_dir)
            adv_acc, adv_f1_score = tester(sess, adv_testX, adv_testy,
                                           self, eval_dir)
            harmonic_f1_score = utils.harmonic_mean(
                prist_f1_score, adv_f1_score)
            # fixed message typo: "datasest" -> "dataset"
            MSG = "The accuracy on pristine test dataset is {:.5f}% vs. {:.5f}% on adversarial data."
            print(MSG.format(prist_acc * 100, adv_acc * 100))
            logger.info(MSG.format(prist_acc * 100, adv_acc * 100))
            MSG = "The macro f1 score on pristine test dataset is {:.5f}% vs. {:.5f}% on adversarial data."
            print(MSG.format(prist_f1_score * 100, adv_f1_score * 100))
            logger.info(
                MSG.format(prist_f1_score * 100, adv_f1_score * 100))
            MSG = "Harmonic macro F1 score is {:.5f}%"
            print(MSG.format(harmonic_f1_score * 100))
            logger.info(MSG.format(harmonic_f1_score * 100))
    return accuracy
def __init__(self, hyper_params=None, reuse=False, is_saving=True,
             init_graph=True, mode='train', name='JOINT_DEFENSE'):
    """Build the joint-defense model and its PGD-Adam inner maximizer.

    :param hyper_params: hyper-parameter dict; falls back to ADV_TRAIN_HP
    :param reuse: whether TF variable scopes are reused
    :param is_saving: whether checkpoints are written
    :param init_graph: whether the graph is built immediately
    :param mode: 'train' or 'test'
    :param name: model name used for experiment directories
    """
    self.is_saving = is_saving
    self.init_graph = init_graph
    self.mode = mode

    if hyper_params is None:
        hyper_params = ADV_TRAIN_HP
    self.hp_params = utils.ParamWrapper(hyper_params)
    self.threshold = None  # get_median()

    # materialize the dataset artifacts once if any of them is missing
    required = [config.get('dataset', key)
                for key in ('dataX', 'datay', 'normalizer')]
    if not all(os.path.exists(path) for path in required):
        features, labels = self.data_preprocess()
        utils.dump_joblib(features, config.get('dataset', 'dataX'))
        utils.dump_joblib(labels, config.get('dataset', 'datay'))

    self.normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))
    feature_dim = len(self.normalizer.data_min_)
    # attack initilization
    self.inner_maximizer = PGDAdam(self, feature_dim, self.normalizer,
                                   verbose=False, **AUG_PARAM)

    super(JointDefense, self).__init__(hyper_params, reuse, self.is_saving,
                                       self.init_graph, self.mode, name)
def get_median():
    """Return the per-feature median of the training data.

    The median vector is computed once and cached on disk at the
    configured 'threshold' path; subsequent calls read the cached copy.
    """
    threshold_path = config.get('dataset', 'threshold')
    if not os.path.exists(threshold_path):
        # dataX holds (train, pristine-test, adversarial-test); only the
        # training split contributes to the threshold
        trainX, _, _ = utils.read_joblib(config.get('dataset', 'dataX'))
        utils.dumpdata_np(np.median(trainX, axis=0), threshold_path)
    return utils.readdata_np(threshold_path)
def __init__(self, hyper_params=None, reuse=False, is_saving=True,
             init_graph=True, mode='train', name='DAE_RPST_LEARN_DNN'):
    """Build the DAE representation-learning DNN and its PGD-Adam attack.

    :param hyper_params: hyper-parameter dict; falls back to DAE_TRAIN_HP
    :param reuse: whether TF variable scopes are reused
    :param is_saving: whether checkpoints are written
    :param init_graph: whether the graph is built immediately
    :param mode: 'train' or 'test'
    :param name: model name used for experiment directories
    """
    self.is_saving = is_saving
    self.init_graph = init_graph
    self.mode = mode

    if hyper_params is None:
        hyper_params = DAE_TRAIN_HP

    # materialize the dataset artifacts once if any of them is missing
    required = [config.get('dataset', key)
                for key in ('dataX', 'datay', 'normalizer')]
    if not all(os.path.exists(path) for path in required):
        features, labels = self.data_preprocess()
        utils.dump_joblib(features, config.get('dataset', 'dataX'))
        utils.dump_joblib(labels, config.get('dataset', 'datay'))

    self.normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))
    feature_dim = len(self.normalizer.data_min_)
    # initilization of the inner maximizer (attack)
    self.inner_maximizer = PGDAdam(self, feature_dim, self.normalizer,
                                   verbose=False, **AUG_PARAM)

    super(DAE_RPST_DNN, self).__init__(hyper_params, reuse, self.is_saving,
                                       self.init_graph, self.mode, name)
def normalize_inverse(X, normalizer=None):
    """Map min-max-normalized data back to the original feature scale.

    :param X: normalized data, expected to lie within [0, 1]
    :param normalizer: fitted scaler exposing ``inverse_transform``; when
        None, the persisted normalizer is loaded from the configured path
    :return: data transformed back to the original scale
    :raises IOError: if the persisted normalizer cannot be loaded
    """
    try:
        if normalizer is None:
            normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))
    except IOError as e:
        raise IOError("Unable to load normalizer.") from e
    # BUG FIX: the original used `and`, so the warning fired only when the
    # data was simultaneously below 0 AND above 1; either violation alone
    # means the input is outside [0, 1]
    if np.min(X) < 0 or np.max(X) > 1.:
        warnings.warn("The data is not within the range [0, 1]")
    return normalizer.inverse_transform(X)
def normalize_data(X, is_fitting=False):
    """Normalize data using minmaxscalar.

    Fits and persists a MinMaxScaler on first use (when ``is_fitting`` is
    True and no scaler exists on disk), then clips the input to the fitted
    feature range and scales it into [0, 1].

    :param X: raw feature matrix
    :param is_fitting: allow fitting a new scaler when none is persisted
    :return: the min-max scaled data
    """
    normalizer_path = config.get('dataset', 'normalizer')
    if is_fitting and not os.path.exists(normalizer_path):
        fitted = MinMaxScaler().fit(X)
        utils.dump_joblib(fitted, normalizer_path)
    normalizer = utils.read_joblib(normalizer_path)
    # clip to the fitted range so transform never extrapolates outside [0, 1]
    clipped = np.clip(X, a_min=normalizer.data_min_, a_max=normalizer.data_max_)
    return normalizer.transform(clipped)
def __init__(self, hyper_params=None, reuse=False, is_saving=True,
             init_graph=True, mode='train', name='BASIC_DNN'):
    """Build the basic DNN malware detector.

    :param hyper_params: hyper-parameter dict; falls back to DNN_HP
    :param reuse: whether TF variable scopes are reused
    :param is_saving: whether a checkpoint directory is configured
    :param init_graph: whether the graph is built immediately
    :param mode: 'train' or 'test'
    :param name: model name used for experiment directories
    :raises AssertionError: if mode is neither 'train' nor 'test'
    """
    super(BasicDNN, self).__init__()
    self.is_saving = is_saving
    self.init_graph = init_graph
    self.reuse = reuse
    self.model_name = name

    # Explicit validation instead of `try: assert ... except:` — the bare
    # except swallowed unrelated errors and `assert` is stripped under -O.
    # AssertionError is kept so existing callers' except clauses still match.
    if mode not in ('train', 'test'):
        raise AssertionError("Two modes: 'train' or 'test', not both.")
    self.mode = mode

    # single code path for the hyper-parameter wrapper
    self.hp_params_dict = hyper_params if hyper_params is not None else DNN_HP
    self.hp_params = utils.ParamWrapper(self.hp_params_dict)

    if self.is_saving:
        self.save_dir = config.get("experiments", self.model_name.lower())

    # materialize the dataset artifacts once if any of them is missing
    if not (os.path.exists(config.get('dataset', 'dataX')) and
            os.path.exists(config.get('dataset', 'datay')) and
            os.path.exists(config.get('dataset', 'normalizer'))):
        dataX, datay = self.data_preprocess()
        utils.dump_joblib(dataX, config.get('dataset', 'dataX'))
        utils.dump_joblib(datay, config.get('dataset', 'datay'))
    self.normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))

    # DNN based model
    self.input_dim = len(self.normalizer.data_min_)
    self.hidden_layers = self.hp_params.hidden_units
    self.output_dim = self.hp_params.output_dim
    tf.set_random_seed(self.hp_params.random_seed)
    if self.init_graph:
        self.model_graph(reuse=reuse)
def train(self, trainX=None, trainy=None, is_sampling = True):
    """train dnn based malware detector

    Trains an ensemble: each base module is optimized on its own random
    subset of the training data, one sub-model after another per epoch.

    :param trainX: training features; when None (with trainy), loaded from disk
    :param trainy: training labels
    :param is_sampling: if True, over-sample with ratio 0.3 before training
    """
    if trainX is None and trainy is None:
        # dataX/datay files hold (train, pristine-test, adversarial-test) triples
        trainX, _, _ = utils.read_joblib(config.get('dataset', 'dataX'))
        trainy, _, _ = utils.read_joblib(config.get('dataset', 'datay'))
    if is_sampling:
        trainX, trainy = random_over_sampling(trainX, trainy, ratio=0.3)
    # train submodel subsequently per mini-batch
    global_train_step = tf.train.get_or_create_global_step()
    saver = tf.train.Saver()
    # optimizers: one Adam minimizer per base module, keyed by module index
    from collections import defaultdict
    optimizers_dict = defaultdict(list)
    for sub_m in range(self.hp_params.base_module_count):
        # UPDATE_OPS dependency runs any pending graph update ops per step
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            optimizer_clf = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(
                self.sub_models[sub_m].cross_entropy, global_step=global_train_step)
            optimizers_dict[sub_m] = [optimizer_clf]
    tf_cfg = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
    tf_cfg.gpu_options.allow_growth = True
    tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
    sess = tf.Session(config=tf_cfg)
    with sess.as_default():
        # summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
        sess.run(tf.global_variables_initializer())
        training_time = 0.0
        output_steps = 200
        for epoch_idx in range(self.hp_params.n_epochs):
            for sub_m in range(self.hp_params.base_module_count):
                # each sub-model draws its own random subset (bagging-style);
                # the per-module seed keeps that subset stable across epochs
                train_idx = range(len(trainX))
                random.seed(self.random_seeds[sub_m])
                sub_train_idx = random.sample(train_idx,
                                              int(len(train_idx) * self.training_sample_ratio))
                train_input_supervised = utils.DataProducer(trainX[sub_train_idx],
                                                            trainy[sub_train_idx],
                                                            self.hp_params.batch_size,
                                                            n_epochs=1)
                train_input_supervised.reset_cursor()
                for step_idx, X_batch, y_batch in train_input_supervised.next_batch():
                    train_dict = {
                        self.x_input: X_batch,
                        self.y_input: y_batch,
                        self.is_training: True
                    }
                    start = default_timer()
                    if len(optimizers_dict[sub_m]) == 1:
                        sess.run(optimizers_dict[sub_m][0], feed_dict=train_dict)
                    else:
                        # guard: the dict is expected to hold exactly one optimizer
                        raise ValueError("Optimizer needs to be changed.")
                    end = default_timer()
                    training_time = training_time + end - start
                    iterations = epoch_idx * train_input_supervised.mini_batches + step_idx + 1
                    if iterations % output_steps == 0:
                        # periodic progress report on the current batch
                        print("Sub model: ", sub_m)
                        print('Epoch {}/{},Step {}/{}:{}'.format(epoch_idx,
                                                                 self.hp_params.n_epochs,
                                                                 step_idx + 1,
                                                                 train_input_supervised.steps,
                                                                 datetime.now()))
                        _acc = sess.run(self.accuracy, feed_dict=train_dict)
                        print(' training accuracy {:.5}%'.format(_acc * 100))
                        # NOTE(review): indentation reconstructed from a collapsed
                        # source — checkpointing is placed inside the periodic
                        # reporting branch, matching the sibling train() in this
                        # file; confirm against the original layout
                        if not os.path.exists(self.save_dir):
                            os.makedirs(self.save_dir)
                        saver.save(sess, os.path.join(self.save_dir, 'checkpoint'),
                                   global_step=global_train_step)
    sess.close()
def train(self, trainX = None, trainy = None, is_sampling = False):
    """train dnn based malware detector

    Jointly optimizes the denoising autoencoder (mse_dae) and the
    classifier (cross_entropy) with separate Adam optimizers, running the
    DAE step then the classifier step on every mini-batch.

    :param trainX: training features; when None (with trainy), loaded from disk
    :param trainy: training labels
    :param is_sampling: if True, over-sample with ratio 0.3 before training
    """
    if trainX is None and trainy is None:
        # dataX/datay files hold (train, pristine-test, adversarial-test) triples
        trainX, _, _ = utils.read_joblib(config.get('dataset', 'dataX'))
        trainy, _, _ = utils.read_joblib(config.get('dataset', 'datay'))
    if is_sampling:
        trainX, trainy = random_over_sampling(trainX, trainy, ratio=0.3)
    train_input_supervised = utils.DataProducer(trainX, trainy,
                                                self.hp_params.batch_size,
                                                n_epochs=self.hp_params.n_epochs)
    saver = tf.train.Saver(max_to_keep=10)
    tf.summary.scalar('accuracy', self.accuracy)
    tf.summary.scalar('loss', self.cross_entropy)
    merged_summaries = tf.summary.merge_all()
    global_train_step = tf.train.get_or_create_global_step()
    # optimizer
    # UPDATE_OPS dependency runs any pending graph update ops per step;
    # only the classifier optimizer advances the global step
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        optimizer_clf = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(
            self.cross_entropy, global_step=global_train_step)
        optimizer_dae = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(
            self.mse_dae)
    tf_cfg = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
    tf_cfg.gpu_options.allow_growth = True
    tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
    sess = tf.Session(config=tf_cfg)
    with sess.as_default():
        summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
        sess.run(tf.global_variables_initializer())
        training_time = 0.0
        train_input_supervised.reset_cursor()
        output_steps = 100
        for step_idx, X_batch, y_batch in train_input_supervised.next_batch():
            train_dict = {
                self.x_input: X_batch,
                self.y_input: y_batch,
                self.is_training: True
            }
            if (step_idx + 1) % output_steps == 0:
                # periodic progress report on the current batch
                print('Step {}/{}:{}'.format(step_idx + 1,
                                             train_input_supervised.steps,
                                             datetime.now()))
                _acc = sess.run(self.accuracy, feed_dict=train_dict)
                print("The Accuracy on training batch:{:.5f}%".format(_acc * 100))
                if step_idx != 0:
                    # throughput over the window since the last reset
                    print(' {} samples per second'.format(
                        output_steps * self.hp_params.batch_size / training_time))
                    training_time = 0.
                # NOTE(review): indentation reconstructed from a collapsed
                # source — summary writing and checkpointing are placed inside
                # the periodic reporting branch; confirm against the original
                summary = sess.run(merged_summaries, feed_dict=train_dict)
                summary_writer.add_summary(summary, global_train_step.eval(sess))
                if not os.path.exists(self.save_dir):
                    os.makedirs(self.save_dir)
                saver.save(sess, os.path.join(self.save_dir, 'checkpoint'),
                           global_step=global_train_step)
            start = default_timer()
            # DAE reconstruction step first, then the classifier step
            sess.run(optimizer_dae, feed_dict=train_dict)
            sess.run(optimizer_clf, feed_dict=train_dict)
            end = default_timer()
            training_time = training_time + end - start
    sess.close()