import os
from datetime import datetime
from timeit import default_timer

import numpy as np
import tensorflow as tf

# `utils` and `config` refer to project-level modules (data-producer/IO helpers
# and a ConfigParser-style settings object); their import lines depend on the
# project layout and are omitted here.


def tester(sess, testX, testy, model, required_info='label'):
    """
    model testing on test dataset
    :param sess: tf.Session
    :param testX: data for testing, type: 2-D float np.ndarry
    :param testy: corresponding ground truth labels, type: 1-D int np.ndarray
    :param model: trained model
    :params required_info: 'label' or 'prob'
    :return: predicted label or probability
    """
    test_input = utils.DataProducer(testX, testy, batch_size=20, name='test')
    if isinstance(testy, np.ndarray):
        test_num = testy.shape[0]
    else:
        test_num = len(testy)

    # select which tensor to fetch
    if required_info == 'label':
        info = model.y_pred
    elif required_info == 'proba':
        info = model.y_proba
    else:
        raise ValueError("Only 'label' or 'proba' is supported.")

    with sess.as_default():
        pred = []
        for _, x, y in test_input.next_batch():
            test_dict = {
                model.x_input: x,
                model.y_input: y,
                model.is_training: False
            }
            _y_pred = sess.run(info, feed_dict=test_dict)
            pred.append(_y_pred)

        return np.concatenate(pred)[:test_num]
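

# Minimal usage sketch (not part of the original code): `ckpt_dir` and the way
# `model` is constructed are assumptions; the model only needs to expose
# x_input, y_input, is_training, y_pred and y_proba as used by tester() above.
def _tester_usage_example(model, testX, testy, ckpt_dir):
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
        labels = tester(sess, testX, testy, model, required_info='label')
        probas = tester(sess, testX, testy, model, required_info='proba')
    return labels, probas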
Example #2
    def train(self, trainX=None, trainy=None, valX=None, valy=None):
        """Train the deep ensemble."""
        if trainX is None or trainy is None or valX is None or valy is None:
            trainX, valX, _ = utils.read_joblib(config.get('feature.' + self.feature_tp, 'dataX'))
            trainy, valy, _ = utils.read_joblib(config.get('feature.' + self.feature_tp, 'datay'))

        train_input = utils.DataProducer(trainX, trainy, self.hp_params.batch_size,
                                         n_epochs=self.hp_params.n_epochs)
        val_input = utils.DataProducer(valX, valy, self.hp_params.batch_size*20, name='val')

        # select the malware samples of the validation set; they are perturbed
        # later (via perturbations_of_max_attack) to evaluate adversarial accuracy
        val_mal_indicator = (valy == 1.)
        val_malX = valX[val_mal_indicator]
        val_maly = valy[val_mal_indicator]

        # attack initialization
        for inner_maximizer in self.inner_maximizers:
            inner_maximizer.init_graph()

        # record information
        global_train_step = tf.train.get_or_create_global_step()
        saver = tf.train.Saver()
        tf.summary.scalar('accuracy_adv_train', self.accuracy)
        tf.summary.scalar('loss_adv_train', self.cross_entropy)
        merged_summaries = tf.summary.merge_all()

        # optimizers
        var_w = [var for var in tf.global_variables() if 'COMB_W' in var.name]
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            optimizer = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(self.cross_entropy,
                                                                                      global_step=global_train_step)
            optimizer_w = tf.train.AdamOptimizer(self.hp_params.learning_rate)
            updated_w = self.update_w(optimizer_w, var_w)

        tf_cfg = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        tf_cfg.gpu_options.allow_growth = True
        tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
        sess = tf.Session(config=tf_cfg)

        with sess.as_default():
            summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            training_time = 0.0
            train_input.reset_cursor()
            output_steps = 500
            best_avg_score_val = 0.
            for step_idx, X_batch, y_batch in train_input.next_batch():

                train_dict = {
                    self.x_input: X_batch,
                    self.y_input: y_batch,
                    self.is_training: True
                }

                if (step_idx + 1) % output_steps == 0:
                    print('Step {}/{}:{}'.format(step_idx + 1, train_input.steps, datetime.now()))
                    val_res_list = []
                    _adv_malX_list = []
                    val_input.reset_cursor()
                    for _, valX_batch, valy_batch in val_input.next_batch():
                        val_res_batch = sess.run([self.accuracy, self.y_pred],
                                                 feed_dict={self.x_input: valX_batch,
                                                            self.y_input: valy_batch,
                                                            self.is_training: False})
                        val_res_list.append(val_res_batch)

                        val_mal_indicator_batch = (valy_batch == 1.)
                        val_malX_batch = valX_batch[val_mal_indicator_batch]
                        val_maly_batch = valy_batch[val_mal_indicator_batch]
                        _adv_valX_batch = self.perturbations_of_max_attack(val_malX_batch, val_maly_batch, sess)
                        _adv_malX_list.append(_adv_valX_batch)
                    val_res = np.array(val_res_list, dtype=object)
                    _acc = np.mean(val_res[:, 0])
                    _pred_y = np.concatenate(val_res[:, 1])
                    from sklearn.metrics import f1_score
                    _f1_score = f1_score(valy, _pred_y[:valy.shape[0]])

                    _adv_valX = np.vstack(_adv_malX_list)[:val_maly.shape[0]]

                    _adv_acc_val = sess.run(self.accuracy, feed_dict={self.x_input: _adv_valX,
                                                                      self.y_input: val_maly,
                                                                      self.is_training: False})
                    _avg_score = (_f1_score + _adv_acc_val) / 2.
                    print('    validation accuracy {:.5}%'.format(_acc * 100))
                    print('    validation f1 score {:.5}%'.format(_f1_score * 100))
                    print('    validation accuracy on adversarial malware samples {:.5}%'.format(_adv_acc_val * 100))

                    if step_idx != 0:
                        print('    {} samples per second'.format(
                            output_steps * self.hp_params.batch_size / training_time))
                        training_time = 0.

                    summary = sess.run(merged_summaries, feed_dict=train_dict)
                    summary_writer.add_summary(summary, global_train_step.eval(sess))

                    if best_avg_score_val <= _avg_score:
                        best_avg_score_val = _avg_score
                        if not os.path.exists(self.save_dir):
                            os.makedirs(self.save_dir)
                        saver.save(sess, os.path.join(self.save_dir, 'checkpoint'),
                                   global_step=global_train_step)

                start = default_timer()
                sess.run(optimizer, feed_dict=train_dict)
                sess.run(updated_w, feed_dict=train_dict)
                end = default_timer()
                training_time = training_time + end - start
        sess.close()
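
    # Note: the `train` method below is the plain (non-adversarial) trainer,
    # apparently belonging to a separate model class in the original project;
    # it selects checkpoints on validation F1 alone rather than the averaged
    # F1 / adversarial-accuracy score used above.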
    def train(self, trainX=None, trainy=None, valX=None, valy=None):
        """train dnn"""
        if trainX is None or trainy is None or valX is None or valy is None:
            # load training dataset and validation dataset
            trainX, valX, _ = utils.read_joblib(
                config.get('feature.' + self.feature_tp, 'dataX'))
            trainy, valy, _ = utils.read_joblib(
                config.get('feature.' + self.feature_tp, 'datay'))

        train_input = utils.DataProducer(trainX,
                                         trainy,
                                         self.hp_params.batch_size,
                                         n_epochs=self.hp_params.n_epochs)
        val_input = utils.DataProducer(valX,
                                       valy,
                                       self.hp_params.batch_size,
                                       name='val')

        global_train_step = tf.train.get_or_create_global_step()
        saver = tf.train.Saver()
        tf.summary.scalar('accuracy', self.accuracy)
        tf.summary.scalar('loss', self.cross_entropy)
        merged_summaries = tf.summary.merge_all()

        # optimizer
        with tf.control_dependencies(tf.get_collection(
                tf.GraphKeys.UPDATE_OPS)):
            optimizer = tf.train.AdamOptimizer(
                self.hp_params.learning_rate).minimize(
                    self.cross_entropy, global_step=global_train_step)
        tf_cfg = tf.ConfigProto(log_device_placement=True,
                                allow_soft_placement=True)
        tf_cfg.gpu_options.allow_growth = True
        tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
        sess = tf.Session(config=tf_cfg)

        with sess.as_default():
            summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            training_time = 0.0
            train_input.reset_cursor()
            output_steps = 50
            best_f1_val = 0.
            for step_idx, X_batch, y_batch in train_input.next_batch():
                train_dict = {
                    self.x_input: X_batch,
                    self.y_input: y_batch,
                    self.is_training: True
                }

                if (step_idx + 1) % output_steps == 0:
                    print('Step {}/{}:{}'.format(step_idx + 1,
                                                 train_input.steps,
                                                 datetime.now()))
                    val_input.reset_cursor()
                    val_res_list = [
                        sess.run([self.accuracy, self.y_pred],
                                 feed_dict={self.x_input: valX_batch,
                                            self.y_input: valy_batch,
                                            self.is_training: False})
                        for _, valX_batch, valy_batch in val_input.next_batch()
                    ]
                    val_res = np.array(val_res_list, dtype=object)
                    _acc = np.mean(val_res[:, 0])
                    _pred_y = np.concatenate(val_res[:, 1])
                    from sklearn.metrics import f1_score
                    _f1_score = f1_score(valy, _pred_y[:valy.shape[0]])

                    print('    validation accuracy {:.5}%'.format(_acc * 100))
                    print('    validation f1 score {:.5}%'.format(_f1_score *
                                                                  100))

                    if step_idx != 0:
                        print('    {} samples per second'.format(
                            output_steps * self.hp_params.batch_size /
                            training_time))
                        training_time = 0.

                    summary = sess.run(merged_summaries, feed_dict=train_dict)
                    summary_writer.add_summary(summary,
                                               global_train_step.eval(sess))

                    if best_f1_val < _f1_score:
                        best_f1_val = _f1_score
                        if not os.path.exists(self.save_dir):
                            os.makedirs(self.save_dir)
                        saver.save(sess,
                                   os.path.join(self.save_dir, 'checkpoint'),
                                   global_step=global_train_step)

                start = default_timer()
                sess.run(optimizer, feed_dict=train_dict)
                end = default_timer()
                training_time = training_time + end - start
        sess.close()
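

# ---------------------------------------------------------------------------
# End-to-end usage sketch (not part of the original snippets). `DNNModel` is a
# placeholder for whichever project class owns the `train` method above; swap
# in the real class name and constructor arguments before running.
def _train_and_test_example(testX, testy):
    model = DNNModel()      # hypothetical constructor; assumed to build the graph
    model.train()           # falls back to utils.read_joblib data when no arrays are passed
    with tf.Session() as sess:
        tf.train.Saver().restore(sess, tf.train.latest_checkpoint(model.save_dir))
        return tester(sess, testX, testy, model, required_info='label')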