def test_rpst(self, testX=None, testy=None, is_single_class=False):
        """Evaluate the saved model on a test set and print metrics.

        Restores the latest checkpoint from ``self.save_dir``, runs ``tester``
        to obtain predictions, prints accuracy / balanced accuracy, and (for
        binary classification) FPR, FNR and F1, which are also written as a
        TF summary under ``<save_dir>/eval``.

        :param testX: test features; when either argument is missing the
            held-out split is loaded from disk (see below)
        :param testy: test labels
        :param is_single_class: skip the binary confusion-matrix metrics
        :return: accuracy on the test set, or None if there is no data or
            no checkpoint
        """
        self.mode = 'test'
        # Use `or` so a half-provided (testX, testy) pair falls back to the
        # on-disk split instead of crashing later in accuracy_score; this
        # also matches the condition used by the train() methods.
        if testX is None or testy is None:
            _, _, testX = utils.read_joblib(
                config.get('feature.' + self.feature_tp, 'dataX'))
            _, _, testy = utils.read_joblib(
                config.get('feature.' + self.feature_tp, 'datay'))

        if len(testX) == 0:
            print("No test data.")
            return

        # rebuild the graph before restoring parameters
        tf.reset_default_graph()
        self.model_graph()

        cur_checkpoint = tf.train.latest_checkpoint(self.save_dir)
        if cur_checkpoint is None:
            print("No saved parameters")
            return
        # load parameters
        saver = tf.train.Saver()
        eval_dir = os.path.join(self.save_dir, 'eval')
        # context manager closes the session; no explicit close() needed
        with tf.Session() as sess:
            saver.restore(sess, cur_checkpoint)
            y_pred = tester(sess, testX, testy, self)

            from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, balanced_accuracy_score
            accuracy = accuracy_score(testy, y_pred)
            b_accuracy = balanced_accuracy_score(testy, y_pred)

            MSG = "The accuracy on the test dataset is {:.5f}%"
            print(MSG.format(accuracy * 100))
            MSG = "The balanced accuracy on the test dataset is {:.5f}%"
            print(MSG.format(b_accuracy * 100))
            if not is_single_class:
                # binary-classification breakdown (label 1 = positive class)
                tn, fp, fn, tp = confusion_matrix(testy, y_pred).ravel()

                fpr = fp / float(tn + fp)
                fnr = fn / float(tp + fn)
                f1 = f1_score(testy, y_pred, average='binary')
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='accuracy', simple_value=accuracy),
                    tf.Summary.Value(tag='f1 score', simple_value=f1)
                ])

                # makedirs tolerates a missing parent, unlike os.mkdir
                if not os.path.exists(eval_dir):
                    os.makedirs(eval_dir)
                summary_writer = tf.summary.FileWriter(eval_dir)
                summary_writer.add_summary(summary)
                # close() flushes the event file so the summary is not lost
                summary_writer.close()

                print("Other evaluation metrics we may need:")
                MSG = "False Negative Rate (FNR) is {:.5f}%, False Positive Rate (FPR) is {:.5f}%, F1 score is {:.5f}%"
                print(MSG.format(fnr * 100, fpr * 100, f1 * 100))
        return accuracy
# --- Beispiel #2 (scraper artifact: example separator from the original source, kept as a comment) ---
    def train(self, trainX = None, trainy = None, valX = None, valy = None):
        """Adversarially train the deep ensemble.

        Runs mini-batch training with periodic validation; at each
        validation point, malware samples from the validation set are
        perturbed by the max-attack over the inner maximizers, and a
        checkpoint is saved whenever the mean of the validation F1 score
        and the accuracy on adversarial malware improves.

        :param trainX: training features; all four arguments are loaded
            from disk when any of them is None
        :param trainy: training labels (1. marks malware)
        :param valX: validation features
        :param valy: validation labels
        """
        if trainX is None or trainy is None or valX is None or valy is None:
            # fall back to the preprocessed train/validation splits on disk
            trainX, valX, _ = utils.read_joblib(config.get('feature.' + self.feature_tp, 'dataX'))
            trainy, valy, _ = utils.read_joblib(config.get('feature.' + self.feature_tp, 'datay'))

        # validation batches are 20x larger since no gradient step is taken
        train_input = utils.DataProducer(trainX, trainy,self.hp_params.batch_size, n_epochs=self.hp_params.n_epochs)
        val_input = utils.DataProducer(valX, valy, self.hp_params.batch_size*20, name='val')

        # perturb the malware representations: keep only the malware
        # subset of the validation data for the adversarial evaluation
        val_mal_indicator = (valy == 1.)
        val_malX = valX[val_mal_indicator]
        val_maly = valy[val_mal_indicator]

        # attack initialization: each inner maximizer builds its attack
        # ops into the current graph before the session starts
        for inner_maximizer in self.inner_maximizers:
            inner_maximizer.init_graph()

        # record information
        global_train_step = tf.train.get_or_create_global_step()
        saver = tf.train.Saver()
        tf.summary.scalar('accuracy_adv_train', self.accuracy)
        tf.summary.scalar('loss_adv_train', self.cross_entropy)
        merged_summaries = tf.summary.merge_all()

        # optimizers: the ensemble combination weights ('COMB_W') get a
        # separate update op from the main cross-entropy minimizer
        var_w = [var for var in tf.global_variables() if 'COMB_W' in var.name]
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            optimizer = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(self.cross_entropy,
                                                                                      global_step=global_train_step)
            optimizer_w = tf.train.AdamOptimizer(self.hp_params.learning_rate)
            updated_w = self.update_w(optimizer_w, var_w)

        tf_cfg = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        tf_cfg.gpu_options.allow_growth = True
        tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
        sess = tf.Session(config=tf_cfg)

        with sess.as_default():
            summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            training_time = 0.0
            train_input.reset_cursor()
            output_steps = 500
            best_avg_score_val = 0.
            for step_idx, X_batch, y_batch in train_input.next_batch():

                train_dict = {
                    self.x_input: X_batch,
                    self.y_input: y_batch,
                    self.is_training: True
                }

                # periodic validation every `output_steps` steps
                if (step_idx + 1) % output_steps == 0:
                    print('Step {}/{}:{}'.format(step_idx + 1, train_input.steps, datetime.now()))
                    val_res_list = []
                    _adv_malX_list = []
                    val_input.reset_cursor()
                    for _, valX_batch, valy_batch in val_input.next_batch():
                        val_res_batch = sess.run([self.accuracy, self.y_pred],
                                                 feed_dict={self.x_input: valX_batch,
                                                            self.y_input: valy_batch,
                                                            self.is_training: False})
                        val_res_list.append(val_res_batch)

                        # craft adversarial versions of this batch's malware
                        val_mal_indicator_batch = (valy_batch == 1.)
                        val_malX_batch = valX_batch[val_mal_indicator_batch]
                        val_maly_batch = valy_batch[val_mal_indicator_batch]
                        _adv_valX_batch = self.perturbations_of_max_attack(val_malX_batch, val_maly_batch, sess)
                        _adv_malX_list.append(_adv_valX_batch)
                    val_res = np.array(val_res_list, dtype=object)
                    _acc = np.mean(val_res[:, 0])
                    _pred_y = np.concatenate(val_res[:, 1])
                    from sklearn.metrics import f1_score
                    # truncation presumably drops padding added by the last
                    # DataProducer batch -- TODO confirm against DataProducer
                    _f1_score = f1_score(valy, _pred_y[:valy.shape[0]])

                    _adv_valX = np.vstack(_adv_malX_list)[:val_maly.shape[0]]

                    # accuracy of the ensemble on the adversarial malware
                    _adv_acc_val = sess.run(self.accuracy, feed_dict={self.x_input: _adv_valX,
                                                                      self.y_input: val_maly,
                                                                      self.is_training: False})
                    # model-selection score: mean of clean F1 and adv. accuracy
                    _avg_score = (_f1_score + _adv_acc_val) / 2.
                    print('    validation accuracy {:.5}%'.format(_acc * 100))
                    print('    validation f1 score {:.5}%'.format(_f1_score * 100))
                    print('    validation accuracy on adversarial malware samples {:.5}%'.format(_adv_acc_val * 100))

                    if step_idx != 0:
                        print('    {} samples per second'.format(
                            output_steps * self.hp_params.batch_size / training_time))
                        training_time = 0.

                    summary = sess.run(merged_summaries, feed_dict=train_dict)
                    summary_writer.add_summary(summary, global_train_step.eval(sess))

                    # checkpoint on ties too (<=) so the latest equally-good
                    # parameters are the ones kept
                    if best_avg_score_val <= _avg_score:
                        best_avg_score_val = _avg_score
                        if not os.path.exists(self.save_dir):
                            os.makedirs(self.save_dir)
                        saver.save(sess, os.path.join(self.save_dir, 'checkpoint'),
                                   global_step=global_train_step)

                # timed region: main gradient step, then the separate
                # combination-weight update on the same feed
                start = default_timer()
                sess.run(optimizer, feed_dict=train_dict)
                sess.run(updated_w, feed_dict=train_dict)
                end = default_timer()
                training_time = training_time + end - start
        sess.close()
    def train(self, trainX=None, trainy=None, valX=None, valy=None):
        """Train the DNN with mini-batch SGD (Adam).

        Evaluates on the validation set every ``output_steps`` steps and
        checkpoints whenever the validation F1 score improves.

        :param trainX: training features; all four arguments are loaded
            from disk when any of them is None
        :param trainy: training labels
        :param valX: validation features
        :param valy: validation labels
        """
        if trainX is None or trainy is None or valX is None or valy is None:
            # load training dataset and validation dataset
            trainX, valX, _ = utils.read_joblib(
                config.get('feature.' + self.feature_tp, 'dataX'))
            trainy, valy, _ = utils.read_joblib(
                config.get('feature.' + self.feature_tp, 'datay'))

        train_input = utils.DataProducer(trainX,
                                         trainy,
                                         self.hp_params.batch_size,
                                         n_epochs=self.hp_params.n_epochs)
        val_input = utils.DataProducer(valX,
                                       valy,
                                       self.hp_params.batch_size,
                                       name='val')

        # bookkeeping ops: global step, checkpointing, scalar summaries
        global_train_step = tf.train.get_or_create_global_step()
        saver = tf.train.Saver()
        tf.summary.scalar('accuracy', self.accuracy)
        tf.summary.scalar('loss', self.cross_entropy)
        merged_summaries = tf.summary.merge_all()

        # optimizer
        # UPDATE_OPS dependency ensures batch-norm style update ops run
        # before each gradient step
        with tf.control_dependencies(tf.get_collection(
                tf.GraphKeys.UPDATE_OPS)):
            optimizer = tf.train.AdamOptimizer(
                self.hp_params.learning_rate).minimize(
                    self.cross_entropy, global_step=global_train_step)
        tf_cfg = tf.ConfigProto(log_device_placement=True,
                                allow_soft_placement=True)
        tf_cfg.gpu_options.allow_growth = True
        tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
        sess = tf.Session(config=tf_cfg)

        with sess.as_default():
            summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            training_time = 0.0
            train_input.reset_cursor()
            output_steps = 50
            best_f1_val = 0.
            for step_idx, X_batch, y_batch in train_input.next_batch():
                train_dict = {
                    self.x_input: X_batch,
                    self.y_input: y_batch,
                    self.is_training: True
                }

                # periodic validation every `output_steps` steps
                if (step_idx + 1) % output_steps == 0:
                    print('Step {}/{}:{}'.format(step_idx + 1,
                                                 train_input.steps,
                                                 datetime.now()))
                    val_input.reset_cursor()
                    # collect [accuracy, predictions] per validation batch
                    val_res_list = [sess.run([self.accuracy, self.y_pred], feed_dict={self.x_input: valX_batch,
                                                                                      self.y_input: valy_batch,
                                                                                      self.is_training: False}) \
                                    for [_, valX_batch, valy_batch] in val_input.next_batch()
                                    ]
                    val_res = np.array(val_res_list, dtype=object)
                    _acc = np.mean(val_res[:, 0])
                    _pred_y = np.concatenate(val_res[:, 1])
                    from sklearn.metrics import f1_score
                    # truncation presumably drops padding added by the last
                    # DataProducer batch -- TODO confirm against DataProducer
                    _f1_score = f1_score(valy, _pred_y[:valy.shape[0]])

                    print('    validation accuracy {:.5}%'.format(_acc * 100))
                    print('    validation f1 score {:.5}%'.format(_f1_score *
                                                                  100))

                    if step_idx != 0:
                        print('    {} samples per second'.format(
                            output_steps * self.hp_params.batch_size /
                            training_time))
                        training_time = 0.

                    summary = sess.run(merged_summaries, feed_dict=train_dict)
                    summary_writer.add_summary(summary,
                                               global_train_step.eval(sess))

                    # checkpoint only on strict improvement of validation F1
                    if best_f1_val < _f1_score:
                        best_f1_val = _f1_score
                        if not os.path.exists(self.save_dir):
                            os.makedirs(self.save_dir)
                        saver.save(sess,
                                   os.path.join(self.save_dir, 'checkpoint'),
                                   global_step=global_train_step)

                # timed region: one Adam step on the current batch
                start = default_timer()
                sess.run(optimizer, feed_dict=train_dict)
                end = default_timer()
                training_time = training_time + end - start
        sess.close()