Beispiel #1
0
 def __init__(self, feature_mp='binary', use_default_feature=True):
     """
     Initialise the reverser and load the pickled feature meta-information.

     @param feature_mp: feature mapping type, forwarded to the parent constructor
     @param use_default_feature: forwarded to the parent constructor; when
         ``self.use_default_feature`` is true the normalizer, vocabulary and
         vocabulary info are located through the 'feature.<feature_type>'
         config section, otherwise they are read from the directory configured
         as experiments/surrogate_save_dir
     @raise IOError: if any of the three pickles cannot be loaded
     """
     # list() keeps the first-key lookup working on Python 3, where
     # dict.keys() returns a view that cannot be indexed.
     super(DrebinFeatureReverse,
           self).__init__(list(feature_type_scope_dict.keys())[0], feature_mp,
                          use_default_feature)
     # load feature information
     # (self.use_default_feature / self.feature_type are presumably set by the
     # parent constructor -- it is not visible from here)
     try:
         if self.use_default_feature:
             section = 'feature.' + self.feature_type
             self.normalizer = utils.read_pickle(
                 cfg.config.get(section, 'normalizer'))
             self.vocab = utils.read_pickle(
                 cfg.config.get(section, 'vocabulary'))
             self.vocab_info = utils.read_pickle(
                 cfg.config.get(section, 'vocab_info'))
         else:  # use surrogate feature meta-information
             surrogate_dir = cfg.config.get('experiments',
                                            'surrogate_save_dir')
             self.normalizer = utils.read_pickle(
                 os.path.join(surrogate_dir, 'normalizer'))
             self.vocab = utils.read_pickle(
                 os.path.join(surrogate_dir, 'vocabulary'))
             self.vocab_info = utils.read_pickle(
                 os.path.join(surrogate_dir, 'vocab_info'))
     except Exception as ex:
         # Log the underlying cause, then report a single, uniform error type.
         logger.error(str(ex))
         raise IOError("Unable to load meta-information of feature.")
    def __init__(self,
                 info_dict=None,
                 hyper_params_dict=None,
                 reuse=False,
                 name='SURROGATE'):
        """
        Build the surrogate model from its information and hyper-parameters.

        @param info_dict: model information; feature_type, feature_mapping_type,
            feature_utility_rate and dataset_dir are read from it through
            utils.ParamWrapper
        @param hyper_params_dict: hyper-parameters; hidden_units and output_dim
            are read from it through utils.ParamWrapper
        @param reuse: forwarded to the parent constructor
        @param name: model name, forwarded to the parent constructor

        NOTE: when info_dict or hyper_params_dict is None a message is printed
        and the constructor returns early, before any attribute is set and
        before the parent constructor runs.
        """
        if info_dict is None:
            print("Information of model should be provided.")
            return
        if hyper_params_dict is None:
            print("Hyper-parameters are needed.")
            return

        self.info_dict = info_dict
        self.info = utils.ParamWrapper(self.info_dict)

        self.hp_dict = hyper_params_dict
        self.hp_params = utils.ParamWrapper(self.hp_dict)

        self.feature_tp = self.info.feature_type
        self.feature_mp = self.info.feature_mapping_type
        self.feature_utility_rate = self.info.feature_utility_rate
        self.dataset_dir = self.info.dataset_dir

        self.name = name

        self.mal_dir = os.path.join(self.dataset_dir,
                                    config.get('dataset', 'malware_dir_name'))
        self.ben_dir = os.path.join(self.dataset_dir,
                                    config.get('dataset', 'benware_dir_name'))

        tmp_save_dir = config.get('experiments', 'surrogate_save_dir')
        if not os.path.exists(tmp_save_dir):
            # makedirs also creates missing parent directories, which a plain
            # os.mkdir would fail on.
            os.makedirs(tmp_save_dir)

        self.save_dir = tmp_save_dir

        # model necessaries: the input dimension is the size of the vocabulary
        # pickled in the save directory (to be updated in the future)
        self.input_dim = len(
            utils.read_pickle(os.path.join(self.save_dir, 'vocabulary')))
        self.hidden_layers = self.hp_params.hidden_units
        self.output_dim = self.hp_params.output_dim

        super(SurrogateModel, self).__init__(self.info_dict,
                                             self.hp_dict,
                                             reuse,
                                             is_saving=False,
                                             name=self.name)
def normalize_data(X, is_fitting=False, feature_type='drebin'):
    """
    Scale X with a min-max normalizer.

    @param X: data handed to the normalizer's fit/transform
    @param is_fitting: when true, fit a new MinMaxScaler on X and pickle it to
        the path configured as 'normalizer' under 'feature.<feature_type>';
        otherwise load the previously pickled normalizer from that path
    @param feature_type: suffix of the config section holding the path
    @return: the result of the normalizer's transform on X
    @raise ValueError: if not fitting and no pickled normalizer exists
    """
    section = 'feature.' + feature_type
    if is_fitting:
        scaler = MinMaxScaler().fit(X)
        utils.dump_pickle(scaler, config.get(section, 'normalizer'))
    else:
        saved_path = config.get(section, 'normalizer')
        if not os.path.exists(saved_path):
            raise ValueError("Unable to find the normalizer")
        scaler = utils.read_pickle(saved_path)
    return scaler.transform(X)
    def feature_extraction(self, apk_paths, is_ordering=True):
        """
        Extract features from applications and map them onto the vocabulary.

        @param apk_paths: the list of applications
        @param is_ordering: when true, apk_paths is passed to the preprocessing
            step so the returned features correspond to apk_paths
        @return: the mapped, normalized features ('count' mapping when
            self.feature_mp == 'count', binary mapping otherwise), or an empty
            numpy array when the configured vocabulary file does not exist
        """
        work_dir = os.path.join("/tmp", "apk_data")

        # Drop feature files left over from an earlier run (best effort).
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir, ignore_errors=True)

        get_droid_feature(apk_paths, work_dir, feature_type=self.feature_tp)
        mapper = FeatureMapping(work_dir, feature_type=self.feature_tp)
        if is_ordering:
            raw_features = mapper.preprocess_feature(is_ordering, apk_paths)
        else:
            raw_features = mapper.preprocess_feature()

        vocab_path = config.get('feature.' + self.feature_tp, 'vocabulary')
        if not os.path.exists(vocab_path):
            logger.warning("No vocabulary.")
            return np.array([])
        vocabulary = utils.read_pickle(vocab_path)

        if self.feature_mp == 'count':
            to_vectors = mapper.count_feature_mapping_normalized
        else:
            to_vectors = mapper.binary_feature_mapping_normalized
        return to_vectors(vocabulary, raw_features)
    def feature_extraction(self, apk_paths, inorder=True):
        """
        Extract features from applications using the vocabulary in save_dir.

        @param apk_paths: the list of applications
        @param inorder: when true, apk_paths is passed to the preprocessing
            step together with this flag
        @return: the mapped, normalized features ('count' mapping when
            self.feature_mp == 'count', binary mapping otherwise), or an empty
            numpy array when no 'vocabulary' file exists in self.save_dir
        """
        scratch_dir = os.path.join("/tmp", "apk_data")
        # Start from an empty scratch directory.
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)

        get_droid_feature(apk_paths, scratch_dir, feature_type=self.feature_tp)
        mapper = FeatureMapping(scratch_dir, feature_type=self.feature_tp)
        extracted = (mapper.preprocess_feature(inorder, apk_paths)
                     if inorder else mapper.preprocess_feature())

        vocab_file = os.path.join(self.save_dir, 'vocabulary')
        if not os.path.exists(vocab_file):
            logger.info("No vocabulary.")
            return np.array([])
        vocabulary = utils.read_pickle(vocab_file)

        if self.feature_mp == 'count':
            return mapper.count_feature_mapping_normalized(
                vocabulary, extracted)
        return mapper.binary_feature_mapping_normalized(vocabulary, extracted)
    def generate_exc_malware_sample(self,
                                    perturbations=None,
                                    adv_save_dir=None):
        """
        Modify the APKs based on the numerical perturbations.

        @param perturbations: numpy array whose number of rows is a multiple of
            len(self.attack_path_list)
        @param adv_save_dir: directory handed to modify_sample and from which
            the perturbed APKs are read back when self.check is set
        @return: (adv_features, perturbations) when self.check is set and the
            check could be carried out; (None, perturbations) when self.check
            is not set or the adversarial / pristine feature counts differ
        @raise ValueError: if self.check is set and the pristine data file
            does not exist
        """
        assert isinstance(perturbations, np.ndarray)
        assert perturbations.shape[0] % len(self.attack_path_list) == 0

        # A sample might have several perturbation vectors, so the path list is
        # repeated to give one path per row of `perturbations`.
        apk_paths = self.attack_path_list * (perturbations.shape[0] //
                                             len(self.attack_path_list))
        mod_instr = self.feature_reverser.generate_mod_instruction(
            apk_paths, perturbations)

        modify_sample(mod_instr, adv_save_dir, proc_number=4, vb=False)

        if not self.check:
            return None, perturbations

        # We check the perturbed APKs by comparing their feature representation
        # to the perturbed representation.
        # NOTE(review): the paths below are built from self.attack_path_list,
        # not from the repeated apk_paths -- confirm this is intended when a
        # sample has several perturbation vectors.
        adv_save_paths = [
            os.path.join(adv_save_dir, name_adv_file(apk) + '.apk')
            for apk in self.attack_path_list
        ]

        adv_features = self.targeted_model.feature_extraction(adv_save_paths)
        pris_data_path = os.path.join(
            cfg.config.get('attack', self.attack_method_name),
            "pristine_{}.data".format(
                method_params_dict[self.attack_method_name].get('ord', '')))
        if not os.path.exists(pris_data_path):
            raise ValueError("No pristine data.")
        pris_feature_vectors = utils.readdata_np(pris_data_path)

        if len(adv_features) != len(pris_feature_vectors):
            logger.warning(
                "Expect the same number of adversarial and pristine feature vectors ({} vs. {})"
                .format(len(adv_features), len(pris_feature_vectors)))
            return None, perturbations

        normalizer = self.feature_reverser.normalizer
        if normalizer is not None:
            # Compare in the de-normalized space, rounded to the nearest integer.
            _perturbations = np.rint(utils.normalize_inverse(adv_features, normalizer)) - \
                             np.rint(utils.normalize_inverse(pris_feature_vectors, normalizer))
        else:
            _perturbations = adv_features - pris_feature_vectors

        deviation = _perturbations - perturbations
        if np.all(np.abs(deviation) <= 5e-1):
            logger.info(
                "Perturbed APKs follow the generated perturbations exactly."
            )
            return adv_features, perturbations

        logger.warning(
            "Unable to perturb some components exactly as generated perturbations."
        )
        # NOTE(review): the overall check above tolerates 0.5 while the
        # per-element report below uses 1e-6 -- confirm the two thresholds are
        # meant to differ.
        unequal_pos = (abs(deviation) > 1e-6)
        vocab = utils.read_pickle(
            cfg.config.get('feature.' + self.targeted_model.feature_tp,
                           'vocabulary'))
        # Converted once here instead of once per reported row.
        vocab_array = np.array(vocab)
        for i, row_mask in enumerate(unequal_pos):
            if not np.any(row_mask):
                continue
            MSG_INFO = "Failed to perturb some features:"
            MSG_FILE = 'File name: {} with index {}'.format(apk_paths[i], i)
            MSG_res = 'Required perturbations {} vs. Resulting perturbations {} corresponds to elements:{}'
            MSG = MSG_INFO + '\n' + MSG_FILE + '\n' + \
                  MSG_res.format(perturbations[i, row_mask],
                                 _perturbations[i, row_mask],
                                 vocab_array[row_mask])
            logger.warning(MSG)
        return adv_features, perturbations
Beispiel #7
0
    def __init__(self,
                 info_dict=None,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='DADV_NN_ENSEMBLE_MAX'):
        """
        hardened deep ensemble incorporated with ''max'' attack and a diversifying method
        @param info_dict: model information; when None, ADV_ENS_INFO updated
            in place with INFO is used
        @param hyper_params: hyper parameters; when None, ADV_ENS_HP updated in
            place with MAX_ADV_TRAIN_HP and DNN_HP is used
        @param reuse: reuse the variables or not
        @param is_saving: option for saving weights
        @param init_graph: initialize graph
        @param mode: enable a mode for run the model, 'train' or 'test'
        @param name: model name
        @raise ValueError: if the feature type is not the first key of
            feature_type_scope_dict
        """
        self.is_saving = is_saving
        self.init_graph = init_graph
        self.mode = mode
        if info_dict is None:
            # NOTE: this mutates the module-level ADV_ENS_INFO dictionary.
            ADV_ENS_INFO.update(INFO)
            info_dict = ADV_ENS_INFO
        self.clf_info = utils.ParamWrapper(info_dict)
        if hyper_params is None:
            # NOTE: this mutates the module-level ADV_ENS_HP dictionary.
            ADV_ENS_HP.update(MAX_ADV_TRAIN_HP)
            ADV_ENS_HP.update(DNN_HP)
            hyper_params = ADV_ENS_HP
        self.hp_params = utils.ParamWrapper(hyper_params)
        self.model_name = name

        # one adversarial-training base model per known maximizer, plus a
        # basic DNN
        self.base_model_method = [AdversarialTrainingDNN
                                  ] * len(MAXIMIZER_METHOD_DICT)
        self.base_model_method.append(BasicDNNModel)
        self.base_model_count = len(self.base_model_method)
        assert self.base_model_count > 1, 'one base model at least'

        # initialization
        # NOTE(review): the attribute read here is `feature_tp`; the sibling
        # model classes read `feature_type` from their info wrapper -- confirm
        # which key the info dictionary actually carries.
        # list() keeps the first-key lookup working on Python 3, where
        # dict.keys() returns a view that cannot be indexed.
        if self.clf_info.feature_tp == list(feature_type_scope_dict.keys())[0]:
            self.normalizer = utils.read_pickle(
                config.get('feature.' + self.clf_info.feature_tp,
                           'normalizer'))
        else:
            raise ValueError("Feature type is incompatible.")
        input_dim = len(
            utils.read_pickle(
                config.get('feature.' + self.clf_info.feature_tp,
                           'vocabulary')))
        self.eta = self.hp_params.eta
        feature_reverser = DrebinFeatureReverse()
        allow_insert_array, allow_removal_array = feature_reverser.get_mod_array()

        # build attack graph: one inner maximizer (and its trial count) per
        # configured maximizer name
        maximizer_name_list = self.hp_params.maximizer_name_list
        self.inner_maximizers = []
        self.trial_list = []
        for maximizer_name in maximizer_name_list:
            maximizer_method = MAXIMIZER_METHOD_DICT[maximizer_name]
            maximizer_param = MAXIMIZER_PARAM_DICT[maximizer_name]
            inner_maximizer = maximizer_method(self,
                                               input_dim,
                                               allow_insert_array,
                                               allow_removal_array,
                                               self.normalizer,
                                               verbose=False,
                                               **maximizer_param)

            self.inner_maximizers.append(inner_maximizer)
            self.trial_list.append(self.hp_params.trials_dict[maximizer_name])

        # record the number of malware examples in a training batch
        self.batch_size_mal = tf.Variable(0, dtype=tf.int64, trainable=False)

        super(DAdversarialDeepEnsembleMax, self).__init__(
            info_dict,
            hyper_params,
            reuse=reuse,
            is_saving=self.is_saving,
            init_graph=self.init_graph,
            mode=self.mode,
            name=name)
    def __init__(self,
                 info_dict=None,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='BASIC_DNN'):
        """
        build basic dnn model
        @param info_dict: model information; INFO is used when None
        @param hyper_params: hyper parameters; DNN_HP is used when None
        @param reuse: reuse the variables or not
        @param is_saving: option for saving weights
        @param init_graph: initialize graph
        @param mode: enable a mode for run the model, 'train' or 'test'
        @param name: model name
        @raise AssertionError: if mode is neither 'train' nor 'test'
        """
        super(BasicDNNModel, self).__init__()
        # model setup
        self.is_saving = is_saving
        self.init_graph = init_graph
        # Explicit check instead of `assert`, so the validation still runs
        # under `python -O`; the exception type is unchanged.
        if mode not in ('train', 'test'):
            raise AssertionError("'train' or 'test' mode, not others.")

        self.mode = mode
        self.info_dict = info_dict if info_dict is not None else INFO
        self.info = utils.ParamWrapper(self.info_dict)
        self.hp_params_dict = (hyper_params
                               if hyper_params is not None else DNN_HP)
        self.hp_params = utils.ParamWrapper(self.hp_params_dict)
        self.model_name = name

        if self.is_saving:
            # the save directory is configured under the lower-cased model name
            self.save_dir = config.get('experiments', name.lower())

        # feature extraction
        self.feature_tp = self.info.feature_type  # drebin
        self.feature_mp = self.info.feature_mapping_type  # binary
        self.dataset_dir = self.info.dataset_dir

        self.mal_dir = os.path.join(self.dataset_dir,
                                    config.get('dataset', 'malware_dir_name'))
        self.ben_dir = os.path.join(self.dataset_dir,
                                    config.get('dataset', 'benware_dir_name'))

        # Pre-process the data unless every required artefact already exists.
        # The generator keeps the original short-circuit behaviour: lookups
        # stop at the first missing file.
        feature_section = 'feature.' + self.feature_tp
        required_files = (
            (feature_section, 'dataX'),
            (feature_section, 'datay'),
            (feature_section, 'vocabulary'),
            (feature_section, 'normalizer'),
            ('dataset', 'name_list'),
        )
        if not all(
                os.path.exists(config.get(section, option))
                for section, option in required_files):
            self._data_preprocess()

        # obtain some hyper-parameters
        self.input_dim = len(
            utils.read_pickle(config.get(feature_section, 'vocabulary')))
        self.hidden_layers = self.hp_params.hidden_units
        self.output_dim = self.hp_params.output_dim
        tf.set_random_seed(self.hp_params.random_seed)
        if self.init_graph:
            self.model_graph(reuse=reuse)
Beispiel #9
0
    def __init__(self,
                 info_dict=None,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='ADV_TRAINING_DNN_MAX'):
        """
        hardened model incorporated with ''max'' attack
        @param info_dict: model information; INFO is used when None
        @param hyper_params: hyper parameters; when None, MAX_ADV_TRAIN_HP
            updated in place with DNN_HP is used
        @param reuse: reuse the variables or not
        @param is_saving: option for saving weights
        @param init_graph: initialize graph
        @param mode: enable a mode for run the model, 'train' or 'test'
        @param name: model name
        @raise ValueError: if the feature type is not the first key of
            feature_type_scope_dict
        """
        self.is_saving = is_saving
        self.init_graph = init_graph
        self.mode = mode
        if info_dict is None:
            info_dict = INFO
        # get useful information, this will be overwritten in the parent class
        self.info = utils.ParamWrapper(info_dict)
        self.feature_tp = self.info.feature_type

        if hyper_params is None:
            # NOTE: this mutates the module-level MAX_ADV_TRAIN_HP dictionary.
            MAX_ADV_TRAIN_HP.update(DNN_HP)
            hyper_params = MAX_ADV_TRAIN_HP
        self.hp_params = utils.ParamWrapper(hyper_params)

        # initialization
        # list() keeps the first-key lookup working on Python 3, where
        # dict.keys() returns a view that cannot be indexed.
        if self.feature_tp == list(feature_type_scope_dict.keys())[0]:
            self.normalizer = utils.read_pickle(
                config.get('feature.' + self.feature_tp, 'normalizer'))
        else:
            raise ValueError("Feature type is incompatible.")

        input_dim = len(
            utils.read_pickle(
                config.get('feature.' + self.feature_tp, 'vocabulary')))
        self.eta = self.hp_params.eta
        feature_reverser = DrebinFeatureReverse()
        allow_insert_array, allow_removal_array = \
            feature_reverser.get_mod_array()

        # build attack graph: one inner maximizer (and its trial count) per
        # configured maximizer name
        maximizer_name_list = self.hp_params.maximizer_name_list
        self.inner_maximizers = []
        self.trial_list = []
        for maximizer_name in maximizer_name_list:
            maximizer_method = MAXIMIZER_METHOD_DICT[maximizer_name]
            maximizer_param = MAXIMIZER_PARAM_DICT[maximizer_name]
            inner_maximizer = maximizer_method(self,
                                               input_dim,
                                               allow_insert_array,
                                               allow_removal_array,
                                               self.normalizer,
                                               verbose=False,
                                               **maximizer_param)

            self.inner_maximizers.append(inner_maximizer)
            self.trial_list.append(self.hp_params.trials_dict[maximizer_name])

        # record the number of malware examples in a training batch
        self.batch_size_mal = tf.Variable(0, dtype=tf.int64, trainable=False)
        super(AdversarialTrainingDNNMax,
              self).__init__(info_dict, hyper_params, reuse, self.is_saving,
                             self.init_graph, self.mode, name)
Beispiel #10
0
    def __init__(self,
                 info_dict=None,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='ADV_TRAINING_DNN'):
        """
        hardened model incorporated with adversarial training
        @param info_dict: model information; INFO is used when None
        @param hyper_params: hyper parameters; when None, ADV_TRAIN_HP updated
            in place with DNN_HP is used
        @param reuse: reuse the variables or not
        @param is_saving: option for saving weights
        @param init_graph: initialize graph
        @param mode: enable a mode for run the model, 'train' or 'test'
        @param name: model name; the upper-cased maximizer name is appended to
            it before it is passed to the parent constructor
        @raise ValueError: if the feature type is not the first key of
            feature_type_scope_dict
        """
        self.is_saving = is_saving
        self.init_graph = init_graph
        self.mode = mode
        if info_dict is None:
            info_dict = INFO
        # get useful information, this will be overwritten in other class
        self.info = utils.ParamWrapper(info_dict)
        self.feature_tp = self.info.feature_type

        if hyper_params is None:
            # NOTE: this mutates the module-level ADV_TRAIN_HP dictionary.
            ADV_TRAIN_HP.update(DNN_HP)
            # hyper_params contains information of using which attack
            hyper_params = ADV_TRAIN_HP
        self.hp_params = utils.ParamWrapper(hyper_params)

        # initialization
        # list() keeps the first-key lookup working on Python 3, where
        # dict.keys() returns a view that cannot be indexed.
        if self.feature_tp == list(feature_type_scope_dict.keys())[0]:
            self.normalizer = utils.read_pickle(
                config.get('feature.' + self.feature_tp, 'normalizer'))
        else:
            raise ValueError("Feature type is incompatible.")
        input_dim = len(
            utils.read_pickle(
                config.get('feature.' + self.feature_tp, 'vocabulary')))
        self.eta = self.hp_params.eta
        feature_reverser = DrebinFeatureReverse()
        allow_insert_array, allow_removal_array = \
            feature_reverser.get_mod_array()

        # instantiate the single inner maximizer selected by the hyper-parameters
        inner_max_name = self.hp_params.maximizer_name
        inner_max_param = MAXIMIZER_PARAM_DICT[inner_max_name]
        self.inner_maximizer = MAXIMIZER_METHOD_DICT[inner_max_name](
            self,
            input_dim,
            allow_insert_array,
            allow_removal_array,
            self.normalizer,
            verbose=False,
            **inner_max_param)

        self.batch_size_mal = tf.Variable(0, dtype=tf.int64, trainable=False)
        super(AdversarialTrainingDNN,
              self).__init__(info_dict, hyper_params, reuse, self.is_saving,
                             self.init_graph, self.mode,
                             name + '_' + inner_max_name.upper())
Beispiel #11
0
def load_generated_problem(problem_name):
  """Load a pickled problem and fill in any attributes it lacks.

  The problem is read from GENERATED_DIR + problem_name + EXTENSION. Every
  instance attribute of a fresh ManipulationProblem(None) that the loaded
  object does not have is copied onto it, then the object is returned.
  """
  loaded = read_pickle(GENERATED_DIR + problem_name + EXTENSION)
  # TODO - older problems (ex 4tables) don't have all the problem attributes
  for key, default in vars(ManipulationProblem(None)).items():
    if hasattr(loaded, key):
      continue
    setattr(loaded, key, default)
  return loaded