def __init__(self, feature_mp='binary', use_default_feature=True):
    """
    Initialise the Drebin feature reverser and load its feature meta-information.

    The normalizer, vocabulary and vocabulary info are un-pickled either from
    the paths configured in the ``feature.<feature_type>`` section (default),
    or from files of the same names inside the configured surrogate save
    directory.

    @param feature_mp: feature mapping type, forwarded to the parent constructor
    @param use_default_feature: True to load the configured default
        meta-information, False to load the surrogate model's one
    @raise IOError: if any piece of meta-information cannot be loaded
    """
    # ``dict.keys()`` returns a non-subscriptable view on Python 3, so
    # ``keys()[0]`` raises TypeError there. ``list(d)[0]`` yields the same first
    # key on Python 2 and 3 and still raises IndexError for an empty dict.
    super(DrebinFeatureReverse, self).__init__(
        list(feature_type_scope_dict)[0], feature_mp, use_default_feature)

    # load feature information
    # (self.use_default_feature / self.feature_type are presumably set by the
    # parent constructor -- verify against the base class)
    try:
        if self.use_default_feature:
            section = 'feature.' + self.feature_type

            def _locate(item):
                # path configured for this feature type
                return cfg.config.get(section, item)
        else:
            # use surrogate feature meta-information
            def _locate(item):
                return os.path.join(
                    cfg.config.get('experiments', 'surrogate_save_dir'), item)

        self.normalizer = utils.read_pickle(_locate('normalizer'))
        self.vocab = utils.read_pickle(_locate('vocabulary'))
        self.vocab_info = utils.read_pickle(_locate('vocab_info'))
    except Exception as ex:
        # Any failure (missing option, missing file, bad pickle) is logged and
        # reported to the caller as a single IOError.
        logger.error(str(ex))
        raise IOError("Unable to load meta-information of feature.")
def __init__(self,
             info_dict=None,
             hyper_params_dict=None,
             reuse=False,
             name='SURROGATE'):
    """
    Build the surrogate model from model information and hyper-parameters.

    If either ``info_dict`` or ``hyper_params_dict`` is None a message is
    printed and the constructor returns early, leaving the instance without
    any of the attributes set below (the parent constructor is not called).

    @param info_dict: model information (feature type, mapping type, dataset dir, ...)
    @param hyper_params_dict: hyper-parameters (hidden units, output dim, ...)
    @param reuse: forwarded to the parent constructor
    @param name: model name
    """
    if info_dict is None:
        print("Information of model should be provided.")
        return
    if hyper_params_dict is None:
        print("Hyper-parameters are needed.")
        return

    self.info_dict = info_dict
    self.info = utils.ParamWrapper(self.info_dict)
    self.hp_dict = hyper_params_dict
    self.hp_params = utils.ParamWrapper(self.hp_dict)

    self.feature_tp = self.info.feature_type
    self.feature_mp = self.info.feature_mapping_type
    self.feature_utility_rate = self.info.feature_utility_rate
    self.dataset_dir = self.info.dataset_dir
    self.name = name

    self.mal_dir = os.path.join(self.dataset_dir,
                                config.get('dataset', 'malware_dir_name'))
    self.ben_dir = os.path.join(self.dataset_dir,
                                config.get('dataset', 'benware_dir_name'))

    tmp_save_dir = config.get('experiments', 'surrogate_save_dir')
    if not os.path.exists(tmp_save_dir):
        # makedirs (rather than mkdir) also creates missing parent
        # directories, so a nested save path no longer raises OSError.
        os.makedirs(tmp_save_dir)
    self.save_dir = tmp_save_dir
    # self._data_preprocess()

    # model necessaries
    # The input dimension is the size of the vocabulary pickled in the save
    # directory, so that file must already exist at this point.
    self.input_dim = len(
        utils.read_pickle(os.path.join(self.save_dir,
                                       'vocabulary')))  # update in the future
    self.hidden_layers = self.hp_params.hidden_units
    self.output_dim = self.hp_params.output_dim
    # self.model_graph()

    super(SurrogateModel, self).__init__(self.info_dict,
                                         self.hp_dict,
                                         reuse,
                                         is_saving=False,
                                         name=self.name)
def normalize_data(X, is_fitting=False, feature_type='drebin'):
    """
    Scale ``X`` with a min-max normalizer.

    @param X: data to normalize
    @param is_fitting: if True, fit a fresh MinMaxScaler on ``X`` and pickle it
        to the configured normalizer path; otherwise load the pickled one
    @param feature_type: selects the ``feature.<feature_type>`` config section
    @return: ``X`` transformed by the normalizer
    @raise ValueError: if not fitting and no normalizer file exists
    """
    section = 'feature.' + feature_type
    if is_fitting:
        normalizer = MinMaxScaler().fit(X)
        utils.dump_pickle(normalizer, config.get(section, 'normalizer'))
    else:
        normalizer_path = config.get(section, 'normalizer')
        if not os.path.exists(normalizer_path):
            raise ValueError("Unable to find the normalizer")
        normalizer = utils.read_pickle(normalizer_path)
    return normalizer.transform(X)
def feature_extraction(self, apk_paths, is_ordering=True):
    """
    Extract features of the given applications and map them onto the
    configured vocabulary.

    @param apk_paths: the list of applications
    @param is_ordering: return the list of features corresponds to the apk_paths
    @return: the normalized count or binary feature mapping (depending on
        ``self.feature_mp``), or an empty array when no vocabulary file exists
    """
    feature_save_dir = os.path.join("/tmp", "apk_data")
    if os.path.exists(feature_save_dir):
        # start from a clean directory: drop feature files of earlier runs
        shutil.rmtree(feature_save_dir, ignore_errors=True)

    get_droid_feature(apk_paths,
                      feature_save_dir,
                      feature_type=self.feature_tp)
    mapper = FeatureMapping(feature_save_dir, feature_type=self.feature_tp)
    if is_ordering:
        raw_features = mapper.preprocess_feature(is_ordering, apk_paths)
    else:
        raw_features = mapper.preprocess_feature()

    vocab_path = config.get('feature.' + self.feature_tp, 'vocabulary')
    if not os.path.exists(vocab_path):
        logger.warning("No vocabulary.")
        return np.array([])
    vocabulary = utils.read_pickle(vocab_path)

    if self.feature_mp == 'count':
        to_vectors = mapper.count_feature_mapping_normalized
    else:
        to_vectors = mapper.binary_feature_mapping_normalized
    return to_vectors(vocabulary, raw_features)
def feature_extraction(self, apk_paths, inorder=True):
    """
    Extract features of the given applications and map them onto the
    vocabulary stored in ``self.save_dir``.

    @param apk_paths: the list of applications
    @param inorder: if true, the feature pre-processing is asked to follow apk_paths
    @return: the normalized count or binary feature mapping (depending on
        ``self.feature_mp``), or an empty array when no vocabulary file exists
    """
    feat_save_dir = os.path.join("/tmp", "apk_data")
    # remove feature files left over from an earlier extraction
    if os.path.exists(feat_save_dir):
        shutil.rmtree(feat_save_dir)

    get_droid_feature(apk_paths, feat_save_dir, feature_type=self.feature_tp)
    mapper = FeatureMapping(feat_save_dir, feature_type=self.feature_tp)
    raw_features = (mapper.preprocess_feature(inorder, apk_paths)
                    if inorder else mapper.preprocess_feature())

    vocab_path = os.path.join(self.save_dir, 'vocabulary')
    if os.path.exists(vocab_path):
        vocabulary = utils.read_pickle(vocab_path)
        if self.feature_mp == 'count':
            return mapper.count_feature_mapping_normalized(
                vocabulary, raw_features)
        return mapper.binary_feature_mapping_normalized(
            vocabulary, raw_features)

    logger.info("No vocabulary.")
    return np.array([])
def generate_exc_malware_sample(self, perturbations=None, adv_save_dir=None):
    """
    Modify the apks according to the numerical perturbations.

    @param perturbations: np.ndarray of perturbation vectors; its first
        dimension must be a multiple of the number of attacked apks
    @param adv_save_dir: directory handed to the apk modification step and
        used to locate the resulting adversarial apks
    @return: (adversarial features, perturbations) when ``self.check`` is set
        and the check could be carried out, otherwise (None, perturbations)
    @raise ValueError: if checking is enabled and the pristine data file is missing
    """
    assert isinstance(perturbations, np.ndarray)
    assert perturbations.shape[0] % len(self.attack_path_list) == 0

    # Sample might have several perturbation vectors
    repeats = perturbations.shape[0] // len(self.attack_path_list)
    apk_paths = self.attack_path_list * repeats
    mod_instr = self.feature_reverser.generate_mod_instruction(
        apk_paths, perturbations)

    modify_sample(mod_instr, adv_save_dir, proc_number=4, vb=False)

    if not self.check:
        return None, perturbations

    # We check the perturbed APKs by comparing their feature representation
    # to the perturbed representation
    adv_save_paths = [
        os.path.join(adv_save_dir, name_adv_file(apk) + '.apk')
        for apk in self.attack_path_list
    ]
    adv_features = self.targeted_model.feature_extraction(adv_save_paths)

    pris_file_name = "pristine_{}.data".format(
        method_params_dict[self.attack_method_name].get('ord', ''))
    pris_data_path = os.path.join(
        cfg.config.get('attack', self.attack_method_name), pris_file_name)
    if not os.path.exists(pris_data_path):
        raise ValueError("No pristine data.")
    pris_feature_vectors = utils.readdata_np(pris_data_path)

    if len(adv_features) != len(pris_feature_vectors):
        logger.warning(
            "Expect the same number of adversarial and pristine feature vectors ({} vs. {})"
            .format(len(adv_features), len(pris_feature_vectors)))
        return None, perturbations

    normalizer = self.feature_reverser.normalizer
    if normalizer is None:
        actual_perturbations = adv_features - pris_feature_vectors
    else:
        # compare in the de-normalized, rounded feature space
        actual_perturbations = \
            np.rint(utils.normalize_inverse(adv_features, normalizer)) - \
            np.rint(utils.normalize_inverse(pris_feature_vectors, normalizer))

    deviation = np.abs(actual_perturbations - perturbations)
    if np.all(deviation <= 5e-1):
        logger.info(
            "Perturbed APKs follow the generated perturbations exactly.")
        return adv_features, perturbations

    logger.warning(
        "Unable to perturb some components exactly as generated perturbations."
    )
    # report every component that deviates at all (tighter tolerance than
    # the 0.5 used for the overall verdict above)
    unequal_pos = deviation > 1e-6
    vocab = utils.read_pickle(
        cfg.config.get('feature.' + self.targeted_model.feature_tp,
                       'vocabulary'))
    vocab_array = np.array(vocab)
    result_template = 'Required perturbations {} vs. Resulting perturbations {} corresponds to elements:{}'
    for idx, row_mask in enumerate(unequal_pos):
        if not np.any(row_mask):
            continue
        report = '\n'.join([
            "Failed to perturb some features:",
            'File name: {} with index {}'.format(apk_paths[idx], idx),
            result_template.format(perturbations[idx, row_mask],
                                   actual_perturbations[idx, row_mask],
                                   vocab_array[row_mask]),
        ])
        logger.warning(report)
    return adv_features, perturbations
def __init__(self,
             info_dict=None,
             hyper_params=None,
             reuse=False,
             is_saving=True,
             init_graph=True,
             mode='train',
             name='DADV_NN_ENSEMBLE_MAX'):
    """
    hardened deep ensemble incorporated with ''max'' attack and a diversifying method
    @param info_dict: model information; when None, ADV_ENS_INFO (updated with INFO) is used
    @param hyper_params: hyper parameters; when None, ADV_ENS_HP (updated with
        MAX_ADV_TRAIN_HP and DNN_HP) is used
    @param reuse: reuse the variables or not
    @param is_saving: option for saving weights
    @param init_graph: initialize graph
    @param mode: enable a mode for run the model, 'train' or 'test'
    @param name: model name
    @raise ValueError: if the configured feature type is not the supported one
    """
    self.is_saving = is_saving
    self.init_graph = init_graph
    self.mode = mode

    if info_dict is None:
        # NOTE: the defaults are merged in place into the shared ADV_ENS_INFO
        # dict defined outside this method.
        ADV_ENS_INFO.update(INFO)
        info_dict = ADV_ENS_INFO
    self.clf_info = utils.ParamWrapper(info_dict)

    if hyper_params is None:
        # NOTE: same in-place merge for the shared ADV_ENS_HP dict.
        ADV_ENS_HP.update(MAX_ADV_TRAIN_HP)
        ADV_ENS_HP.update(DNN_HP)
        hyper_params = ADV_ENS_HP
    self.hp_params = utils.ParamWrapper(hyper_params)
    self.model_name = name

    # one adversarially trained base model per known maximizer, plus a basic DNN
    self.base_model_method = [AdversarialTrainingDNN] * len(MAXIMIZER_METHOD_DICT)
    self.base_model_method.append(BasicDNNModel)
    self.base_model_count = len(self.base_model_method)
    assert self.base_model_count > 1, 'one base model at least'

    # initialization
    # ``dict.keys()`` is a non-subscriptable view on Python 3, so the former
    # ``keys()[0]`` raised TypeError there; ``list(d)[0]`` yields the same
    # first key on both Python 2 and 3.
    if self.clf_info.feature_tp == list(feature_type_scope_dict)[0]:
        self.normalizer = utils.read_pickle(
            config.get('feature.' + self.clf_info.feature_tp, 'normalizer'))
    else:
        raise ValueError("Feature type is incompatible.")

    input_dim = len(
        utils.read_pickle(
            config.get('feature.' + self.clf_info.feature_tp, 'vocabulary')))
    self.eta = self.hp_params.eta
    feature_reverser = DrebinFeatureReverse()
    allow_insert_array, allow_removal_array = feature_reverser.get_mod_array()

    # build attack graph
    maximizer_name_list = self.hp_params.maximizer_name_list
    self.inner_maximizers = []
    self.trial_list = []
    for maximizer_name in maximizer_name_list:
        maximizer_method = MAXIMIZER_METHOD_DICT[maximizer_name]
        maximizer_param = MAXIMIZER_PARAM_DICT[maximizer_name]
        # the model itself is handed to the maximizer as its first argument
        inner_maximizer = maximizer_method(self,
                                           input_dim,
                                           allow_insert_array,
                                           allow_removal_array,
                                           self.normalizer,
                                           verbose=False,
                                           **maximizer_param)
        self.inner_maximizers.append(inner_maximizer)
        self.trial_list.append(self.hp_params.trials_dict[maximizer_name])

    # record the number of malware examples in a training batch
    self.batch_size_mal = tf.Variable(0, dtype=tf.int64, trainable=False)

    super(DAdversarialDeepEnsembleMax, self).__init__(info_dict,
                                                      hyper_params,
                                                      reuse=reuse,
                                                      is_saving=self.is_saving,
                                                      init_graph=self.init_graph,
                                                      mode=self.mode,
                                                      name=name)
def __init__(self,
             info_dict=None,
             hyper_params=None,
             reuse=False,
             is_saving=True,
             init_graph=True,
             mode='train',
             name='BASIC_DNN'):
    """
    build basic dnn model
    @param info_dict: model information; INFO is used when None
    @param hyper_params: hyper parameters; DNN_HP is used when None
    @param reuse: reuse the variables or not
    @param is_saving: option for saving weights
    @param init_graph: initialize graph
    @param mode: enable a mode for run the model, 'train' or 'test'
    @param name: model name
    @raise AssertionError: if ``mode`` is neither 'train' nor 'test'
    """
    super(BasicDNNModel, self).__init__()
    # model setup
    self.is_saving = is_saving
    self.init_graph = init_graph

    # Explicit check instead of ``assert`` wrapped in a bare ``except``:
    # asserts are stripped under ``python -O``, which silently disabled this
    # validation. The exception type is kept for existing callers.
    if mode not in ('train', 'test'):
        raise AssertionError("'train' or 'test' mode, not others.")
    self.mode = mode

    self.info_dict = info_dict if info_dict is not None else INFO
    self.info = utils.ParamWrapper(self.info_dict)

    self.hp_params_dict = hyper_params if hyper_params is not None else DNN_HP
    self.hp_params = utils.ParamWrapper(self.hp_params_dict)

    self.model_name = name
    if self.is_saving:
        # save_dir is only defined when weights are to be saved
        self.save_dir = config.get('experiments', name.lower())

    # feature extraction
    self.feature_tp = self.info.feature_type  # drebin
    self.feature_mp = self.info.feature_mapping_type  # binary
    self.dataset_dir = self.info.dataset_dir
    self.mal_dir = os.path.join(self.dataset_dir,
                                config.get('dataset', 'malware_dir_name'))
    self.ben_dir = os.path.join(self.dataset_dir,
                                config.get('dataset', 'benware_dir_name'))

    # Pre-process the data unless every required artefact already exists.
    # The generator keeps the lookups lazy and in the original order.
    feature_section = 'feature.' + self.feature_tp
    required_artefacts = (
        (feature_section, 'dataX'),
        (feature_section, 'datay'),
        (feature_section, 'vocabulary'),
        (feature_section, 'normalizer'),
        ('dataset', 'name_list'),
    )
    if not all(
            os.path.exists(config.get(section, option))
            for section, option in required_artefacts):
        self._data_preprocess()

    # obtain some hyper-parameters
    self.input_dim = len(
        utils.read_pickle(config.get(feature_section, 'vocabulary')))
    self.hidden_layers = self.hp_params.hidden_units
    self.output_dim = self.hp_params.output_dim
    tf.set_random_seed(self.hp_params.random_seed)
    if self.init_graph:
        self.model_graph(reuse=reuse)
def __init__(self,
             info_dict=None,
             hyper_params=None,
             reuse=False,
             is_saving=True,
             init_graph=True,
             mode='train',
             name='ADV_TRAINING_DNN_MAX'):
    """
    hardened model incorporated with ''max'' attack
    @param info_dict: model information; INFO is used when None
    @param hyper_params: hyper parameters; when None, MAX_ADV_TRAIN_HP
        (updated with DNN_HP) is used
    @param reuse: reuse the variables or not
    @param is_saving: option for saving weights
    @param init_graph: initialize graph
    @param mode: enable a mode for run the model, 'train' or 'test'
    @param name: model name
    @raise ValueError: if the configured feature type is not the supported one
    """
    self.is_saving = is_saving
    self.init_graph = init_graph
    self.mode = mode

    if info_dict is None:
        info_dict = INFO
    # get useful information; per the original author's note these attributes
    # are overwritten again in the parent class
    self.info = utils.ParamWrapper(info_dict)
    self.feature_tp = self.info.feature_type

    if hyper_params is None:
        # NOTE: DNN_HP is merged in place into the shared MAX_ADV_TRAIN_HP dict
        MAX_ADV_TRAIN_HP.update(DNN_HP)
        hyper_params = MAX_ADV_TRAIN_HP
    self.hp_params = utils.ParamWrapper(hyper_params)

    # initialization
    # ``dict.keys()`` is a non-subscriptable view on Python 3, so the former
    # ``keys()[0]`` raised TypeError there; ``list(d)[0]`` yields the same
    # first key on both Python 2 and 3.
    if self.feature_tp == list(feature_type_scope_dict)[0]:
        self.normalizer = utils.read_pickle(
            config.get('feature.' + self.feature_tp, 'normalizer'))
    else:
        raise ValueError("Feature type is incompatible.")

    input_dim = len(
        utils.read_pickle(
            config.get('feature.' + self.feature_tp, 'vocabulary')))
    self.eta = self.hp_params.eta
    feature_reverser = DrebinFeatureReverse()
    allow_insert_array, allow_removal_array = feature_reverser.get_mod_array()

    # build attack graph
    maximizer_name_list = self.hp_params.maximizer_name_list
    self.inner_maximizers = []
    self.trial_list = []
    for maximizer_name in maximizer_name_list:
        maximizer_method = MAXIMIZER_METHOD_DICT[maximizer_name]
        maximizer_param = MAXIMIZER_PARAM_DICT[maximizer_name]
        # the model itself is handed to the maximizer as its first argument
        inner_maximizer = maximizer_method(self,
                                           input_dim,
                                           allow_insert_array,
                                           allow_removal_array,
                                           self.normalizer,
                                           verbose=False,
                                           **maximizer_param)
        self.inner_maximizers.append(inner_maximizer)
        self.trial_list.append(self.hp_params.trials_dict[maximizer_name])

    # record the number of malware examples in a training batch
    self.batch_size_mal = tf.Variable(0, dtype=tf.int64, trainable=False)

    super(AdversarialTrainingDNNMax,
          self).__init__(info_dict, hyper_params, reuse, self.is_saving,
                         self.init_graph, self.mode, name)
def __init__(self,
             info_dict=None,
             hyper_params=None,
             reuse=False,
             is_saving=True,
             init_graph=True,
             mode='train',
             name='ADV_TRAINING_DNN'):
    """
    hardened model incorporated with adversarial training
    @param info_dict: model information; INFO is used when None
    @param hyper_params: hyper parameters; when None, ADV_TRAIN_HP (updated
        with DNN_HP) is used. It also names the attack (maximizer) to use
    @param reuse: reuse the variables or not
    @param is_saving: option for saving weights
    @param init_graph: initialize graph
    @param mode: enable a mode for run the model, 'train' or 'test'
    @param name: model name; the upper-cased maximizer name is appended to it
    @raise ValueError: if the configured feature type is not the supported one
    """
    self.is_saving = is_saving
    self.init_graph = init_graph
    self.mode = mode

    if info_dict is None:
        info_dict = INFO
    # get useful information; per the original author's note these attributes
    # are overwritten again in another class
    self.info = utils.ParamWrapper(info_dict)
    self.feature_tp = self.info.feature_type

    if hyper_params is None:
        # NOTE: DNN_HP is merged in place into the shared ADV_TRAIN_HP dict
        ADV_TRAIN_HP.update(DNN_HP)
        hyper_params = ADV_TRAIN_HP
    # hyper_params contains information of using which attack
    self.hp_params = utils.ParamWrapper(hyper_params)

    # initialization
    # ``dict.keys()`` is a non-subscriptable view on Python 3, so the former
    # ``keys()[0]`` raised TypeError there; ``list(d)[0]`` yields the same
    # first key on both Python 2 and 3.
    if self.feature_tp == list(feature_type_scope_dict)[0]:
        self.normalizer = utils.read_pickle(
            config.get('feature.' + self.feature_tp, 'normalizer'))
    else:
        raise ValueError("Feature type is incompatible.")

    input_dim = len(
        utils.read_pickle(
            config.get('feature.' + self.feature_tp, 'vocabulary')))
    self.eta = self.hp_params.eta
    feature_reverser = DrebinFeatureReverse()
    allow_insert_array, allow_removal_array = feature_reverser.get_mod_array()

    # build the single inner maximizer selected by the hyper-parameters;
    # the model itself is handed to it as its first argument
    inner_max_name = self.hp_params.maximizer_name
    inner_max_param = MAXIMIZER_PARAM_DICT[inner_max_name]
    self.inner_maximizer = MAXIMIZER_METHOD_DICT[inner_max_name](
        self,
        input_dim,
        allow_insert_array,
        allow_removal_array,
        self.normalizer,
        verbose=False,
        **inner_max_param)

    self.batch_size_mal = tf.Variable(0, dtype=tf.int64, trainable=False)

    super(AdversarialTrainingDNN,
          self).__init__(info_dict, hyper_params, reuse, self.is_saving,
                         self.init_graph, self.mode,
                         name + '_' + inner_max_name.upper())
def load_generated_problem(problem_name):
    """
    Load a pickled generated problem and back-fill any attribute it lacks.

    The problem is read from ``GENERATED_DIR + problem_name + EXTENSION``.
    Every instance attribute of a fresh ``ManipulationProblem(None)`` that the
    loaded object does not have is copied onto it.

    @param problem_name: name of the generated problem (without directory/extension)
    @return: the loaded problem object
    """
    problem = read_pickle(GENERATED_DIR + problem_name + EXTENSION)
    # TODO - older problems (ex 4tables) don't have all the problem attributes
    defaults = vars(ManipulationProblem(None))
    for attribute, value in defaults.items():
        if hasattr(problem, attribute):
            continue
        setattr(problem, attribute, value)
    return problem