import os

from sklearn.preprocessing import MinMaxScaler

# `utils` and `config` below are project-level helpers (pickle I/O and the
# configuration parser), assumed to be importable from this package.


def normalize_data(X, is_fitting=False, feature_type='drebin'):
    """Min-max scale X, fitting and persisting the scaler when requested."""
    normalizer_path = config.get('feature.' + feature_type, 'normalizer')
    if is_fitting:
        # Learn a new scaler on X and persist it for later reuse.
        normalizer = MinMaxScaler().fit(X)
        utils.dump_pickle(normalizer, normalizer_path)
    elif os.path.exists(normalizer_path):
        normalizer = utils.read_pickle(normalizer_path)
    else:
        raise ValueError("Unable to find the normalizer")
    return normalizer.transform(X)
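# Usage sketch (not part of the original source): fit the scaler once on the
# training matrix, then reuse the persisted normalizer for held-out data. The
# configured 'normalizer' path must be writable; the toy matrices are purely
# illustrative.
import numpy as np

X_train = np.array([[0., 2.], [1., 4.], [2., 6.]])
X_test = np.array([[1., 3.]])

X_train_norm = normalize_data(X_train, is_fitting=True)  # fits and persists the scaler
X_test_norm = normalize_data(X_test)                     # loads the persisted scaler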
def _data_preprocess(self):
    """Feature extraction: map APKs to feature vectors and persist them."""
    if (not os.path.exists(self.ben_dir)) and (not os.path.exists(self.mal_dir)):
        logger.error("directory '{}' or '{}' has no APK data.".format(
            self.ben_dir, self.mal_dir))
        return

    try:
        label_dict = self.get_label_dict()
        data_root_dir = config.get("dataset", "dataset_root")
        feat_save_dir = os.path.join(data_root_dir, "apk_data")
        get_droid_feature(self.ben_dir, feat_save_dir, feature_type=self.feature_tp)
        get_droid_feature(self.mal_dir, feat_save_dir, feature_type=self.feature_tp)
        feature_mapping = FeatureMapping(feat_save_dir, feature_type=self.feature_tp)
        naive_features, name_list = feature_mapping.load_features()

        if len(naive_features) == 0:
            logger.error("No features extracted.")
            return

        # Remove S6 (used permissions): these features depend on feature set
        # S7 (APIs) and feature set S2 (requested permissions).
        if not self.info.use_interdependent_features:
            naive_features = feature_mapping.remove_interdependent_featrues(naive_features)

        gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]]
                             for name in name_list])
        vocab, vocab_info_dict, features = feature_mapping.generate_vocab(naive_features)

        # Split into training, validation, and test sets (60%/20%/20%).
        train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
            train_test_split(features, gt_label, name_list, test_size=0.2, random_state=0)
        train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
            train_test_split(train_features, train_y, train_name_list, test_size=0.25, random_state=0)

        # Keep the most frequent features (at most 10000 dimensions).
        vocab_selected, vocab_info_dict_selected = \
            feature_mapping.select_feature(train_features, train_y, vocab, vocab_info_dict, dim=10000)
        msg = "After feature selection, the feature number is {} vs. {}".format(
            len(vocab_selected), len(vocab))
        logger.info(msg=msg)

        if self.feature_mp == 'count':
            training_feature_vectors = \
                feature_mapping.count_feature_mapping_normalized(vocab_selected, train_features, status='train')
            val_feature_vectors = \
                feature_mapping.count_feature_mapping_normalized(vocab_selected, val_features)
            test_feature_vectors = \
                feature_mapping.count_feature_mapping_normalized(vocab_selected, test_features)
        elif self.feature_mp == 'binary':
            training_feature_vectors = \
                feature_mapping.binary_feature_mapping_normalized(vocab_selected, train_features, status='train')
            val_feature_vectors = \
                feature_mapping.binary_feature_mapping_normalized(vocab_selected, val_features)
            test_feature_vectors = \
                feature_mapping.binary_feature_mapping_normalized(vocab_selected, test_features)
        else:
            raise ValueError("Unsupported feature mapping type '{}'".format(self.feature_mp))

        # Persist the vocabulary, its metadata, and the feature representations.
        utils.dump_pickle(vocab_selected,
                          config.get('feature.' + self.feature_tp, 'vocabulary'))
        utils.dump_pickle(vocab_info_dict_selected,
                          config.get('feature.' + self.feature_tp, 'vocab_info'))
        utils.dump_joblib([training_feature_vectors, val_feature_vectors, test_feature_vectors],
                          config.get('feature.' + self.feature_tp, 'dataX'))
        utils.dump_joblib([train_y, val_y, test_y],
                          config.get('feature.' + self.feature_tp, 'datay'))
        utils.write_whole_file('\n'.join(train_name_list + val_name_list + test_name_list),
                               config.get('dataset', 'name_list'))
    except Exception as ex:
        logger.error(str(ex))
        sys.exit(1)
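# A minimal, self-contained sketch (not part of the original source) showing
# why the two chained train_test_split calls above yield a 60/20/20 split:
# 20% is first held out for testing, then 25% of the remaining 80%
# (i.e. 20% of the total) is held out for validation.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape(100, 1)
y = np.arange(100) % 2

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=0.25, random_state=0)
print(len(X_tr), len(X_va), len(X_te))  # -> 60 20 20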
def _data_preprocess(self):
    """Feature extraction (variant that persists artifacts under self.save_dir)."""
    if (not os.path.exists(self.ben_dir)) and (not os.path.exists(self.mal_dir)):
        logger.error("directory '{}' or '{}' has no APK data.".format(
            self.ben_dir, self.mal_dir))
        return

    try:
        label_dict = self.get_label_dict()
        data_root_dir = config.get("dataset", "dataset_root")
        feat_save_dir = os.path.join(data_root_dir, "apk_data")
        get_droid_feature(self.ben_dir, feat_save_dir, feature_type=self.feature_tp)
        get_droid_feature(self.mal_dir, feat_save_dir, feature_type=self.feature_tp)
        feature_mapping = FeatureMapping(feat_save_dir, feature_type=self.feature_tp)
        naive_features, name_list = feature_mapping.load_features()
        gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]]
                             for name in name_list])

        if len(naive_features) == 0:
            logger.error("No features extracted.")
            return

        if not self.info.use_interdependent_features:
            naive_features = feature_mapping.remove_interdependent_featrues(naive_features)

        vocab, vocab_info_dict, feat_purified = feature_mapping.generate_vocab(naive_features)

        # Split into training, validation, and test sets (60%/20%/20%).
        train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
            train_test_split(feat_purified, gt_label, name_list, test_size=0.2, random_state=0)
        train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
            train_test_split(train_features, train_y, train_name_list, test_size=0.25, random_state=0)

        # Select features (at most 10000 dimensions).
        vocab_selected, vocab_info_dict_selected = \
            feature_mapping.select_feature(train_features, train_y, vocab, vocab_info_dict, dim=10000)

        # Feature preprocessing based on the feature utility rate.
        if abs(self.feature_utility_rate - 1.) < 1e-10:
            pass  # use all selected features
        elif 0. < self.feature_utility_rate < 1.:
            # todo
            pass
        else:
            raise ValueError("feature_utility_rate must lie in (0, 1].")

        if self.feature_mp == 'count':
            training_feature_vectors = \
                feature_mapping.count_feature_mapping_normalized(vocab_selected, train_features, status='train')
            val_feature_vectors = \
                feature_mapping.count_feature_mapping_normalized(vocab_selected, val_features)
            test_feature_vectors = \
                feature_mapping.count_feature_mapping_normalized(vocab_selected, test_features)
        else:  # fall back to binary feature mapping
            training_feature_vectors = \
                feature_mapping.binary_feature_mapping_normalized(vocab_selected, train_features, status='train')
            val_feature_vectors = \
                feature_mapping.binary_feature_mapping_normalized(vocab_selected, val_features)
            test_feature_vectors = \
                feature_mapping.binary_feature_mapping_normalized(vocab_selected, test_features)

        # Persist the vocabulary, its metadata, and the feature representations.
        utils.dump_pickle(vocab_selected, os.path.join(self.save_dir, 'vocabulary'))
        utils.dump_pickle(vocab_info_dict_selected, os.path.join(self.save_dir, 'vocab_info'))
        utils.dump_joblib([training_feature_vectors, val_feature_vectors, test_feature_vectors],
                          os.path.join(self.save_dir, 'dataX'))
        utils.dump_joblib([train_y, val_y, test_y],
                          os.path.join(self.save_dir, 'datay'))
        utils.write_whole_file('\n'.join(name_list),
                               os.path.join(self.save_dir, 'name_list'))
    except Exception as ex:  # KeyError (missing label) and other failures handled alike
        logger.error(str(ex))
        sys.exit(1)
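# Loading sketch (not part of the original source): how the persisted
# artifacts might be read back for training. `utils.read_pickle` is the
# counterpart used elsewhere in this file; reading the joblib dumps with
# joblib.load is an assumption that utils.dump_joblib writes standard
# joblib archives.
import os
import joblib


def load_preprocessed(save_dir):
    """Hypothetical helper returning the artifacts written by _data_preprocess."""
    vocab = utils.read_pickle(os.path.join(save_dir, 'vocabulary'))
    vocab_info = utils.read_pickle(os.path.join(save_dir, 'vocab_info'))
    train_X, val_X, test_X = joblib.load(os.path.join(save_dir, 'dataX'))
    train_y, val_y, test_y = joblib.load(os.path.join(save_dir, 'datay'))
    return vocab, vocab_info, (train_X, val_X, test_X), (train_y, val_y, test_y)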