Code example #1
import os

from sklearn.preprocessing import MinMaxScaler

# `utils` and `config` are assumed to be project-local modules already in scope:
# `utils` wraps pickle I/O, `config.get(section, option)` resolves file paths.


def normalize_data(X, is_fitting=False, feature_type='drebin'):
    """Min-max scale X, fitting a new scaler or reusing a previously saved one."""
    normalizer_path = config.get('feature.' + feature_type, 'normalizer')
    if is_fitting:
        # fit a fresh scaler on X and persist it for later reuse
        normalizer = MinMaxScaler().fit(X)
        utils.dump_pickle(normalizer, normalizer_path)
    elif os.path.exists(normalizer_path):
        # not fitting: load the scaler saved during fitting
        normalizer = utils.read_pickle(normalizer_path)
    else:
        raise ValueError("Unable to find the normalizer")
    feat_normalized = normalizer.transform(X)
    return feat_normalized
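A minimal usage sketch (not from the source): assuming the 'drebin' normalizer path is configured and the inputs are NumPy feature matrices, the function is called once with is_fitting=True on training data and then without it at inference time, so test data is scaled with the training set's min/max statistics.

import numpy as np

# illustrative feature matrices
X_train = np.random.rand(100, 8)
X_test = np.random.rand(20, 8)

# fit the scaler on the training data and persist it
X_train_norm = normalize_data(X_train, is_fitting=True, feature_type='drebin')
# reuse the saved scaler, so the test data shares the training statistics
X_test_norm = normalize_data(X_test, is_fitting=False, feature_type='drebin')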
Code example #2
    def _data_preprocess(self):
        """
        feature extraction
        """
        # both the benign and the malicious APK directories are required
        if not os.path.exists(self.ben_dir) or not os.path.exists(self.mal_dir):
            logger.error("directory '{}' or '{}' has no APK data.".format(
                self.ben_dir, self.mal_dir))
            return
        try:
            label_dict = self.get_label_dict()

            data_root_dir = config.get("dataset", "dataset_root")
            feat_save_dir = os.path.join(data_root_dir, "apk_data")
            get_droid_feature(self.ben_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)
            get_droid_feature(self.mal_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)

            feature_mapping = FeatureMapping(feat_save_dir,
                                             feature_type=self.feature_tp)
            naive_features, name_list = feature_mapping.load_features()

            if len(naive_features) == 0:
                logger.error("No features extracted.")
                return

            # remove S6 (used permissions): this feature type depends on
            # feature 'S7' (APIs) and feature 'S2' (permissions)
            if not self.info.use_interdependent_features:
                naive_features = feature_mapping.remove_interdependent_featrues(
                    naive_features)

            # ground-truth labels, keyed by file name with extension stripped
            gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]]
                                 for name in name_list])

            vocab, vocab_info_dict, features = feature_mapping.generate_vocab(
                naive_features)

            # split features into training, validation, and test sets (60/20/20 overall)
            train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
                train_test_split(features, gt_label, name_list, test_size=0.2, random_state=0)
            train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
                train_test_split(train_features, train_y, train_name_list, test_size=0.25, random_state=0)

            # select frequent features (dim=10000) on the training split only
            vocab_selected, vocab_info_dict_selected = \
                feature_mapping.select_feature(train_features, train_y, vocab, vocab_info_dict, dim=10000)
            logger.info("After feature selection, the feature number is {} vs. {}".format(
                len(vocab_selected), len(vocab)))

            if self.feature_mp == 'count':
                training_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, test_features)
            elif self.feature_mp == 'binary':
                training_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, test_features)
            else:
                raise ValueError("Unsupported feature mapping type: {}".format(self.feature_mp))

            # save features and feature representations
            utils.dump_pickle(
                vocab_selected,
                config.get('feature.' + self.feature_tp, 'vocabulary'))
            utils.dump_pickle(
                vocab_info_dict_selected,
                config.get('feature.' + self.feature_tp, 'vocab_info'))
            utils.dump_joblib([
                training_feature_vectors, val_feature_vectors,
                test_feature_vectors
            ], config.get('feature.' + self.feature_tp, 'dataX'))
            utils.dump_joblib([train_y, val_y, test_y],
                              config.get('feature.' + self.feature_tp,
                                         'datay'))

            utils.write_whole_file(
                '\n'.join(train_name_list + val_name_list + test_name_list),
                config.get('dataset', 'name_list'))
        except Exception as ex:
            logger.error(str(ex))
            sys.exit(1)
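The two chained train_test_split calls above yield a 60/20/20 split: test_size=0.2 holds out 20% for testing, and test_size=0.25 then takes a quarter of the remaining 80% for validation. A standalone sketch with toy data (all names here are illustrative):

import numpy as np
from sklearn.model_selection import train_test_split

features = np.arange(100).reshape(100, 1)   # toy feature matrix
labels = np.random.randint(0, 2, size=100)  # toy binary labels
names = ['apk_{}'.format(i) for i in range(100)]

# first split: hold out 20% of the samples for testing
train_x, test_x, train_y, test_y, train_n, test_n = \
    train_test_split(features, labels, names, test_size=0.2, random_state=0)
# second split: 25% of the remaining 80% == 20% of the total, for validation
train_x, val_x, train_y, val_y, train_n, val_n = \
    train_test_split(train_x, train_y, train_n, test_size=0.25, random_state=0)

print(len(train_x), len(val_x), len(test_x))  # 60 20 20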
Code example #3
    def _data_preprocess(self):
        """
        Feature extraction variant that saves its artifacts under
        self.save_dir instead of the config-defined paths.
        """
        # both the benign and the malicious APK directories are required
        if not os.path.exists(self.ben_dir) or not os.path.exists(self.mal_dir):
            logger.error("directory '{}' or '{}' has no APK data.".format(
                self.ben_dir, self.mal_dir))
            return

        try:
            label_dict = self.get_label_dict()

            data_root_dir = config.get("dataset", "dataset_root")
            feat_save_dir = os.path.join(data_root_dir, "apk_data")
            get_droid_feature(self.ben_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)
            get_droid_feature(self.mal_dir,
                              feat_save_dir,
                              feature_type=self.feature_tp)

            feature_mapping = FeatureMapping(feat_save_dir,
                                             feature_type=self.feature_tp)
            naive_features, name_list = feature_mapping.load_features()
            gt_label = np.array([label_dict[os.path.splitext(name.strip())[0]]
                                 for name in name_list])

            if len(naive_features) == 0:
                logger.error("No features extracted.")
                return

            # remove interdependent features (S6: used permissions), which
            # depend on the 'S7' (APIs) and 'S2' (permissions) features
            if not self.info.use_interdependent_features:
                naive_features = feature_mapping.remove_interdependent_featrues(
                    naive_features)

            vocab, vocab_info_dict, feat_purified = feature_mapping.generate_vocab(
                naive_features)

            # split features into training, validation, and test sets (60/20/20 overall)
            train_features, test_features, train_y, test_y, train_name_list, test_name_list = \
                train_test_split(feat_purified, gt_label, name_list, test_size=0.2, random_state=0)
            train_features, val_features, train_y, val_y, train_name_list, val_name_list = \
                train_test_split(train_features, train_y, train_name_list, test_size=0.25, random_state=0)

            # select features (dim=10000) on the training split only
            vocab_selected, vocab_info_dict_selected = \
                feature_mapping.select_feature(train_features, train_y, vocab, vocab_info_dict, dim=10000)

            # feature preprocessing based on the feature utility rate
            if abs(self.feature_utility_rate - 1.) < 1e-10:
                pass  # rate of 1.0: keep all features unchanged
            elif 0. < self.feature_utility_rate < 1.:
                # todo
                pass
            else:
                raise ValueError("feature_utility_rate must lie in (0., 1.].")

            if self.feature_mp == 'count':
                training_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.count_feature_mapping_normalized(vocab_selected, test_features)
            else:  # default to binary feature mapping
                training_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, train_features, status='train')
                val_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, val_features)
                test_feature_vectors = \
                    feature_mapping.binary_feature_mapping_normalized(vocab_selected, test_features)

            utils.dump_pickle(vocab_selected,
                              os.path.join(self.save_dir, 'vocabulary'))
            utils.dump_pickle(vocab_info_dict_selected,
                              os.path.join(self.save_dir, 'vocab_info'))
            utils.dump_joblib([
                training_feature_vectors, val_feature_vectors,
                test_feature_vectors
            ], os.path.join(self.save_dir, 'dataX'))
            utils.dump_joblib([train_y, val_y, test_y],
                              os.path.join(self.save_dir, 'datay'))
            utils.write_whole_file('\n'.join(name_list),
                                   os.path.join(self.save_dir, 'name_list'))
        except Exception as ex:
            logger.error(str(ex))
            sys.exit(1)
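utils.dump_pickle and utils.dump_joblib are project-local helpers; assuming they are thin wrappers over the standard pickle and joblib libraries (an assumption, not confirmed by the source), the save/load round-trip for the artifacts above would look roughly like this:

import os
import pickle

import joblib
import numpy as np

save_dir = '/tmp/apk_features'  # illustrative path
os.makedirs(save_dir, exist_ok=True)

# toy stand-ins for the real artifacts
vocab_selected = ['permission::SEND_SMS', 'api::getDeviceId']
train_vec, val_vec, test_vec = np.zeros((6, 2)), np.zeros((2, 2)), np.zeros((2, 2))

# plausible equivalents of utils.dump_pickle / utils.dump_joblib
with open(os.path.join(save_dir, 'vocabulary'), 'wb') as f:
    pickle.dump(vocab_selected, f)
joblib.dump([train_vec, val_vec, test_vec], os.path.join(save_dir, 'dataX'))

# reading the artifacts back, e.g. before training
with open(os.path.join(save_dir, 'vocabulary'), 'rb') as f:
    vocab = pickle.load(f)
train_vec, val_vec, test_vec = joblib.load(os.path.join(save_dir, 'dataX'))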