Example #1
0
    def get_apk_info(self):
        """Extract basic metadata from the APK and store it on the report saver.

        Reads the package name, app name, version code, declared permissions
        and the launcher icon (re-encoded as a 256x256 base64 PNG data URI).
        """
        apk = APK(self.apk_file)
        app_icon_file = apk.get_app_icon()
        app_icon_data = apk.get_file(app_icon_file)

        # Normalize the launcher icon to a fixed size for the report UI.
        size = (256, 256)

        buffered = BytesIO()
        im = Image.open(BytesIO(app_icon_data))
        # BUG FIX: Image.ANTIALIAS was deprecated and removed in Pillow 10;
        # LANCZOS is the exact same filter under its canonical name.
        im = im.resize(size, Image.LANCZOS)
        im.save(buffered, "PNG")

        app_icon_b64 = "data:image/png;base64," + base64.b64encode(
            buffered.getvalue()).decode('utf-8')

        self.package_name = apk.get_package()
        self.app_name = apk.get_app_name()

        self.report_saver.package_name = self.package_name
        self.report_saver.app_name = self.app_name
        self.report_saver.version = apk.get_androidversion_code()
        self.report_saver.app_icon = app_icon_b64

        # Map the APK's declared permissions onto permission groups and record
        # a {group_label: present} dict for the report.
        permission_parser = PermissionParser(mode='groups')
        permission_values = permission_parser.transform(
            apk.get_permissions()).flatten().tolist()
        permission_labels = permission_parser.labels()
        self.report_saver.permissions_actual = {
            permission_labels[i]: bool(v)
            for i, v in enumerate(permission_values)
        }
Example #2
0
    def __init__(self,
                 packages,
                 shuffle=True,
                 at_once=False,
                 verbose=True,
                 batch_size=None):
        """Prepare a batch generator over the given package names.

        batch_size falls back to the configured default when not supplied;
        at_once disables batching (the whole set is produced in one call).
        """
        self.log = logging.getLogger()

        self.packages = packages
        self.shuffle = shuffle
        self.at_once = at_once  # for validation: no batches, calculate at once
        self.batch_size = batch_size if batch_size else config.Text2PermissionClassifier.batch_size

        self.permissions_parser = PermissionParser(mode='groups')
        self.num_permissions = self.permissions_parser.count()

        self.db = SamplesDatabase.get()
        self.indexes = []

        if verbose:
            print("Generator loaded: %d files" % len(self.packages))

        self.embedded_samples = EmbeddedSamples.get()
        self.on_epoch_end()
Example #3
0
 def __init__(self, generator, print_per_label=2):
     """Pick a random subset of batches to sample predictions from.

     print_per_label bounds how many positive samples are shown per label.
     """
     super().__init__()
     self.generator = generator
     self.permission_parser = PermissionParser('groups')

     # Enough batches to plausibly cover every label, capped by generator length.
     budget = config.Text2PermissionClassifier.batch_size//self.permission_parser.count()
     chosen = min(budget, len(self.generator))
     self.batch_indices = np.random.choice(len(self.generator), chosen, replace=False)

     self.threshold_true = 0.65
     self.print_per_label = print_per_label
     self.db = SamplesDatabase.get()
Example #4
0
    def generate(self):
        """Run word-relevance detection over the English test set.

        Loads the trained text2permission model, computes per-token heat maps
        for every qualifying package description, dumps the result to JSON,
        and finally prints per-class test metrics.
        """
        warnings.simplefilter('ignore')
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

        # The token-count column depends on which embedding the model was trained with.
        if get_t2p_word_embedding_type() == "word2vec":
            key_description_num_tokens = "description_num_tokens_word2vec"
        else:
            key_description_num_tokens = "description_num_tokens_glove"
        package_names = self.db.filter(('lang', '==', 'en'),
                                       ('set', '==', 'test'),
                                       (key_description_num_tokens, '>=', 20))

        permission_parser = PermissionParser(mode='groups')
        model = model_multiconv_1d(permission_parser.count())

        # BUG FIX: fb_micro was listed twice; use fb_macro as the second metric,
        # consistent with the metric set used during training.
        model.compile(loss="binary_crossentropy",
                      optimizer='Adam',
                      metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])

        model.summary()

        model.load_weights(config.TrainedModels.text2permission)

        descriptions_limed = {}

        for i, package in enumerate(package_names):
            text_raw = self.db.read(package, 'description_raw')

            tokens, tokens_heat, preds = detect_relevant_word_inputs(text_raw,
                                                                     get_predict_fn(model),
                                                                     permission_parser.labels())
            descriptions_limed[package] = {
                'tokens': tokens,
                'tokens_heat': tokens_heat,
                'preds': preds
            }

            # Coarse progress indicator (every 30 packages).
            if i % 30 == 0:
                print("%d%%" % (i/len(package_names)*100))

        # BUG FIX: the output file was opened inline and never closed; use a
        # context manager so the handle is flushed/closed deterministically.
        with open(config.Text2PermissionClassifier.test_set_lime, 'w') as fp:
            json.dump(descriptions_limed, fp)

        self.descriptions_limed = descriptions_limed

        test_generator = Generator(packages=package_names, batch_size=64)
        print_metrics = PrintPerClassMetrics(test_generator)
        print_metrics.model = model
        print_metrics.predict_batch()
Example #5
0
def permission_class_weights(package_names):
    """Compute inverse-frequency class weights for the permission groups.

    Returns a {class_index: weight} dict in the shape Keras' class_weight
    argument expects. Rare labels get larger weights.
    """
    permission_parser = PermissionParser('groups')
    db = SamplesDatabase.get()

    count = np.zeros(shape=permission_parser.count())

    for p in package_names:
        count += permission_parser.transform(db.read(p, 'permissions'))

    # BUG FIX: a label that never occurs in this split had count == 0, which
    # produced an inf weight (division by zero). Clamp counts to >= 1 so the
    # weight stays finite; such a label cannot influence training anyway.
    safe_count = np.maximum(count, 1.0)
    weights = len(package_names)/(permission_parser.count() * safe_count)
    weights = {i: w for i, w in enumerate(weights)}

    return weights
    def process_t2p(self):
        """Run the text2permission model over this report's description text.

        Stores the tokenization, per-token heat map and per-class predictions
        under self.report_saver.t2p.
        """
        self.report_saver.t2p = {}
        permission_parser = PermissionParser('groups')
        ml_model = model_multiconv_1d(permission_parser.count())

        # NOTE(review): loss/metrics look like placeholders — the model is only
        # used for inference here; weights are loaded from the trained checkpoint.
        ml_model.compile(optimizer='adam', loss='mse', metrics=['mse'])
        ml_model.load_weights(config.TrainedModels.text2permission)

        # self.text: presumably the raw app description set elsewhere on this
        # object — TODO confirm against the caller.
        tokens, tokens_heat, predictions = detect_relevant_word_inputs(
            self.text, get_predict_fn_t2p(ml_model),
            permission_parser.labels())

        self.report_saver.t2p['tokens'] = tokens
        self.report_saver.t2p['tokens_heat'] = tokens_heat
        self.report_saver.t2p['permissions_pred'] = predictions

        # Release the TF graph/session so repeated report runs don't accumulate state.
        K.backend.clear_session()
Example #7
0
    def get_top_words_per_class(self, text_raw):
        """Summarize, per permission class, up to five tokens with heat > 50%."""
        labels = PermissionParser(mode='groups').labels()
        tokens, tokens_heat, preds = detect_relevant_word_inputs(text_raw.lower(),
                                                                 get_predict_fn(self.model),
                                                                 labels)
        pieces = []
        for class_name, heats in tokens_heat.items():
            hot = ["%s=%d%%" % (tokens[token_idx], heat) for token_idx, heat in heats if heat > 50][:5]
            if hot:
                pieces.append("[%s] %s  " % (class_name[:5], " ".join(hot)))

        return "".join(pieces)
Example #8
0
class PrintPerClassMetrics(keras.callbacks.Callback):
    ''' outputs the performance of the validation set for each target class '''
    # (fixed docstring typo: "flass" -> "class")

    def __init__(self, generator):
        super().__init__()
        self.generator = generator
        self.permission_parser = PermissionParser(mode='groups')

    def on_epoch_end(self, epoch, logs=None):
        # Keras hook: report per-class metrics after every epoch.
        print("##  VALIDATION METRICS")
        self.predict_batch()

    def predict_batch(self, print_report=True):
        """Predict over the whole generator and build a per-class f-beta report.

        Returns the report dict from class_report_fbeta, or None when the
        generator yielded no samples. Predictions are binarized at 0.5.
        """
        num_samples_total = len(self.generator)*self.generator.batch_size
        num_permissions = self.permission_parser.count()
        y_true = np.zeros(shape=(num_samples_total, num_permissions))
        y_pred = np.zeros(shape=(num_samples_total, num_permissions))

        i = 0
        for X_batch, y_true_batch in self.generator:
            y_pred_batch = self.model.predict(X_batch)
            y_pred_batch = np.rint(y_pred_batch)  # round probabilities to 0/1

            n_batch_samples = y_pred_batch.shape[0]  # batch size or less (ultimate batch)

            y_true[i:i+n_batch_samples, :] = y_true_batch[:]
            y_pred[i:i+n_batch_samples, :] = y_pred_batch[:]

            i += self.generator.batch_size

        if y_true.shape[0] > 0 and y_pred.shape[0] > 0:
            report = class_report_fbeta(y_pred, y_true, self.permission_parser.labels(), 0.5, print_output=print_report)

            return report

        return None
Example #9
0
class Generator(keras.utils.Sequence):
    """Keras Sequence yielding (embedded-description, permission-vector) batches."""

    def __init__(self,
                 packages,
                 shuffle=True,
                 at_once=False,
                 verbose=True,
                 batch_size=None):
        self.log = logging.getLogger()

        # Fall back to the configured default batch size when none is given.
        self.batch_size = batch_size or config.Text2PermissionClassifier.batch_size
        self.packages = packages

        self.at_once = at_once  # for validation: no batches, calculate at once

        self.permissions_parser = PermissionParser(mode='groups')
        self.num_permissions = self.permissions_parser.count()

        self.shuffle = shuffle
        self.indexes = []
        self.db = SamplesDatabase.get()

        if verbose:
            print("Generator loaded: %d files" % len(self.packages))

        self.embedded_samples = EmbeddedSamples.get()
        self.on_epoch_end()

    def __len__(self):
        # In at_once mode everything is one "batch"; otherwise partial
        # trailing batches are dropped (floor division).
        if self.at_once:
            return 1

        return int(np.floor(len(self.packages) / self.batch_size))

    def __getitem__(self, index):
        X, y, _ = self.get_item_and_package(index)
        return X, y

    def get_item_and_package(self, index):
        """Like __getitem__ but also returns the package names of the batch."""
        if self.at_once:
            packages_temp = self.packages
        else:
            indexes = self.indexes[index * self.batch_size:(index + 1) *
                                   self.batch_size]
            packages_temp = [self.packages[k] for k in indexes]

        X, y, metas = self.__data_generation(packages_temp, True)
        return X, y, metas

    def __data_generation(self, packages_temp, return_packages=False):
        """Build the (X, y) arrays for one batch of package names."""
        embedding_idx_unknown = PreTrainedEmbeddings.get().get_unknown_idx()

        # Initialization: X is pre-filled with the "unknown token" index so
        # short descriptions are implicitly padded.
        num_samples = len(self.packages) if self.at_once else self.batch_size
        max_len = config.Text2PermissionClassifier.max_description_embeddings
        X = np.full(
            (num_samples, max_len),
            fill_value=embedding_idx_unknown,
            dtype=np.int32)
        y = np.empty((num_samples, self.num_permissions), dtype=np.uint8)
        packages = []

        # Generate data
        for i, package in enumerate(packages_temp):
            embedding_indices = self.embedded_samples.get_embedded_indices(
                package)
            # BUG FIX: a description longer than max_len previously caused a
            # broadcast error on the assignment below; truncate defensively.
            embedding_indices = embedding_indices[:max_len]
            X[i, :len(embedding_indices)] = embedding_indices
            y[i] = self.permissions_parser.transform(
                self.db.read(package, 'permissions'))
            packages.append(package)

        if return_packages:
            return X, y, packages
        else:
            return X, y

    def on_epoch_end(self):
        # Reshuffle the sample order between epochs when requested.
        self.indexes = np.arange(len(self.packages))
        if self.shuffle:
            np.random.shuffle(self.indexes)
        self.embedded_samples.on_epoch_end()
Example #10
0
 def __init__(self, generator):
     """Keep a reference to the data generator and a group-mode permission parser."""
     super().__init__()
     self.permission_parser = PermissionParser(mode='groups')
     self.generator = generator
Example #11
0
class PrintSamples(keras.callbacks.Callback):
    """Keras callback that prints a table of example predictions per label.

    After each epoch it samples a few random batches and shows, for up to
    print_per_label positive samples per permission group, the predicted
    probability vs. the ground truth plus the hottest description tokens.
    """

    def __init__(self, generator, print_per_label=2):
        super().__init__()
        self.generator = generator
        self.permission_parser = PermissionParser('groups')
        # Enough batches to plausibly cover each label, capped by generator length.
        num_batches = min(config.Text2PermissionClassifier.batch_size//self.permission_parser.count(),
                          len(self.generator))
        self.batch_indices = np.random.choice(len(self.generator), num_batches, replace=False)
        self.threshold_true = 0.65
        self.print_per_label = print_per_label
        self.db = SamplesDatabase.get()

    def on_epoch_end(self, epoch, logs=None):
        # Keras hook: dump example predictions after every epoch.
        self.predict_batch()

    def predict_batch(self):
        """Sample the pre-chosen batches and print a per-label examples table."""
        num_permissions = self.permission_parser.count()

        # minimum number of positive (=1) samples per label (permission) to show

        num_print_left = {i: self.print_per_label for i in range(num_permissions)}

        output_table = StringTable()
        labels = self.permission_parser.labels()
        output_table.set_headline(labels)

        for batch_idx in self.batch_indices:
            X, y, packages = self.generator.get_item_and_package(batch_idx)
            p = self.model.predict(X)

            for i, package in enumerate(packages):
                predicted = np.rint(p[i])
                real = np.rint(y[i])

                # Show this sample only if it is a positive example for a label
                # whose quota is not yet exhausted; a shown sample may consume
                # quota from several labels at once.
                print_this_sample = False

                for j in range(num_permissions):
                    if real[j] == 1 and num_print_left[j] > 0:
                        num_print_left[j] -= 1
                        print_this_sample = True

                if print_this_sample:
                    # One cell per label: "prob--truth ok" (ok only on true positives).
                    evals = [" %.2f--%d %s" %
                             (p[i][k], real[k], "ok" if predicted[k] == real[k] == 1 else "")
                             for k in range(real.shape[0])]
                    output_table.add_cells(evals)

                    output_table.add_cell(package)
                    output_table.add_cell(self.db.read(package, 'title'))

                    raw_text = SamplesDatabase.get().read(package, 'description_raw')
                    output_table.add_cell(self.get_top_words_per_class(raw_text))

                    output_table.new_row()

        # Widen the last column so the token summary isn't truncated.
        output_table.set_cell_length(-1, 1000)
        for row in output_table.create_table(return_rows=True):
            print(row)

    def get_top_words_per_class(self, text_raw):
        """Return a compact string of up to five tokens with heat > 50% per class."""
        #split_exp = PreTrainedEmbeddings.get().get_delimiter_regex_pattern()
        tokens, tokens_heat, preds = detect_relevant_word_inputs(text_raw.lower(),
                                                                 get_predict_fn(self.model),
                                                                 PermissionParser(mode='groups').labels())
        full_str = ""
        for class_name, heats in tokens_heat.items():
            # heats is iterated as (token_index, heat) pairs — heat appears to
            # be a percentage (compared against 50, printed with %d%%).
            tokens_with_heats = ["%s=%d%%" % (tokens[token_idx], heat) for token_idx, heat in heats if heat > 50][:5]
            if len(tokens_with_heats) > 0:
                full_str += "[%s] %s  " % (class_name[:5], " ".join(tokens_with_heats))

        return full_str
Example #12
0
def train(verbose=True, all_folds=False):
    """Train the text2permission classifier with k-fold cross-validation.

    When all_folds is False, returns the (validation, test) reports of the
    first fold; otherwise trains every fold and returns reports averaged
    across folds. verbose controls TF logging, summaries and printed reports.
    """
    if not verbose:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
        logging.set_verbosity(logging.ERROR)

    # Token-count column name depends on the embedding type the model uses.
    description_embedding_tokens_type = "description_num_tokens_%s" % get_t2p_word_embedding_type()

    db = SamplesDatabase.get()
    package_names = db.filter(#('lang', '==', 'en'),
                              ('set', '==', 'train+valid'),
                              (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names)

    if verbose:
        print("packages from db with criteria: ", len(package_names))

    k_fold_splitter = KFold(n_splits=config.Text2PermissionClassifier.validation_split)

    k_reports_valid = []
    k_reports_test = []

    # Fixed held-out test set, shared across all folds.
    package_names_test = db.filter(('lang', '==', 'en'),
                                   ('set', '==', 'test'),
                                   (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names_test)
    test_generator = Generator(packages=package_names_test, batch_size=128, verbose=False)

    for fold_number, (train_index, valid_index) in enumerate(k_fold_splitter.split(package_names)):
        print("FOLD:                 ", fold_number+1)

        # BUG FIX: the model used to be built once BEFORE this loop, so every
        # fold after the first resumed from the previous fold's trained weights
        # and the cross-validation scores were not independent. Rebuild a fresh
        # model (and clear the TF session) at the start of each fold.
        keras.backend.clear_session()
        model = model_multiconv_1d(PermissionParser(mode='groups').count())

        packages_train = np.array(package_names)[train_index].tolist()
        packages_valid = np.array(package_names)[valid_index].tolist()

        model.compile(loss="binary_crossentropy",
                      optimizer=Adam(0.0001),
                      metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])
        train_metric = 'val_fb_macro'

        if verbose and fold_number == 0:
            model.summary()

        train_generator = Generator(packages=packages_train, verbose=verbose)
        valid_generator = Generator(packages=packages_valid, batch_size=128, verbose=verbose)

        # Early stopping + checkpointing on the macro f-beta of the validation set.
        callbacks = [
            keras.callbacks.EarlyStopping(monitor=train_metric,
                                          mode='max',
                                          min_delta=config.Text2PermissionClassifier.early_stopping_delta,
                                          patience=config.Text2PermissionClassifier.early_stopping_patience,
                                          verbose=verbose),
            keras.callbacks.ModelCheckpoint(filepath=config.TrainedModels.text2permission,
                                            monitor=train_metric,
                                            mode='max',
                                            save_best_only=True,
                                            verbose=verbose)
        ]

        model.fit_generator(train_generator,
                            epochs=config.Text2PermissionClassifier.max_train_epochs,
                            shuffle=True,
                            class_weight=permission_class_weights(packages_train),
                            validation_data=valid_generator,
                            use_multiprocessing=False,
                            verbose=verbose,
                            callbacks=callbacks
                            )

        # Restore the best checkpoint (not the last epoch) before evaluation.
        model.load_weights(config.TrainedModels.text2permission)

        if verbose:
            print("-" * 80)
            print("-  done!")
            print("-" * 80)

            print("--- VALIDATION")

        print_metrics = PrintPerClassMetrics(valid_generator)
        print_metrics.model = model
        report_valid = print_metrics.predict_batch(print_report=verbose)

        if verbose:
            print("--- TEST")

        print_metrics = PrintPerClassMetrics(test_generator)
        print_metrics.model = model
        report_test = print_metrics.predict_batch(print_report=verbose)

        if not all_folds:
            return report_valid, report_test
        else:
            k_reports_valid.append(report_valid)
            k_reports_test.append(report_test)

    del model

    # Average every report cell over the folds (missing/None cells count as 0).
    avg_reports_valid = {}
    avg_reports_test = {}

    for row in k_reports_valid[0].keys():
        for col in list(k_reports_valid[0].values())[0].keys():
            avg_reports_valid[row] = avg_reports_valid.get(row, {})
            avg_reports_valid[row][col] = mean([k_reports_valid[r].get(row, {}).get(col, 0.) or 0. for r in range(len(k_reports_valid))])
            avg_reports_test[row] = avg_reports_test.get(row, {})
            avg_reports_test[row][col] = mean([k_reports_test[r].get(row, {}).get(col, 0.) or 0. for r in range(len(k_reports_test))])

    if verbose:
        print("*" * 50)
        print(" - average over all %d folds" % k_fold_splitter.n_splits)
        print("*" * 50)
        print("VALIDATION")
        print_class_report_fbeta(avg_reports_valid)
        print()
        print("TEST")
        print_class_report_fbeta(avg_reports_test)

    return avg_reports_valid, avg_reports_test