def get_apk_info(self):
    """Extract APK metadata (package name, app name, version, icon,
    requested permissions) and store it on ``self`` / ``self.report_saver``.
    """
    apk = APK(self.apk_file)

    # Decode the launcher icon, resize to 256x256 and embed it as a
    # base64 PNG data URI for the HTML report.
    app_icon_file = apk.get_app_icon()
    app_icon_data = apk.get_file(app_icon_file)
    size = (256, 256)
    buffered = BytesIO()
    im = Image.open(BytesIO(app_icon_data))
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS has been the
    # equivalent (and preferred) name since Pillow 2.7.
    im = im.resize(size, Image.LANCZOS)
    im.save(buffered, "PNG")
    app_icon_b64 = "data:image/png;base64," + base64.b64encode(
        buffered.getvalue()).decode('utf-8')

    self.package_name = apk.get_package()
    self.app_name = apk.get_app_name()
    self.report_saver.package_name = self.package_name
    self.report_saver.app_name = self.app_name
    self.report_saver.version = apk.get_androidversion_code()
    self.report_saver.app_icon = app_icon_b64

    # Map the APK's requested permissions onto permission-group labels:
    # {group_label: True/False}.
    permission_parser = PermissionParser(mode='groups')
    permission_values = permission_parser.transform(
        apk.get_permissions()).flatten().tolist()
    permission_labels = permission_parser.labels()
    self.report_saver.permissions_actual = {
        permission_labels[i]: bool(v)
        for i, v in enumerate(permission_values)
    }
def __init__(self, packages, shuffle=True, at_once=False, verbose=True, batch_size=None):
    """Set up batching state and attach the sample/embedding databases.

    at_once=True serves everything as a single batch (validation mode);
    batch_size=None falls back to the configured default.
    """
    self.log = logging.getLogger()
    self.packages = packages
    self.shuffle = shuffle
    self.at_once = at_once  # for validation: no batches, calculate at once
    self.batch_size = batch_size if batch_size else config.Text2PermissionClassifier.batch_size
    self.permissions_parser = PermissionParser(mode='groups')
    self.num_permissions = self.permissions_parser.count()
    self.indexes = []
    self.db = SamplesDatabase.get()
    if verbose:
        print("Generator loaded: %d files" % len(self.packages))
    self.embedded_samples = EmbeddedSamples.get()
    self.on_epoch_end()
def __init__(self, generator, print_per_label=2):
    """Choose a random subset of generator batches to sample predictions from."""
    super().__init__()
    self.generator = generator
    self.permission_parser = PermissionParser('groups')
    # Cap at batch_size // num_classes batches, but never more than the
    # generator actually provides.
    cap = config.Text2PermissionClassifier.batch_size // self.permission_parser.count()
    num_batches = min(cap, len(self.generator))
    self.batch_indices = np.random.choice(len(self.generator), num_batches, replace=False)
    self.threshold_true = 0.65
    self.print_per_label = print_per_label
    self.db = SamplesDatabase.get()
def generate(self):
    """Run LIME-style per-token relevance over every English test-set
    description, dump the results to JSON, then print per-class test metrics.
    """
    warnings.simplefilter('ignore')
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

    # The token-count database column depends on the embedding type.
    if get_t2p_word_embedding_type() == "word2vec":
        key_description_num_tokens = "description_num_tokens_word2vec"
    else:
        key_description_num_tokens = "description_num_tokens_glove"

    package_names = self.db.filter(('lang', '==', 'en'),
                                   ('set', '==', 'test'),
                                   (key_description_num_tokens, '>=', 20))

    permission_parser = PermissionParser(mode='groups')
    model = model_multiconv_1d(permission_parser.count())
    # FIX: the metrics list previously contained fb_micro twice; use
    # fb_macro as the second metric, consistent with train().
    model.compile(loss="binary_crossentropy", optimizer='Adam',
                  metrics=[metrics.fb_micro, metrics.fb_macro,
                           metrics.precision, metrics.recall])
    model.summary()
    model.load_weights(config.TrainedModels.text2permission)

    descriptions_limed = {}
    for i, package in enumerate(package_names):
        text_raw = self.db.read(package, 'description_raw')
        tokens, tokens_heat, preds = detect_relevant_word_inputs(
            text_raw, get_predict_fn(model), permission_parser.labels())
        descriptions_limed[package] = {
            'tokens': tokens,
            'tokens_heat': tokens_heat,
            'preds': preds
        }
        if i % 30 == 0:
            print("%d%%" % (i / len(package_names) * 100))

    # FIX: use a context manager so the output file is closed
    # deterministically (the original leaked the handle).
    with open(config.Text2PermissionClassifier.test_set_lime, 'w') as fp:
        json.dump(descriptions_limed, fp)
    self.descriptions_limed = descriptions_limed

    test_generator = Generator(packages=package_names, batch_size=64)
    print_metrics = PrintPerClassMetrics(test_generator)
    print_metrics.model = model
    print_metrics.predict_batch()
def permission_class_weights(package_names):
    """Compute "balanced" per-class weights: n_samples / (n_classes * n_positive).

    Args:
        package_names: packages whose permission vectors are tallied.

    Returns:
        dict {class_index: weight} suitable for Keras ``class_weight``.
    """
    permission_parser = PermissionParser('groups')
    db = SamplesDatabase.get()
    count = np.zeros(shape=permission_parser.count())
    for p in package_names:
        count += permission_parser.transform(db.read(p, 'permissions'))
    # FIX: guard classes with zero positive samples — the original formula
    # divided by zero there, yielding inf weights that break training.
    # Non-zero counts produce exactly the same weights as before.
    safe_count = np.maximum(count, 1)
    weights = len(package_names) / (permission_parser.count() * safe_count)
    weights = {i: w for i, w in enumerate(weights)}
    return weights
def process_t2p(self):
    """Predict permission groups from ``self.text`` and store the tokens,
    per-token heats and predictions on ``self.report_saver.t2p``."""
    self.report_saver.t2p = {}
    parser = PermissionParser('groups')
    ml_model = model_multiconv_1d(parser.count())
    # Loss/metrics are irrelevant here: the model is only used for inference.
    ml_model.compile(optimizer='adam', loss='mse', metrics=['mse'])
    ml_model.load_weights(config.TrainedModels.text2permission)
    tokens, tokens_heat, predictions = detect_relevant_word_inputs(
        self.text, get_predict_fn_t2p(ml_model), parser.labels())
    self.report_saver.t2p['tokens'] = tokens
    self.report_saver.t2p['tokens_heat'] = tokens_heat
    self.report_saver.t2p['permissions_pred'] = predictions
    # Release TF/Keras graph state between reports.
    K.backend.clear_session()
def get_top_words_per_class(self, text_raw):
    """Return a compact one-line summary listing, per class, up to five
    tokens whose relevance heat exceeds 50%."""
    tokens, tokens_heat, preds = detect_relevant_word_inputs(
        text_raw.lower(), get_predict_fn(self.model),
        PermissionParser(mode='groups').labels())
    parts = []
    for class_name, heats in tokens_heat.items():
        hot = ["%s=%d%%" % (tokens[token_idx], heat)
               for token_idx, heat in heats if heat > 50][:5]
        if hot:
            parts.append("[%s] %s " % (class_name[:5], " ".join(hot)))
    return "".join(parts)
class PrintPerClassMetrics(keras.callbacks.Callback):
    '''Outputs the performance on the validation set for each target class.'''

    def __init__(self, generator):
        super().__init__()
        self.generator = generator
        self.permission_parser = PermissionParser(mode='groups')

    def on_epoch_end(self, epoch, logs=None):
        print("## VALIDATION METRICS")
        self.predict_batch()

    def predict_batch(self, print_report=True):
        """Predict every batch of the generator and emit a per-class
        F-beta report; returns the report dict, or None when empty."""
        batch_size = self.generator.batch_size
        total = len(self.generator) * batch_size
        n_classes = self.permission_parser.count()
        y_true = np.zeros(shape=(total, n_classes))
        y_pred = np.zeros(shape=(total, n_classes))
        offset = 0
        for X_batch, y_true_batch in self.generator:
            preds = np.rint(self.model.predict(X_batch))
            n = preds.shape[0]  # batch size or less (ultimate batch)
            y_true[offset:offset + n, :] = y_true_batch[:]
            y_pred[offset:offset + n, :] = preds[:]
            offset += batch_size
        if y_true.shape[0] > 0 and y_pred.shape[0] > 0:
            return class_report_fbeta(y_pred, y_true,
                                      self.permission_parser.labels(),
                                      0.5, print_output=print_report)
        return None
class Generator(keras.utils.Sequence):
    """Keras Sequence yielding (embedded-description, permission-vector) batches
    for the text-to-permission classifier."""

    def __init__(self, packages, shuffle=True, at_once=False, verbose=True, batch_size=None):
        self.log = logging.getLogger()
        # Fall back to the configured default when no batch size is given.
        self.batch_size = batch_size or config.Text2PermissionClassifier.batch_size
        self.packages = packages
        self.at_once = at_once  # for validation: no batches, calculate at once
        self.permissions_parser = PermissionParser(mode='groups')
        self.num_permissions = self.permissions_parser.count()
        self.shuffle = shuffle
        self.indexes = []
        self.db = SamplesDatabase.get()
        if verbose:
            print("Generator loaded: %d files" % len(self.packages))
        self.embedded_samples = EmbeddedSamples.get()
        self.on_epoch_end()

    def __len__(self):
        # at_once mode serves everything as one pseudo-batch.
        if self.at_once:
            return 1
        # Floor division: a trailing partial batch is dropped.
        return int(np.floor(len(self.packages) / self.batch_size))

    def __getitem__(self, index):
        X, y, _ = self.get_item_and_package(index)
        return X, y

    def get_item_and_package(self, index):
        """Like __getitem__, but additionally returns the package names of the batch."""
        if self.at_once:
            packages_temp = self.packages
        else:
            indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
            packages_temp = [self.packages[k] for k in indexes]
        X, y, metas = self.__data_generation(packages_temp, True)
        return X, y, metas

    def __data_generation(self, packages_temp, return_packages=False):
        """Build (X, y[, packages]) arrays for one batch of packages."""
        embedding_idx_unknown = PreTrainedEmbeddings.get().get_unknown_idx()
        # Initialization: X is padded with the "unknown token" embedding index.
        num_samples = len(self.packages) if self.at_once else self.batch_size
        X = np.full(
            (num_samples, config.Text2PermissionClassifier.max_description_embeddings),
            fill_value=embedding_idx_unknown, dtype=np.int32)
        y = np.empty((num_samples, self.num_permissions), dtype=np.uint8)
        packages = []
        # Generate data
        for i, package in enumerate(packages_temp):
            # NOTE(review): assumes len(embedding_indices) <= max_description_embeddings;
            # a longer description would raise a broadcast error — confirm upstream clipping.
            embedding_indices = self.embedded_samples.get_embedded_indices(
                package)
            X[i, :len(embedding_indices)] = embedding_indices
            y[i] = self.permissions_parser.transform(
                self.db.read(package, 'permissions'))
            packages.append(package)
        if return_packages:
            return X, y, packages
        else:
            return X, y

    def on_epoch_end(self):
        # Reshuffle the index permutation so each epoch sees different batches.
        self.indexes = np.arange(len(self.packages))
        if self.shuffle:
            np.random.shuffle(self.indexes)
        self.embedded_samples.on_epoch_end()
def __init__(self, generator):
    """Keep the evaluation generator and a group-mode permission parser."""
    super().__init__()
    self.permission_parser = PermissionParser(mode='groups')
    self.generator = generator
class PrintSamples(keras.callbacks.Callback):
    """Keras callback that, after each epoch, prints a table of sample
    predictions — up to ``print_per_label`` positive examples per class —
    together with the most relevant description tokens."""

    def __init__(self, generator, print_per_label=2):
        super().__init__()
        self.generator = generator
        self.permission_parser = PermissionParser('groups')
        # Number of batches to sample: bounded by batch_size // num_classes
        # and by how many batches the generator actually has.
        num_batches = min(config.Text2PermissionClassifier.batch_size//self.permission_parser.count(),
                          len(self.generator))
        self.batch_indices = np.random.choice(len(self.generator), num_batches, replace=False)
        self.threshold_true = 0.65
        self.print_per_label = print_per_label
        self.db = SamplesDatabase.get()

    def on_epoch_end(self, epoch, logs=None):
        self.predict_batch()

    def predict_batch(self):
        """Predict the sampled batches and print a table of chosen samples."""
        num_permissions = self.permission_parser.count()
        # minimum number of positive (=1) samples per label (permission) to show
        num_print_left = {i: self.print_per_label for i in range(num_permissions)}
        output_table = StringTable()
        labels = self.permission_parser.labels()
        output_table.set_headline(labels)
        for batch_idx in self.batch_indices:
            X, y, packages = self.generator.get_item_and_package(batch_idx)
            p = self.model.predict(X)
            for i, package in enumerate(packages):
                predicted = np.rint(p[i])
                real = np.rint(y[i])
                # Show this sample only while some of its positive labels
                # still have print quota left.
                print_this_sample = False
                for j in range(num_permissions):
                    if real[j] == 1 and num_print_left[j] > 0:
                        num_print_left[j] -= 1
                        print_this_sample = True
                if print_this_sample:
                    # One cell per class: "prediction--truth ok?" where "ok"
                    # marks a correctly predicted positive.
                    evals = [" %.2f--%d %s" % (p[i][k], real[k],
                                               "ok" if predicted[k] == real[k] == 1 else "")
                             for k in range(real.shape[0])]
                    output_table.add_cells(evals)
                    output_table.add_cell(package)
                    output_table.add_cell(self.db.read(package, 'title'))
                    raw_text = SamplesDatabase.get().read(package, 'description_raw')
                    output_table.add_cell(self.get_top_words_per_class(raw_text))
                    output_table.new_row()
        output_table.set_cell_length(-1, 1000)
        for row in output_table.create_table(return_rows=True):
            print(row)

    def get_top_words_per_class(self, text_raw):
        """Return a compact string of up to five tokens with >50% heat per class."""
        #split_exp = PreTrainedEmbeddings.get().get_delimiter_regex_pattern()
        tokens, tokens_heat, preds = detect_relevant_word_inputs(text_raw.lower(),
                                                                 get_predict_fn(self.model),
                                                                 PermissionParser(mode='groups').labels())
        full_str = ""
        for class_name, heats in tokens_heat.items():
            tokens_with_heats = ["%s=%d%%" % (tokens[token_idx], heat)
                                 for token_idx, heat in heats if heat > 50][:5]
            if len(tokens_with_heats) > 0:
                full_str += "[%s] %s " % (class_name[:5], " ".join(tokens_with_heats))
        return full_str
def train(verbose=True, all_folds=False):
    """Train the text-to-permission classifier with k-fold cross validation.

    When ``all_folds`` is False, returns (valid_report, test_report) after the
    first fold; otherwise trains every fold and returns the averaged
    (valid_report, test_report) dicts.
    """
    if not verbose:
        # Silence TensorFlow and absl logging noise.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
        logging.set_verbosity(logging.ERROR)
    description_embedding_tokens_type = "description_num_tokens_%s" % get_t2p_word_embedding_type()
    db = SamplesDatabase.get()
    package_names = db.filter(#('lang', '==', 'en'),
                              ('set', '==', 'train+valid'),
                              (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names)
    if verbose:
        print("packages from db with criteria: ", len(package_names))
    # NOTE(review): validation_split is used as the number of folds here — confirm
    # the config value is an integer fold count, not a fraction.
    k_fold_splitter = KFold(n_splits=config.Text2PermissionClassifier.validation_split)
    k_reports_valid = []
    k_reports_test = []
    package_names_test = db.filter(('lang', '==', 'en'),
                                   ('set', '==', 'test'),
                                   (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names_test)
    test_generator = Generator(packages=package_names_test, batch_size=128, verbose=False)
    keras.backend.clear_session()
    # NOTE(review): the model is built once and reused across folds, so weights
    # carry over from fold to fold (only compile() is repeated) — confirm this
    # is intended; per-fold reinitialization would give independent folds.
    model = model_multiconv_1d(PermissionParser(mode='groups').count())
    for fold_number, (train_index, valid_index) in enumerate(k_fold_splitter.split(package_names)):
        print("FOLD: ", fold_number+1)
        packages_train = np.array(package_names)[train_index].tolist()
        packages_valid = np.array(package_names)[valid_index].tolist()
        model.compile(loss="binary_crossentropy", optimizer=Adam(0.0001),
                      metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])
        # Early stopping / checkpointing both monitor macro F-beta on validation.
        train_metric = 'val_fb_macro'
        # keras.utils.plot_model(model, "model.png", show_shapes=True)
        if verbose and fold_number == 0:
            model.summary()
        train_generator = Generator(packages=packages_train, verbose=verbose)
        valid_generator = Generator(packages=packages_valid, batch_size=128, verbose=verbose)
        callbacks = [
            keras.callbacks.EarlyStopping(monitor=train_metric, mode='max',
                                          min_delta=config.Text2PermissionClassifier.early_stopping_delta,
                                          patience=config.Text2PermissionClassifier.early_stopping_patience,
                                          verbose=verbose),
            keras.callbacks.ModelCheckpoint(filepath=config.TrainedModels.text2permission,
                                            monitor=train_metric, mode='max',
                                            save_best_only=True, verbose=verbose)
        ]
        #if verbose:
        #    callbacks.append(PrintSamples(valid_generator, print_per_label=3))
        #    callbacks.append(PrintPerClassMetrics(valid_generator))
        model.fit_generator(train_generator,
                            epochs=config.Text2PermissionClassifier.max_train_epochs,
                            shuffle=True,
                            class_weight=permission_class_weights(packages_train),
                            validation_data=valid_generator,
                            use_multiprocessing=False,
                            verbose=verbose,
                            callbacks=callbacks
                            )
        # Restore the best checkpointed weights before evaluating.
        model.load_weights(config.TrainedModels.text2permission)
        if verbose:
            print("-" * 80)
            print("- done!")
            print("-" * 80)
            print("--- VALIDATION")
        print_metrics = PrintPerClassMetrics(valid_generator)
        print_metrics.model = model
        report_valid = print_metrics.predict_batch(print_report=verbose)
        if verbose:
            print("--- TEST")
        print_metrics = PrintPerClassMetrics(test_generator)
        print_metrics.model = model
        report_test = print_metrics.predict_batch(print_report=verbose)
        #if verbose:
        #    print_samples = PrintSamples(test_generator)
        #    print_samples.model = model
        #    print_samples.predict_batch()
        if not all_folds:
            # Single-fold mode: stop after the first fold.
            return report_valid, report_test
        else:
            k_reports_valid.append(report_valid)
            k_reports_test.append(report_test)
    del model
    # Average every report cell over all folds; missing/None cells count as 0.
    avg_reports_valid = {}
    avg_reports_test = {}
    for row in k_reports_valid[0].keys():
        for col in list(k_reports_valid[0].values())[0].keys():
            avg_reports_valid[row] = avg_reports_valid.get(row, {})
            avg_reports_valid[row][col] = mean([k_reports_valid[r].get(row, {}).get(col, 0.) or 0.
                                                for r in range(len(k_reports_valid))])
            avg_reports_test[row] = avg_reports_test.get(row, {})
            avg_reports_test[row][col] = mean([k_reports_test[r].get(row, {}).get(col, 0.) or 0.
                                               for r in range(len(k_reports_test))])
    if verbose:
        print("*" * 50)
        print(" - average over all %d folds" % k_fold_splitter.n_splits)
        print("*" * 50)
        print("VALIDATION")
        print_class_report_fbeta(avg_reports_valid)
        print()
        print("TEST")
        print_class_report_fbeta(avg_reports_test)
    return avg_reports_valid, avg_reports_test