def get_relevant_files(ext):
    """Return English-language package names plus matching document file names.

    :param ext: file extension (without dot) appended to each package name
    :return: tuple (package_names, document_file_names), index-aligned
    """
    # Point the singleton at the pickled database before querying it.
    SamplesDatabase.set_file('samples_database.pk')
    database = SamplesDatabase.get()
    package_names = database.filter(('lang', '==', 'en'))
    document_names = []
    for package in package_names:
        document_names.append("%s.%s" % (package, ext))
    return package_names, document_names
def from_samples_db():
    """Run the test-set evaluator on all 'test' packages not already covered
    by the hand-picked list below."""
    candidates = SamplesDatabase.get().filter(('set', '==', 'test'))
    # Packages that were already evaluated in a previous (manual) run; skip them.
    already_processed = [
        "com.dl.photo.loveframes",
        "mp3.tube.pro.free",
        "de.stohelit.folderplayer",
        "com.avg.cleaner",
        "com.gau.go.launcherex.gowidget.gopowermaster",
        "ru.mail",
        "de.shapeservices.impluslite",
        "kik.android",
        "com.hrs.b2c.android",
        "com.nqmobile.antivirus20.multilang",
        "com.yellowbook.android2",
        "com.antivirus.tablet",
        "taxi.android.client",
        "com.qihoo.security",
        "com.jb.gokeyboard.plugin.emoji",
        "com.niksoftware.snapseed",
        "com.forshared.music",
        "mobi.infolife.eraser",
        "com.hulu.plus",
        "com.vevo",
        "com.mobisystems.office",
        "com.whatsapp",
        "com.dropbox.android",
        "com.yahoo.mobile.client.android.yahoo",
        "com.jessdev.hdcameras",
        "com.slacker.radio",
        "com.jb.mms.theme.springtime",
        "ru.zdevs.zarchiver",
        "com.newsoftwares.folderlock_v1"
    ]
    remaining = list(set(candidates).difference(already_processed))
    random.shuffle(remaining)
    evaluator = TestsetEvaluator(report_folder="data/reports",
                                 package_names=remaining)
    evaluator.run()
def print_sample_with_prediction(result, package_name):
    """Pretty-print the actual vs. predicted top TF-IDF description terms
    for one app out of a `test_predictions_for_submodel`-style result dict.

    :param result: dict with index-aligned keys 'package_names', 'metrics',
        'y' and 'predictions'
    :param package_name: package to display; must be in result['package_names']
    """
    idx = result['package_names'].index(package_name)
    # Fix: close the pickle file handle (was pickle.load(open(...)) — leaked).
    with open(config.TFIDFModels.description_model_2, 'rb') as fh:
        model = pickle.load(fh)
    print(SamplesDatabase.get().read(package_name, 'title'))
    # metrics[idx][0] is presumably the loss; only the metric values are shown — TODO confirm
    print("  ", list(map(lambda v: "%d%%" % int(100 * v), result['metrics'][idx][1:])))
    # indices of the 10 largest entries (descending) in each vector
    top_actual_desc_indices = np.argsort(-np.array(result['y'][idx]))[:10].tolist()
    top_pred_desc_indices = np.argsort(-np.array(result['predictions'][idx]))[:10].tolist()
    tokens_actual = [model.get_feature_names()[i] for i in top_actual_desc_indices]
    values_actual = [result['y'][idx][i] for i in top_actual_desc_indices]
    tokens_predicted = [model.get_feature_names()[i] for i in top_pred_desc_indices]
    values_predicted = [result['predictions'][idx][i] for i in top_pred_desc_indices]
    print(" actual:")
    for token, value in zip(tokens_actual, values_actual):
        # only show terms above the discretization threshold
        if value >= metrics._tfidf_discretize_threshold_descriptions:
            print("   %.2f %s" % (value, token))
    print(" predicted:")
    for token, value in zip(tokens_predicted, values_predicted):
        if value >= metrics._tfidf_discretize_threshold_descriptions:
            print("   %.2f %s" % (value, token))
    print()
def __init__(self, packages, shuffle=True, at_once=False, verbose=True, batch_size=None):
    """Batch generator over sample packages.

    :param packages: package names the generator iterates over
    :param shuffle: whether on_epoch_end reshuffles the order
    :param at_once: for validation: no batches, calculate at once
    :param verbose: print a short load summary
    :param batch_size: overrides config.Text2PermissionClassifier.batch_size
    """
    self.log = logging.getLogger()
    self.packages = packages
    self.shuffle = shuffle
    self.at_once = at_once
    self.batch_size = batch_size or config.Text2PermissionClassifier.batch_size
    self.permissions_parser = PermissionParser(mode='groups')
    self.num_permissions = self.permissions_parser.count()
    self.indexes = []
    self.db = SamplesDatabase.get()
    if verbose:
        print("Generator loaded: %d files" % len(self.packages))
    self.embedded_samples = EmbeddedSamples.get()
    # build the initial index order
    self.on_epoch_end()
def set_app_info(self, package_name):
    """Set app metadata for a package: title comes from the samples database,
    version and icon are cleared.

    :raises RuntimeError: if package_name is None
    """
    database = SamplesDatabase.get()
    if package_name is None:
        raise RuntimeError("no package name!")
    self.package_name = package_name
    self.app_name = database.read(package_name, 'title')
    self.version = ""
    self.app_icon = None
def get_fallback_app_info(self):
    """Fill the report saver with fallback metadata (DB title, empty
    version, no icon) for the current package.

    :raises RuntimeError: if self.package_name is not set
    """
    database = SamplesDatabase.get()
    if self.package_name is None:
        raise RuntimeError("no package name!")
    title = database.read(self.package_name, 'title')
    self.app_name = title
    saver = self.report_saver
    saver.package_name = self.package_name
    saver.app_name = title
    saver.version = ""
    saver.app_icon = None
def __init__(self, generator, print_per_label=2):
    """Callback that prints sample predictions from a few random batches.

    :param generator: batch generator to draw samples from
    :param print_per_label: positive samples to print per permission class
    """
    super().__init__()
    self.generator = generator
    self.permission_parser = PermissionParser('groups')
    self.print_per_label = print_per_label
    # NOTE(review): presumably the probability above which a prediction
    # counts as positive; used outside this method — confirm.
    self.threshold_true = 0.65
    # Sample enough random batches to cover about one batch worth of
    # predictions per permission class (capped at the generator length).
    total_batches = len(self.generator)
    wanted = config.Text2PermissionClassifier.batch_size // self.permission_parser.count()
    self.batch_indices = np.random.choice(total_batches,
                                          min(wanted, total_batches),
                                          replace=False)
    self.db = SamplesDatabase.get()
def permission_class_weights(package_names):
    """Compute inverse-frequency class weights for permission groups.

    Uses the usual n_samples / (n_classes * class_count) balancing scheme.

    :param package_names: packages whose permissions are counted
    :return: dict {class_index: weight}
    """
    parser = PermissionParser('groups')
    database = SamplesDatabase.get()
    num_classes = parser.count()
    totals = np.zeros(shape=num_classes)
    for package in package_names:
        totals += parser.transform(database.read(package, 'permissions'))
    # NOTE(review): a class that never occurs yields a division by zero here
    # (inf weight) — confirm that cannot happen for this dataset.
    balanced = len(package_names) / (num_classes * totals)
    return dict(enumerate(balanced))
def test_predictions_for_submodel(config_class, tfidf_data, tfidf_model, model_save_file):
    """Evaluate one app2text sub-model on the English test set.

    :param config_class: model config class (provides learning_rate, name)
    :param tfidf_data: path to the TF-IDF input data file
    :param tfidf_model: path to the TF-IDF input model file
    :param model_save_file: path to the trained model weights
    :return: dict with index-aligned lists 'X', 'y', 'package_names',
        'metrics' (per-sample evaluation) and 'predictions'
    """
    banner = "-" * 50
    print(banner)
    print("- ", config_class.__name__)
    print(banner)
    m.model_config_class = config_class
    db = SamplesDatabase.get()
    test_packages = db.filter(('lang', '==', 'en'),
                              ('set', '==', 'test'),
                              ('description_raw', 'len>', 100),
                              ('cross_platform', '==', False))
    keras.backend.clear_session()
    generator = TFIDFGenerator(tfidf_input_model_file=tfidf_model,
                               tfidf_input_data_file=tfidf_data,
                               package_names=test_packages,
                               batch_size=128,
                               verbose=True)
    model = model_for_tf_idfs(generator.get_num_inputs(),
                              generator.get_num_outputs())
    model.compile(optimizer=Adam(lr=config_class.learning_rate, amsgrad=True),
                  loss='mse',
                  metrics=get_metrics_for_a2p_model())
    model.load_weights(model_save_file)
    # aggregate evaluation first, then per-sample details below
    eval_values = model.evaluate_generator(generator, verbose=False)
    train_summary(eval_values, model, generator)
    collected = {'X': [], 'y': [], 'package_names': [], 'metrics': [], 'predictions': []}
    for batch_index in range(len(generator)):
        X, y, names = generator.get_batch_and_details(batch_index, get_packages=True)
        # evaluate each sample individually to get per-app metrics
        per_sample = []
        for i in range(X.shape[0]):
            per_sample.append(model.evaluate(X[i:i + 1], y[i:i + 1], verbose=False))
        predictions = model.predict_on_batch(X)
        collected['X'] += X.tolist()
        collected['y'] += y.tolist()
        collected['package_names'] += names
        collected['metrics'] += per_sample
        collected['predictions'] += predictions.tolist()
    return collected
def test():
    """Smoke-test the embedding tokenization round-trip on one description."""
    from verifier.preprocessing.samples_database import SamplesDatabase
    embeddings = PreTrainedEmbeddings.get()
    pattern = embeddings.get_delimiter_regex_pattern()
    database = SamplesDatabase.get()
    description = database.read('com.whatsapp', 'description_raw')
    tokens = re.split(pattern, description, flags=re.IGNORECASE)
    print(description)
    print("|".join(tokens))
    # map tokens to embedding indices and back to words to inspect coverage
    indices = embeddings.tokens_to_indices(tokens)
    recovered = [embeddings.index2word(token_id) for token_id in indices]
    print(" ".join(recovered))
def train(verbose=True, reduced_dataset=False):
    """Train every enabled TF-IDF classifier (strings, resource ids, methods).

    :param verbose: print progress; when False, also silence TensorFlow logs
    :param reduced_dataset: deterministically subsample 11000 training apps
    :return: performance of the last classifier trained (empty list if none
        is enabled)
    """
    if not verbose:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
        logging.set_verbosity(logging.ERROR)
    db = SamplesDatabase.get()
    package_names = db.filter(
        ('lang', '==', 'en'),
        ('set', '==', 'train+valid'),
        ('description_raw', 'len>', 100),
        ('cross_platform', '==', False))
    if reduced_dataset:
        # fixed seed so the reduced subset is reproducible, then restore
        # nondeterministic shuffling for everything after
        random.seed(42)
        random.shuffle(package_names)
        package_names = package_names[:11000]
        random.seed(None)
    else:
        random.shuffle(package_names)
    if verbose:
        print("# apps for training (db): ", len(package_names))
    performance = []
    # (config class, tfidf data file, tfidf model file, trained model file)
    model_plans = [
        (config.TFIDFClassifier.ModelStrings,
         config.TFIDFModels.code_stringres_data,
         config.TFIDFModels.code_stringres_model,
         config.TrainedModels.app2text_stringres),
        (config.TFIDFClassifier.ModelResourceIds,
         config.TFIDFModels.code_ids_data,
         config.TFIDFModels.code_ids_model,
         config.TrainedModels.app2text_ids),
        (config.TFIDFClassifier.ModelMethods,
         config.TFIDFModels.code_methods_data,
         config.TFIDFModels.code_methods_model,
         config.TrainedModels.app2text_methods),
    ]
    for model_config, data_file, model_file, trained_file in model_plans:
        if model_config.enabled:
            # NOTE(review): each enabled classifier overwrites `performance`,
            # so only the last one's result is returned — confirm intended.
            performance = train_tfidf_classifier(
                model_config, package_names, data_file, model_file,
                trained_file, verbose)
    return performance
def print_sample_stats():
    """Print corpus composition statistics from the samples database."""
    db = SamplesDatabase.get()
    total = len(db.read(None, "title"))

    def _pct(count):
        # share of the whole corpus, in percent
        return 100 * count / total

    crossplatform = len(db.filter(('cross_platform', '==', True)))
    en = len(db.filter(('lang', '==', 'en'), ('description_raw', 'len>', 30)))
    downloads = len(
        db.filter(
            ('downloads', '>=', config.Clustering.min_downloads_visualize)))
    print("# total = ", total)
    print("# cross platform = ", crossplatform, " ", _pct(crossplatform), "%")
    print("# en > 30 = ", en, " ", _pct(en), "%")
    print("# downloads >= 4e6 = ", downloads, " ", _pct(downloads), "%")
def create():
    """Fit the description TF-IDF model on all English samples and pickle
    both the model (with un-stemmed vocabulary) and the transformed data.

    Side effects: writes config.TFIDFModels.description_model_2 and
    config.TFIDFModels.description_data_2.
    """
    # SamplesDatabase.set_file('samples_database.pk')
    db = SamplesDatabase.get()
    packages = db.filter(('lang', '==', 'en'))  # [:20000]
    print("creating model")
    # document-frequency cutoffs expressed as fractions of the corpus size
    min_df_pct = 0.002
    max_df_pct = 0.4
    min_df = int(len(packages) * min_df_pct)
    max_df = int(len(packages) * max_df_pct)
    # must be enabled before fitting: the tokenizer records stem -> word mappings
    UnStemmer.enabled = True
    tfidf_model = TfidfVectorizer(
        tokenizer=meta_data_description_tokenize,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, 3),
        lowercase=False,  # done in tokenize function
        stop_words=get_stopwords_list())
    tfidf_model.fit(packages)
    u = UnStemmer.get()
    print("transforming data")
    tfidf_data = tfidf_model.transform(packages)
    print("saving model: ", config.TFIDFModels.description_model_2)
    # map each (possibly multi-word) stemmed vocabulary entry back to readable words
    tfidf_model.vocabulary_ = {
        " ".join(map(lambda x: u.resolve(x), k.split(" "))): v
        for k, v in tfidf_model.vocabulary_.items()
    }
    # Fix: use context managers — the original pickle.dump(..., open(...))
    # calls leaked the file handles.
    with open(config.TFIDFModels.description_model_2, "wb") as fh:
        pickle.dump(tfidf_model, fh)
    print("saved")
    print("saving data: ", config.TFIDFModels.description_data_2)
    save_data = {'ids': packages, 'data': tfidf_data}
    with open(config.TFIDFModels.description_data_2, "wb") as fh:
        pickle.dump(save_data, fh)
    print("saved")
def predict_batch(self):
    """Print a table comparing predicted vs. actual permission labels for a
    sample of apps drawn from the pre-selected random batches.

    Shows up to self.print_per_label positive samples per permission class;
    each row lists per-class "prob--actual" cells, the package name, its
    title, and the top description words per class.
    """
    num_permissions = self.permission_parser.count()
    # minimum number of positive (=1) samples per label (permission) to show
    num_print_left = {i: self.print_per_label for i in range(num_permissions)}
    output_table = StringTable()
    labels = self.permission_parser.labels()
    output_table.set_headline(labels)
    for batch_idx in self.batch_indices:
        X, y, packages = self.generator.get_item_and_package(batch_idx)
        p = self.model.predict(X)
        for i, package in enumerate(packages):
            predicted = np.rint(p[i])
            real = np.rint(y[i])
            # show this sample if any of its positive labels still has quota
            print_this_sample = False
            for j in range(num_permissions):
                if real[j] == 1 and num_print_left[j] > 0:
                    num_print_left[j] -= 1
                    print_this_sample = True
            if print_this_sample:
                evals = ["  %.2f--%d %s" % (p[i][k], real[k],
                                            "ok" if predicted[k] == real[k] == 1 else "")
                         for k in range(real.shape[0])]
                output_table.add_cells(evals)
                output_table.add_cell(package)
                output_table.add_cell(self.db.read(package, 'title'))
                # Fix: use the cached self.db handle (set in __init__) instead
                # of re-fetching the SamplesDatabase singleton.
                raw_text = self.db.read(package, 'description_raw')
                output_table.add_cell(self.get_top_words_per_class(raw_text))
                output_table.new_row()
    output_table.set_cell_length(-1, 1000)
    for row in output_table.create_table(return_rows=True):
        print(row)
def meta_data_description_tokenize(package_name):
    """Tokenize an app's raw store description (HTML entities unescaped).

    :param package_name: package whose description is read from the DB
    :return: list of tokens produced by tokenize_text
    """
    raw_description = SamplesDatabase.get().read(package_name, 'description_raw')
    return tokenize_text(html.unescape(raw_description))
def process_a2t_input(self, input_mode):
    """Run the app2text model for one input modality: predict description
    TF-IDF vectors for every package, print the top predicted words with
    their input influences, and persist everything into per-app reports.

    :param input_mode: which code-derived input to use; one of 'methods',
        'ids', 'strings'
    :raises RuntimeError: for any other input_mode value
    """
    print(input_mode)
    # Select the TF-IDF input model/data files and the trained ML weights
    # matching the requested modality.
    if input_mode == 'methods':
        model.model_config_class = config.TFIDFClassifier.ModelMethods
        tfidf_model_file = config.TFIDFModels.code_methods_model
        tfidf_data_file = config.TFIDFModels.code_methods_data
        ml_model_file = config.TrainedModels.app2text_methods
    elif input_mode == 'ids':
        model.model_config_class = config.TFIDFClassifier.ModelResourceIds
        tfidf_model_file = config.TFIDFModels.code_ids_model
        tfidf_data_file = config.TFIDFModels.code_ids_data
        ml_model_file = config.TrainedModels.app2text_ids
    elif input_mode == 'strings':
        model.model_config_class = config.TFIDFClassifier.ModelStrings
        tfidf_model_file = config.TFIDFModels.code_stringres_model
        tfidf_data_file = config.TFIDFModels.code_stringres_data
        ml_model_file = config.TrainedModels.app2text_stringres
    else:
        raise RuntimeError("unknown input mode")
    # batch_size=1 and shuffle=False keep predictions aligned with
    # test_generator.package_names below.
    test_generator = TFIDFGenerator(
        tfidf_input_data_file=tfidf_data_file,
        tfidf_input_model_file=tfidf_model_file,
        package_names=self.package_names,
        batch_size=1,
        shuffle=False,
        verbose=True)
    print("loading ML model")
    K.backend.clear_session()
    ml_model = model_for_tf_idfs(test_generator.get_num_inputs(),
                                 test_generator.get_num_outputs())
    ml_model.compile(optimizer='adam', loss='mse', metrics=['mse'])
    ml_model.load_weights(ml_model_file)
    # tensorflow.get_default_graph().finalize()
    print("predicting")
    y = ml_model.predict_generator(test_generator)
    for i in range(y.shape[0]):
        package_name = test_generator.package_names[i]
        descriptions_tfidf_model = test_generator.descriptions['model']
        input_tfidf_model = test_generator.inputs
        # the input TF-IDF row for this package, located by its doc id
        input_tfidf_vector = test_generator.inputs.tfidf_data[
            test_generator.inputs.doc_ids.index(package_name)]
        # top predicted description terms with their scores
        output_words = get_top_terms(y[i], descriptions_tfidf_model,
                                     top_k=self.n_top_terms)
        words_pred = {word: float(value) for word, value in output_words}
        # indices of the highest-scoring predicted terms, descending
        descriptions_tfidf_indices = np.argsort(
            -y[i]).flatten().tolist()[:self.n_top_terms]
        # which input features contributed to each predicted term
        influences = get_input_influences(input_tfidf_model,
                                          input_tfidf_vector,
                                          descriptions_tfidf_model,
                                          descriptions_tfidf_indices,
                                          ml_model)
        progress_pct = "%3d%% " % ((100 * (i + 1)) // y.shape[0])
        print("-" * 80)
        print(progress_pct, package_name)
        print("-" * 80)
        if len(influences) > 0:
            # print the 7 strongest terms, each with its 7 strongest influences
            for word, score in output_words[:7]:
                print("[%.3f] %s" % (score, word))
                if not word in influences:
                    continue
                for influence, score2 in influences[word][:7]:
                    print("    [%.5f] %s" % (score2, influence))
            print()
        # Persist predictions into the per-app report; existing report data
        # for other modalities is preserved via dict.get defaults.
        report_saver = ReportSaver(report_folder=self.report_folder)
        report_saver.set_app_info(package_name)
        report_saver.a2t['words_pred'] = report_saver.a2t.get(
            'words_pred', {})
        report_saver.a2t['words_pred'][input_mode] = words_pred
        report_saver.a2t['input_values'] = report_saver.a2t.get(
            'input_values', {})
        report_saver.a2t['input_values'][input_mode] = influences
        report_saver.a2t['text_actual'] = SamplesDatabase.get().read(
            package_name, "description_raw")
        report_saver.save()
    K.backend.clear_session()
def get_fallback_description(self):
    """Load the raw store description from the samples DB as fallback text."""
    database = SamplesDatabase.get()
    self.text = database.read(self.package_name, 'description_raw')
def train(verbose=True, all_folds=False):
    """Train the text2permission classifier with k-fold cross-validation.

    :param verbose: print progress; when False, also silence TensorFlow logs
    :param all_folds: when False, train a single fold and return its
        (validation, test) reports; when True, train every fold and return
        per-class metrics averaged over all folds
    :return: tuple (validation report, test report) — single-fold or
        fold-averaged depending on all_folds
    """
    if not verbose:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
        logging.set_verbosity(logging.ERROR)
    # column name in the DB holding the token count for the configured embedding
    description_embedding_tokens_type = "description_num_tokens_%s" % get_t2p_word_embedding_type()
    db = SamplesDatabase.get()
    # require at least 20 embedded tokens per description
    package_names = db.filter(#('lang', '==', 'en'),
                              ('set', '==', 'train+valid'),
                              (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names)
    if verbose:
        print("packages from db with criteria: ", len(package_names))
    k_fold_splitter = KFold(n_splits=config.Text2PermissionClassifier.validation_split)
    k_reports_valid = []
    k_reports_test = []
    # held-out test set, shared by every fold
    package_names_test = db.filter(('lang', '==', 'en'),
                                   ('set', '==', 'test'),
                                   (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names_test)
    test_generator = Generator(packages=package_names_test, batch_size=128, verbose=False)
    keras.backend.clear_session()
    # NOTE(review): the model is built once and reused across folds — weights
    # are not re-initialized per fold; confirm this is intended.
    model = model_multiconv_1d(PermissionParser(mode='groups').count())
    for fold_number, (train_index, valid_index) in enumerate(k_fold_splitter.split(package_names)):
        print("FOLD: ", fold_number+1)
        packages_train = np.array(package_names)[train_index].tolist()
        packages_valid = np.array(package_names)[valid_index].tolist()
        model.compile(loss="binary_crossentropy",
                      optimizer=Adam(0.0001),
                      metrics=[metrics.fb_micro, metrics.fb_macro,
                               metrics.precision, metrics.recall])
        # metric monitored by early stopping and checkpointing
        train_metric = 'val_fb_macro'
        # keras.utils.plot_model(model, "model.png", show_shapes=True)
        if verbose and fold_number == 0:
            model.summary()
        train_generator = Generator(packages=packages_train, verbose=verbose)
        valid_generator = Generator(packages=packages_valid, batch_size=128, verbose=verbose)
        callbacks = [
            keras.callbacks.EarlyStopping(monitor=train_metric,
                                          mode='max',
                                          min_delta=config.Text2PermissionClassifier.early_stopping_delta,
                                          patience=config.Text2PermissionClassifier.early_stopping_patience,
                                          verbose=verbose),
            keras.callbacks.ModelCheckpoint(filepath=config.TrainedModels.text2permission,
                                            monitor=train_metric,
                                            mode='max',
                                            save_best_only=True,
                                            verbose=verbose)
        ]
        #if verbose:
        #    callbacks.append(PrintSamples(valid_generator, print_per_label=3))
        #    callbacks.append(PrintPerClassMetrics(valid_generator))
        model.fit_generator(train_generator,
                            epochs=config.Text2PermissionClassifier.max_train_epochs,
                            shuffle=True,
                            class_weight=permission_class_weights(packages_train),
                            validation_data=valid_generator,
                            use_multiprocessing=False,
                            verbose=verbose,
                            callbacks=callbacks
                            )
        # restore the best checkpoint saved by ModelCheckpoint
        model.load_weights(config.TrainedModels.text2permission)
        if verbose:
            print("-" * 80)
            print("- done!")
            print("-" * 80)
            print("--- VALIDATION")
        print_metrics = PrintPerClassMetrics(valid_generator)
        print_metrics.model = model
        report_valid = print_metrics.predict_batch(print_report=verbose)
        if verbose:
            print("--- TEST")
        print_metrics = PrintPerClassMetrics(test_generator)
        print_metrics.model = model
        report_test = print_metrics.predict_batch(print_report=verbose)
        #if verbose:
        #    print_samples = PrintSamples(test_generator)
        #    print_samples.model = model
        #    print_samples.predict_batch()
        if not all_folds:
            # single-fold mode: stop after the first fold
            return report_valid, report_test
        else:
            k_reports_valid.append(report_valid)
            k_reports_test.append(report_test)
    del model
    # Average the per-class reports over all folds. Assumes every fold's
    # report shares the same row/column keys as the first one; missing or
    # None entries count as 0.
    avg_reports_valid = {}
    avg_reports_test = {}
    for row in k_reports_valid[0].keys():
        for col in list(k_reports_valid[0].values())[0].keys():
            avg_reports_valid[row] = avg_reports_valid.get(row, {})
            avg_reports_valid[row][col] = mean([k_reports_valid[r].get(row, {}).get(col, 0.) or 0.
                                                for r in range(len(k_reports_valid))])
            avg_reports_test[row] = avg_reports_test.get(row, {})
            avg_reports_test[row][col] = mean([k_reports_test[r].get(row, {}).get(col, 0.) or 0.
                                               for r in range(len(k_reports_test))])
    if verbose:
        print("*" * 50)
        print(" - average over all %d folds" % k_fold_splitter.n_splits)
        print("*" * 50)
        print("VALIDATION")
        print_class_report_fbeta(avg_reports_valid)
        print()
        print("TEST")
        print_class_report_fbeta(avg_reports_test)
    return avg_reports_valid, avg_reports_test
def __init__(self):
    """Attach to the samples database and load persisted state."""
    self.db = SamplesDatabase.get()
    self.load()
    # bookkeeping for incremental updates
    self.checked_if_up_to_date = False
    self.num_added = 0
def __init__(self):
    """Attach to the samples database; LIME-processed descriptions are
    computed lazily (presumably by a later explainer run — confirm)."""
    self.descriptions_limed = None
    self.db = SamplesDatabase.get()