Code example #1
def get_relevant_files(ext):
    SamplesDatabase.set_file('samples_database.pk')
    samples_props = SamplesDatabase.get()
    packages = samples_props.filter(('lang', '==', 'en'))
    doc_files = ["%s.%s" % (f, ext) for f in packages]

    return packages, doc_files
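
The tuple-based filter API above (property, operator, reference value) can be pictured with a small stand-in. This is a hypothetical sketch of the pattern only; the real SamplesDatabase in sg10/apk-verbalizer may implement filtering differently:

import operator

# Hypothetical stand-in for SamplesDatabase.filter; names and semantics are assumptions.
_OPS = {'==': operator.eq, '>': operator.gt, '>=': operator.ge,
        'len>': lambda value, limit: len(value) > limit}

def filter_packages(records, *conditions):
    # records: package name -> property dict
    return [name for name, props in records.items()
            if all(_OPS[op](props.get(key), ref) for key, op, ref in conditions)]

records = {'com.whatsapp': {'lang': 'en'}, 'ru.mail': {'lang': 'ru'}}
print(filter_packages(records, ('lang', '==', 'en')))  # ['com.whatsapp']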
Code example #2
def from_samples_db():
    test_packages = SamplesDatabase.get().filter(('set', '==', 'test'))

    # for p in test_packages:
    #     print("-" * 50)
    #     print(" - ", SamplesDatabase.get().read(p, 'title'))
    #     print("   ", p)
    #     print()
    #     print(SamplesDatabase.get().read(p, 'description_raw'))
    #     print()

    test_packages_1 = [
        "com.dl.photo.loveframes", "mp3.tube.pro.free",
        "de.stohelit.folderplayer", "com.avg.cleaner",
        "com.gau.go.launcherex.gowidget.gopowermaster", "ru.mail",
        "de.shapeservices.impluslite", "kik.android", "com.hrs.b2c.android",
        "com.nqmobile.antivirus20.multilang", "com.yellowbook.android2",
        "com.antivirus.tablet", "taxi.android.client", "com.qihoo.security",
        "com.jb.gokeyboard.plugin.emoji", "com.niksoftware.snapseed",
        "com.forshared.music", "mobi.infolife.eraser", "com.hulu.plus",
        "com.vevo", "com.mobisystems.office", "com.whatsapp",
        "com.dropbox.android", "com.yahoo.mobile.client.android.yahoo",
        "com.jessdev.hdcameras", "com.slacker.radio",
        "com.jb.mms.theme.springtime", "ru.zdevs.zarchiver",
        "com.newsoftwares.folderlock_v1"
    ]

    test_packages = list(set(test_packages).difference(test_packages_1))

    random.shuffle(test_packages)

    runner = TestsetEvaluator(report_folder="data/reports",
                              package_names=test_packages)
    runner.run()
Code example #3
def print_sample_with_prediction(result, package_name):
    idx = result['package_names'].index(package_name)

    model = pickle.load(open(config.TFIDFModels.description_model_2, 'rb'))

    print(SamplesDatabase.get().read(package_name, 'title'))
    print("  ", list(map(lambda v: "%d%%" % int(100 * v), result['metrics'][idx][1:])))

    top_actual_desc_indices = np.argsort(-np.array(result['y'][idx]))[:10].tolist()
    top_pred_desc_indices = np.argsort(-np.array(result['predictions'][idx]))[:10].tolist()

    tokens_actual = [model.get_feature_names()[i] for i in top_actual_desc_indices]
    values_actual = [result['y'][idx][i] for i in top_actual_desc_indices]

    tokens_predicted = [model.get_feature_names()[i] for i in top_pred_desc_indices]
    values_predicted = [result['predictions'][idx][i] for i in top_pred_desc_indices]

    print(" actual:")
    for token, value in zip(tokens_actual, values_actual):
        if value >= metrics._tfidf_discretize_threshold_descriptions:
            print("   %.2f  %s" % (value, token))
    print(" predicted:")
    for token, value in zip(tokens_predicted, values_predicted):
        if value >= metrics._tfidf_discretize_threshold_descriptions:
            print("   %.2f  %s" % (value, token))

    print()
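
The top-10 selection in this function uses a standard NumPy idiom: negate the scores so that np.argsort, which sorts ascending, yields indices in descending-score order. A minimal standalone illustration:

import numpy as np

scores = np.array([0.1, 0.7, 0.3, 0.9])
top2 = np.argsort(-scores)[:2].tolist()  # indices of the two largest scores
print(top2)  # [3, 1]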
Code example #4
    def __init__(self,
                 packages,
                 shuffle=True,
                 at_once=False,
                 verbose=True,
                 batch_size=None):
        self.log = logging.getLogger()

        self.batch_size = batch_size or config.Text2PermissionClassifier.batch_size
        self.packages = packages

        self.at_once = at_once  # for validation: no batches, calculate at once

        self.permissions_parser = PermissionParser(mode='groups')
        self.num_permissions = self.permissions_parser.count()

        self.shuffle = shuffle
        self.indexes = []
        self.db = SamplesDatabase.get()

        if verbose:
            print("Generator loaded: %d files" % len(self.packages))

        self.embedded_samples = EmbeddedSamples.get()
        self.on_epoch_end()
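
Generators like this one usually implement the keras.utils.Sequence contract: __len__ gives the number of batches, __getitem__ returns one batch, and on_epoch_end (called by Keras after every epoch) reshuffles the index order. A minimal sketch of that contract with made-up data and no SamplesDatabase dependency:

import numpy as np
from tensorflow import keras  # assumption: tf.keras; the project may use standalone Keras

class MinimalGenerator(keras.utils.Sequence):
    def __init__(self, X, y, batch_size=32, shuffle=True):
        self.X, self.y = X, y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(X))
        self.on_epoch_end()

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        batch = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        return self.X[batch], self.y[batch]

    def on_epoch_end(self):
        # reshuffle sample order between epochs
        if self.shuffle:
            np.random.shuffle(self.indexes)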
Code example #5
File: report_saver.py  Project: sg10/apk-verbalizer
    def set_app_info(self, package_name):
        db = SamplesDatabase.get()
        if package_name is None:
            raise RuntimeError("no package name!")
        app_name = db.read(package_name, 'title')
        self.package_name = package_name
        self.app_name = app_name
        self.version = ""
        self.app_icon = None
Code example #6
File: test_apk.py  Project: sg10/apk-verbalizer
    def get_fallback_app_info(self):
        db = SamplesDatabase.get()
        if self.package_name is None:
            raise RuntimeError("no package name!")
        self.app_name = db.read(self.package_name, 'title')
        self.report_saver.package_name = self.package_name
        self.report_saver.app_name = self.app_name
        self.report_saver.version = ""
        self.report_saver.app_icon = None
Code example #7
    def __init__(self, generator, print_per_label=2):
        super().__init__()
        self.generator = generator
        self.permission_parser = PermissionParser('groups')
        num_batches = min(config.Text2PermissionClassifier.batch_size // self.permission_parser.count(),
                          len(self.generator))
        self.batch_indices = np.random.choice(len(self.generator), num_batches, replace=False)
        self.threshold_true = 0.65
        self.print_per_label = print_per_label
        self.db = SamplesDatabase.get()
Code example #8
def permission_class_weights(package_names):
    permission_parser = PermissionParser('groups')
    db = SamplesDatabase.get()

    count = np.zeros(shape=permission_parser.count())

    for p in package_names:
        count += permission_parser.transform(db.read(p, 'permissions'))

    weights = len(package_names)/(permission_parser.count() * count)
    weights = {i: w for i, w in enumerate(weights)}

    return weights
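
The weight formula is the usual "balanced" class-weight heuristic, n_samples / (n_classes * count_per_class): permission groups that occur rarely receive proportionally larger weights. A small numeric sketch with made-up counts:

import numpy as np

count = np.array([500.0, 50.0, 250.0, 200.0])  # occurrences per permission group
n_packages, n_groups = 1000, len(count)
weights = n_packages / (n_groups * count)
print(weights)  # [0.5  5.   1.   1.25]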
Code example #9
def test_predictions_for_submodel(config_class, tfidf_data, tfidf_model, model_save_file):
    print("-" * 50)
    print("-    ", config_class.__name__)
    print("-" * 50)

    m.model_config_class = config_class

    db = SamplesDatabase.get()
    package_names = db.filter(('lang', '==', 'en'),
                              ('set', '==', 'test'),
                              ('description_raw', 'len>', 100),
                              ('cross_platform', '==', False))

    keras.backend.clear_session()

    test_generator = TFIDFGenerator(tfidf_input_model_file=tfidf_model,
                                    tfidf_input_data_file=tfidf_data,
                                    package_names=package_names,
                                    batch_size=128,
                                    verbose=True)

    model = model_for_tf_idfs(test_generator.get_num_inputs(),
                              test_generator.get_num_outputs())
    model.compile(optimizer=Adam(lr=config_class.learning_rate, amsgrad=True),
                  loss='mse',
                  metrics=get_metrics_for_a2p_model())

    model.load_weights(model_save_file)

    eval_values = model.evaluate_generator(test_generator, verbose=False)
    train_summary(eval_values, model, test_generator)

    result = {
        'X': [],
        'y': [],
        'package_names': [],
        'metrics': [],
        'predictions': []
    }

    for batch_index in range(len(test_generator)):
        X, y, package_names = test_generator.get_batch_and_details(batch_index, get_packages=True)
        metrics = [model.evaluate(X[i:i + 1], y[i:i + 1], verbose=False) for i in range(X.shape[0])]
        predictions = model.predict_on_batch(X)
        result['X'] += X.tolist()
        result['y'] += y.tolist()
        result['package_names'] += package_names
        result['metrics'] += metrics
        result['predictions'] += predictions.tolist()

    return result
Code example #10
def test():
    from verifier.preprocessing.samples_database import SamplesDatabase
    e = PreTrainedEmbeddings.get()
    pt = e.get_delimiter_regex_pattern()

    db = SamplesDatabase.get()
    desc = db.read('com.whatsapp', 'description_raw')
    ts = re.split(pt, desc, flags=re.IGNORECASE)

    print(desc)
    print("|".join(ts))

    ids = e.tokens_to_indices(ts)
    ewords = [e.index2word(i) for i in ids]
    print(" ".join(ewords))
Code example #11
File: training.py  Project: sg10/apk-verbalizer
def train(verbose=True, reduced_dataset=False):
    if not verbose:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
        logging.set_verbosity(logging.ERROR)

    db = SamplesDatabase.get()
    package_names = db.filter(
        ('lang', '==', 'en'), ('set', '==', 'train+valid'),
        ('description_raw', 'len>', 100), ('cross_platform', '==', False))
    if reduced_dataset:
        random.seed(42)
        random.shuffle(package_names)
        package_names = package_names[:11000]
        random.seed(None)
    else:
        random.shuffle(package_names)

    # package_names = package_names[:3000]

    if verbose:
        print("# apps for training (db): ", len(package_names))

    performance = []

    if config.TFIDFClassifier.ModelStrings.enabled:
        performance = train_tfidf_classifier(
            config.TFIDFClassifier.ModelStrings, package_names,
            config.TFIDFModels.code_stringres_data,
            config.TFIDFModels.code_stringres_model,
            config.TrainedModels.app2text_stringres, verbose)

    if config.TFIDFClassifier.ModelResourceIds.enabled:
        performance = train_tfidf_classifier(
            config.TFIDFClassifier.ModelResourceIds, package_names,
            config.TFIDFModels.code_ids_data,
            config.TFIDFModels.code_ids_model,
            config.TrainedModels.app2text_ids, verbose)

    if config.TFIDFClassifier.ModelMethods.enabled:
        performance = train_tfidf_classifier(
            config.TFIDFClassifier.ModelMethods, package_names,
            config.TFIDFModels.code_methods_data,
            config.TFIDFModels.code_methods_model,
            config.TrainedModels.app2text_methods, verbose)

    return performance
Code example #12
def print_sample_stats():
    db = SamplesDatabase.get()

    total = len(db.read(None, "title"))
    crossplatform = len(db.filter(('cross_platform', '==', True)))
    en = len(db.filter(('lang', '==', 'en'), ('description_raw', 'len>', 30)))

    downloads = len(
        db.filter(
            ('downloads', '>=', config.Clustering.min_downloads_visualize)))

    print("# total = ", total)
    print("# cross platform = ", crossplatform, "   ",
          100 * crossplatform / total, "%")
    print("# en > 30 = ", en, "   ", 100 * en / total, "%")
    print("# downloads >= 4e6 = ", downloads, "   ", 100 * downloads / total,
          "%")
Code example #13
File: descriptions.py  Project: sg10/apk-verbalizer
def create():
    # SamplesDatabase.set_file('samples_database.pk')

    db = SamplesDatabase.get()
    packages = db.filter(('lang', '==', 'en'))  # [:20000]

    print("creating model")

    min_df_pct = 0.002
    max_df_pct = 0.4

    min_df = int(len(packages) * min_df_pct)
    max_df = int(len(packages) * max_df_pct)

    UnStemmer.enabled = True

    tfidf_model = TfidfVectorizer(
        tokenizer=meta_data_description_tokenize,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, 3),
        lowercase=False,  # done in tokenize function
        stop_words=get_stopwords_list())

    tfidf_model.fit(packages)

    u = UnStemmer.get()

    print("transforming data")

    tfidf_data = tfidf_model.transform(packages)

    print("saving model: ", config.TFIDFModels.description_model_2)
    tfidf_model.vocabulary_ = {
        " ".join(map(lambda x: u.resolve(x), k.split(" "))): v
        for k, v in tfidf_model.vocabulary_.items()
    }
    pickle.dump(tfidf_model, open(config.TFIDFModels.description_model_2,
                                  "wb"))
    print("saved")

    print("saving data: ", config.TFIDFModels.description_data_2)
    save_data = {'ids': packages, 'data': tfidf_data}
    pickle.dump(save_data, open(config.TFIDFModels.description_data_2, "wb"))
    print("saved")
Code example #14
    def predict_batch(self):
        num_permissions = self.permission_parser.count()

        # minimum number of positive (=1) samples per label (permission) to show

        num_print_left = {i: self.print_per_label for i in range(num_permissions)}

        output_table = StringTable()
        labels = self.permission_parser.labels()
        output_table.set_headline(labels)

        for batch_idx in self.batch_indices:
            X, y, packages = self.generator.get_item_and_package(batch_idx)
            p = self.model.predict(X)

            for i, package in enumerate(packages):
                predicted = np.rint(p[i])
                real = np.rint(y[i])

                print_this_sample = False

                for j in range(num_permissions):
                    if real[j] == 1 and num_print_left[j] > 0:
                        num_print_left[j] -= 1
                        print_this_sample = True

                if print_this_sample:
                    evals = [" %.2f--%d %s" %
                             (p[i][k], real[k], "ok" if predicted[k] == real[k] == 1 else "")
                             for k in range(real.shape[0])]
                    output_table.add_cells(evals)

                    output_table.add_cell(package)
                    output_table.add_cell(self.db.read(package, 'title'))

                    raw_text = SamplesDatabase.get().read(package, 'description_raw')
                    output_table.add_cell(self.get_top_words_per_class(raw_text))

                    output_table.new_row()

        output_table.set_cell_length(-1, 1000)
        for row in output_table.create_table(return_rows=True):
            print(row)
Code example #15
File: descriptions.py  Project: sg10/apk-verbalizer
def meta_data_description_tokenize(package_name):
    text = SamplesDatabase.get().read(package_name, 'description_raw')
    text = html.unescape(text)
    tokens = tokenize_text(text)
    return tokens
Code example #16
    def process_a2t_input(self, input_mode):
        print(input_mode)

        if input_mode == 'methods':
            model.model_config_class = config.TFIDFClassifier.ModelMethods
            tfidf_model_file = config.TFIDFModels.code_methods_model
            tfidf_data_file = config.TFIDFModels.code_methods_data
            ml_model_file = config.TrainedModels.app2text_methods
        elif input_mode == 'ids':
            model.model_config_class = config.TFIDFClassifier.ModelResourceIds
            tfidf_model_file = config.TFIDFModels.code_ids_model
            tfidf_data_file = config.TFIDFModels.code_ids_data
            ml_model_file = config.TrainedModels.app2text_ids
        elif input_mode == 'strings':
            model.model_config_class = config.TFIDFClassifier.ModelStrings
            tfidf_model_file = config.TFIDFModels.code_stringres_model
            tfidf_data_file = config.TFIDFModels.code_stringres_data
            ml_model_file = config.TrainedModels.app2text_stringres
        else:
            raise RuntimeError("unknown input mode")

        test_generator = TFIDFGenerator(
            tfidf_input_data_file=tfidf_data_file,
            tfidf_input_model_file=tfidf_model_file,
            package_names=self.package_names,
            batch_size=1,
            shuffle=False,
            verbose=True)

        print("loading ML model")

        K.backend.clear_session()

        ml_model = model_for_tf_idfs(test_generator.get_num_inputs(),
                                     test_generator.get_num_outputs())

        ml_model.compile(optimizer='adam', loss='mse', metrics=['mse'])

        ml_model.load_weights(ml_model_file)

        # tensorflow.get_default_graph().finalize()

        print("predicting")

        y = ml_model.predict_generator(test_generator)

        for i in range(y.shape[0]):
            package_name = test_generator.package_names[i]

            descriptions_tfidf_model = test_generator.descriptions['model']
            input_tfidf_model = test_generator.inputs
            input_tfidf_vector = test_generator.inputs.tfidf_data[
                test_generator.inputs.doc_ids.index(package_name)]

            output_words = get_top_terms(y[i],
                                         descriptions_tfidf_model,
                                         top_k=self.n_top_terms)
            words_pred = {word: float(value) for word, value in output_words}
            descriptions_tfidf_indices = np.argsort(
                -y[i]).flatten().tolist()[:self.n_top_terms]

            influences = get_input_influences(input_tfidf_model,
                                              input_tfidf_vector,
                                              descriptions_tfidf_model,
                                              descriptions_tfidf_indices,
                                              ml_model)

            progress_pct = "%3d%%    " % ((100 * (i + 1)) // y.shape[0])
            print("-" * 80)
            print(progress_pct, package_name)
            print("-" * 80)
            if len(influences) > 0:
                for word, score in output_words[:7]:
                    print("[%.3f]    %s" % (score, word))
                    if word not in influences:
                        continue
                    for influence, score2 in influences[word][:7]:
                        print("        [%.5f]    %s" % (score2, influence))
            print()

            report_saver = ReportSaver(report_folder=self.report_folder)
            report_saver.set_app_info(package_name)

            report_saver.a2t['words_pred'] = report_saver.a2t.get(
                'words_pred', {})
            report_saver.a2t['words_pred'][input_mode] = words_pred
            report_saver.a2t['input_values'] = report_saver.a2t.get(
                'input_values', {})
            report_saver.a2t['input_values'][input_mode] = influences
            report_saver.a2t['text_actual'] = SamplesDatabase.get().read(
                package_name, "description_raw")

            report_saver.save()

        K.backend.clear_session()
Code example #17
File: test_apk.py  Project: sg10/apk-verbalizer
    def get_fallback_description(self):
        self.text = SamplesDatabase.get().read(self.package_name,
                                               'description_raw')
Code example #18
def train(verbose=True, all_folds=False):

    if not verbose:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
        logging.set_verbosity(logging.ERROR)

    description_embedding_tokens_type = "description_num_tokens_%s" % get_t2p_word_embedding_type()

    db = SamplesDatabase.get()
    package_names = db.filter(#('lang', '==', 'en'),
                              ('set', '==', 'train+valid'),
                              (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names)

    if verbose:
        print("packages from db with criteria: ", len(package_names))

    k_fold_splitter = KFold(n_splits=config.Text2PermissionClassifier.validation_split)

    k_reports_valid = []
    k_reports_test = []

    package_names_test = db.filter(('lang', '==', 'en'),
                                   ('set', '==', 'test'),
                                   (description_embedding_tokens_type, '>=', 20))
    random.shuffle(package_names_test)
    test_generator = Generator(packages=package_names_test, batch_size=128, verbose=False)

    keras.backend.clear_session()

    model = model_multiconv_1d(PermissionParser(mode='groups').count())

    for fold_number, (train_index, valid_index) in enumerate(k_fold_splitter.split(package_names)):
        print("FOLD:                 ", fold_number+1)

        packages_train = np.array(package_names)[train_index].tolist()
        packages_valid = np.array(package_names)[valid_index].tolist()

        model.compile(loss="binary_crossentropy",
                      optimizer=Adam(0.0001),
                      metrics=[metrics.fb_micro, metrics.fb_macro, metrics.precision, metrics.recall])
        train_metric = 'val_fb_macro'

        # keras.utils.plot_model(model, "model.png", show_shapes=True)
        if verbose and fold_number == 0:
            model.summary()

        train_generator = Generator(packages=packages_train, verbose=verbose)
        valid_generator = Generator(packages=packages_valid, batch_size=128, verbose=verbose)

        callbacks = [
            keras.callbacks.EarlyStopping(monitor=train_metric,
                                          mode='max',
                                          min_delta=config.Text2PermissionClassifier.early_stopping_delta,
                                          patience=config.Text2PermissionClassifier.early_stopping_patience,
                                          verbose=verbose),
            keras.callbacks.ModelCheckpoint(filepath=config.TrainedModels.text2permission,
                                            monitor=train_metric,
                                            mode='max',
                                            save_best_only=True,
                                            verbose=verbose)
        ]

        #if verbose:
        #    callbacks.append(PrintSamples(valid_generator, print_per_label=3))
        #    callbacks.append(PrintPerClassMetrics(valid_generator))

        model.fit_generator(train_generator,
                            epochs=config.Text2PermissionClassifier.max_train_epochs,
                            shuffle=True,
                            class_weight=permission_class_weights(packages_train),
                            validation_data=valid_generator,
                            use_multiprocessing=False,
                            verbose=verbose,
                            callbacks=callbacks
                            )

        model.load_weights(config.TrainedModels.text2permission)

        if verbose:
            print("-" * 80)
            print("-  done!")
            print("-" * 80)

            print("--- VALIDATION")

        print_metrics = PrintPerClassMetrics(valid_generator)
        print_metrics.model = model
        report_valid = print_metrics.predict_batch(print_report=verbose)

        if verbose:
            print("--- TEST")

        print_metrics = PrintPerClassMetrics(test_generator)
        print_metrics.model = model
        report_test = print_metrics.predict_batch(print_report=verbose)

        #if verbose:
        #    print_samples = PrintSamples(test_generator)
        #    print_samples.model = model
        #    print_samples.predict_batch()

        if not all_folds:
            return report_valid, report_test
        else:
            k_reports_valid.append(report_valid)
            k_reports_test.append(report_test)

    del model

    avg_reports_valid = {}
    avg_reports_test = {}

    for row in k_reports_valid[0].keys():
        for col in list(k_reports_valid[0].values())[0].keys():
            avg_reports_valid[row] = avg_reports_valid.get(row, {})
            avg_reports_valid[row][col] = mean([k_reports_valid[r].get(row, {}).get(col, 0.) or 0. for r in range(len(k_reports_valid))])
            avg_reports_test[row] = avg_reports_test.get(row, {})
            avg_reports_test[row][col] = mean([k_reports_test[r].get(row, {}).get(col, 0.) or 0. for r in range(len(k_reports_test))])

    if verbose:
        print("*" * 50)
        print(" - average over all %d folds" % k_fold_splitter.n_splits)
        print("*" * 50)
        print("VALIDATION")
        print_class_report_fbeta(avg_reports_valid)
        print()
        print("TEST")
        print_class_report_fbeta(avg_reports_test)

    return avg_reports_valid, avg_reports_test
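
For reference, KFold.split yields integer index arrays, which is why package_names is wrapped in np.array before indexing. A compact illustration of the fold layout:

from sklearn.model_selection import KFold

names = ['a', 'b', 'c', 'd', 'e', 'f']
for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=3).split(names)):
    print(fold, train_idx.tolist(), valid_idx.tolist())
# 0 [2, 3, 4, 5] [0, 1]
# 1 [0, 1, 4, 5] [2, 3]
# 2 [0, 1, 2, 3] [4, 5]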
Code example #19
    def __init__(self):
        self.db = SamplesDatabase.get()
        self.load()
        self.checked_if_up_to_date = False

        self.num_added = 0
Code example #20
    def __init__(self):
        self.descriptions_limed = None
        self.db = SamplesDatabase.get()