Esempio n. 1
0
def load_LSH(lib_profiles,
             mode=MODE.SCALABLE,
             repackage=False,
             processes=None):
    """Build the module-level LSH index from library profiles.

    Rebinds the globals ``LSH`` and ``LIB_RELATIONSHIP_GRAPHS``; nothing is
    returned.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (MODE, optional): Defaults to MODE.SCALABLE. The detection mode,
            either MODE.ACCURATE or MODE.SCALABLE. See the paper for details.
        repackage (bool, optional): Defaults to False. Whether class
            repackaging should be considered. Only enable it when class
            repackaging is known to be applied.
        processes (int, optional): Defaults to None. The number of processes
            to use. If None, the number returned by cpu_count() is used.
    """
    global LSH, LIB_RELATIONSHIP_GRAPHS

    # The weight pair handed to the ensemble depends on the repackaging mode.
    if repackage:
        weights = (0.5, 0.5)
    else:
        weights = (0.1, 0.9)

    LSH = MinHashLSHEnsemble(threshold=LSH_THRESHOLD,
                             num_perm=LSH_PERM_NUM,
                             num_part=32,
                             weights=weights)

    loaded = profiler.parallel_load_libs_profile(lib_profiles=lib_profiles,
                                                 mode=mode,
                                                 repackage=repackage,
                                                 processes=processes)
    minhash_list, LIB_RELATIONSHIP_GRAPHS = loaded

    LOGGER.info("Start indexing LSH (this could take a while) ...")

    indexing_started = time.time()
    LSH.index(minhash_list)
    indexing_finished = time.time()

    LOGGER.info("LSH indexed. Duration: %fs",
                indexing_finished - indexing_started)
Esempio n. 2
0
def parallel_profiling_binaries(paths,
                                output_folder,
                                profile_type,
                                processes=1,
                                overwrite=False):
    """Profile Android app/library binaries to JSON files.

    Each binary is handed to ``_profiling_binary`` as a
    ``(path, output_folder, profile_type, overwrite)`` tuple. Binaries for
    which that call returns a truthy value are logged as failed.

    Args:
        paths (list): The list of binaries.
        output_folder (str): The folder to store profiles.
        profile_type (str): Either 'app' or 'lib'.
        processes (int, optional): Defaults to 1. The number of processes to
            use. With 1 the binaries are profiled sequentially in the current
            process; any other value is passed to ``Pool``.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite
            the binary profile if it exists?
    """

    start_time = time.time()

    jobs = izip(paths, repeat(output_folder), repeat(profile_type),
                repeat(overwrite))

    if processes == 1:
        # Built eagerly so the measured duration always covers the work.
        failed_binaries = [_profiling_binary(job) for job in jobs]
    else:
        pool = Pool(processes=processes)
        try:
            failed_binaries = pool.map(_profiling_binary, jobs)
        finally:
            # Release the worker processes even if a task raised.
            pool.close()
            pool.join()

    end_time = time.time()

    # Successful runs yield a falsy value; keep only the failures.
    failed_binaries = [b for b in failed_binaries if b]

    LOGGER.info("Profiling time: %fs", end_time - start_time)
    LOGGER.info("Failed binaries: %s", failed_binaries)
Esempio n. 3
0
def _export_result_to_json(analyzer, output_path, start_time):
    """Write the analyzer's matched-library report to ``output_path``.

    The seconds elapsed since ``start_time`` are stored in the report under
    the "time" key before it is written.
    """
    report = analyzer.get_matched_libs_json_info()
    report["time"] = time.time() - start_time
    profiler.write_to_json(output_path, report)

    result_name = path.basename(output_path)
    LOGGER.info("The result of %s is stored at %s", result_name, output_path)
Esempio n. 4
0
    def _is_pmatch_reach_threshold(self, lib):
        """Tell whether the potential match for ``lib`` meets the shrink threshold.

        Args:
            lib (str): "|"-separated library key; its fourth field is parsed
                as the library's signature count.

        Returns:
            bool: False when the shrink percentage of the potentially matched
            app classes is below ``self.shrink_threshold``, True otherwise.
        """
        lib_signature_num = int(lib.split("|")[3])

        # Computed once and reused for both the log line and the decision.
        shrink_percentage = self._get_shrink_percentage(
            self._pmatch_app_classes[lib], lib_signature_num)

        LOGGER.debug("shrink percentage (before matching): %s, %f", lib,
                     shrink_percentage)

        return not shrink_percentage < self.shrink_threshold
Esempio n. 5
0
    def __init__(self, file_path):
        """Initialise the analysis state and load ``file_path``.

        Args:
            file_path (str): Path of the file to load; it is passed to
                ``self._load_file`` and its basename is logged.
        """

        # NOTE(review): a / d / dx are presumably the parsed package, its DEX
        # objects and their analyses, filled in by _load_file -- confirm.
        self.a = None
        self.d = []
        self.dx = []

        self.classes_names = []
        self._class_dex = dict()

        # Per-class lookup tables; initialised before _load_file runs.
        self._classes_signatures = dict()
        self._classes_xref_tos = dict()
        self._classes_interfaces = dict()
        self._classes_superclass = dict()

        self.LIB_RELATIONSHIP_GRAPHS = dict()

        LOGGER.info("Start loading %s ...", os.path.basename(file_path))
        self._load_file(file_path)
        LOGGER.info("%s loaded", os.path.basename(file_path))

        # Everything below is (re)initialised only after the file is loaded.

        # packages = [1st_level_package, ..., last_level_dir]
        # package_contents[package] = [package_contents]
        # Example:
        # packages = [("Landroid"), ..., ("Landroid/support/v4/app/demo")]
        # package_contents["Landroid/support"] = ["Landroid/support/v4",
        # "Landroid/support/v6"]
        # NOTE(review): the names in the comment above (packages,
        # package_contents) do not match the attributes set here; the comment
        # may be stale.
        self._package_classes = dict()
        self._signature_weight = dict()

        # Relationship graphs start unset.
        self._call_graph = None
        self._interface_graph = None
        self._superclass_graph = None

        # Classes that are called but do not exist in the package
        self._ghost_graph = nx.MultiDiGraph()

        # Library identification related variables
        self._libs_matches = dict()
        self._package_libs_matches = dict()
        self._class_libs_matches = dict()
        self._lib_packages_matches = dict()
        self._lib_info = dict()
        self._lib_shrink_percentage = dict()
        self._lsh_classes = set()

        # NOTE(review): "pmatch" presumably means "potential match", keyed by
        # library profile key -- confirm against the code that fills them.
        self._pmatch_app_classes = dict()
        self._pmatch_lib_classes = dict()
        self._pmatch_lib_app_classes = dict()

        # Detection settings; the mode and both thresholds start unset.
        self.mode = None
        self.consider_classes_repackaging = True
        self.shrink_threshold = None
        self.similarity_threshold = None
Esempio n. 6
0
def profile_binaries(base_path=None,
                     file_paths=None,
                     output_folder='profiles',
                     processes=None,
                     overwrite=False):
    """Profile app/library binaries to JSON files.

    Either `base_path` or `file_paths` must be provided. When `file_paths` is
    non-empty it is used and `base_path` is ignored. Files are classified by
    their last four characters (.apk, .dex, .jar, .aar).

    Args:
        base_path (str, optional): Defaults to None. The folder that contains
            app/library binaries; it is searched recursively.
        file_paths (list, optional): Defaults to None. The list of app/library
            binaries.
        output_folder (str, optional): Defaults to 'profiles'. The folder to
            store profiles.
        processes (int, optional): Defaults to None. The number of processes
            to use. If None, the number returned by cpu_count() is used.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite
            the output file if it exists?
    """
    apk_files, dex_files, jar_files, aar_files = [], [], [], []

    if file_paths:
        # Route every explicit path into the list matching its suffix.
        files_by_suffix = {
            '.apk': apk_files,
            '.dex': dex_files,
            '.jar': jar_files,
            '.aar': aar_files,
        }
        for f in file_paths:
            bucket = files_by_suffix.get(f[-4:])
            if bucket is None:
                LOGGER.error("Invalid file format {}".format(f))
            else:
                bucket.append(f)
    elif base_path:
        apk_files = glob2.glob(path.join(base_path, "**/*.apk"))
        dex_files = glob2.glob(path.join(base_path, "**/*.dex"))
        jar_files = glob2.glob(path.join(base_path, "**/*.jar"))
        aar_files = glob2.glob(path.join(base_path, "**/*.aar"))
    else:
        LOGGER.error("No valid folder or file path provided.")

    # Both profilers are invoked even when nothing was collected.
    _profile_apps(apk_files,
                  output_folder=output_folder,
                  processes=processes,
                  overwrite=overwrite)
    _profile_libs(dex_files,
                  jar_files,
                  aar_files,
                  output_folder=output_folder,
                  processes=processes,
                  overwrite=overwrite)
Esempio n. 7
0
    def _check_if_library_match(self, lib, lib_name, class_num, signature_num, assume_flattened_package=False):
        """Run graph matching for one candidate library and record a match.

        Args:
            lib (str): The library profile key used to look up potential
                matches.
            lib_name (str): The library name.
            class_num (int or str): Number of classes in the library.
            signature_num (int or str): Number of signatures in the library.
            assume_flattened_package (bool, optional): Defaults to False.
                Forwarded to the relationship-graph matcher.

        Returns:
            False when the shrink percentage of the matched app classes does
            not exceed ``self.shrink_threshold``; otherwise the result of
            ``self._check_package_lib_match``.
        """
        # Callers may pass the counts as strings taken from a split profile
        # key; normalise once so every helper below receives integers.
        class_num = int(class_num)
        signature_num = int(signature_num)

        start_time = time.time()
        weight, matched_classes_pairs = self._match_relationship_graph_for_lib(
            lib, lib_name, class_num, assume_flattened_package)
        end_time = time.time()

        LOGGER.debug("graph matching time: %fs", end_time - start_time)

        # Each pair's second element is the app-side class.
        matched_app_classes = set(pair[1] for pair in matched_classes_pairs)

        shrink_percentage = self._get_shrink_percentage(
            matched_app_classes, signature_num)

        LOGGER.debug("matched weight: %f", weight)
        LOGGER.debug("shrink percentage: %f", shrink_percentage)
        LOGGER.debug("matched classes pairs: %s", matched_classes_pairs)

        if not shrink_percentage > self.shrink_threshold:
            return False

        matched_root_package = self._get_root_package(matched_app_classes)
        return self._check_package_lib_match(
            lib_name, matched_root_package, matched_classes_pairs,
            class_num, signature_num)
Esempio n. 8
0
    def _get_lib_match_similarity(self, matched_classes_pairs, lib_name, lib_class_num, lib_signature_num):
        """Compute the similarity between a library and its matched app classes.

        Returns 0 when the shrink percentage of the matched classes is below
        ``self.shrink_threshold``. Otherwise the similarity is the number of
        matched app classes divided by the smaller of the library class count
        and the number of package classes that have signatures (0 if that
        divisor is 0). Also stores the package's shrink percentage in
        ``self._lib_shrink_percentage[lib_name]``.
        """
        LOGGER.debug("matched_app_classes: %d", len(matched_classes_pairs))
        LOGGER.debug(matched_classes_pairs)

        matched_app_classes, package_classes = \
            self._get_package_classes_within_call_graph(
                matched_classes_pairs, lib_name)

        shrink_percentage = self._get_shrink_percentage(
            matched_app_classes, lib_signature_num)
        LOGGER.debug("shrink percentage (after): %f", shrink_percentage)

        if shrink_percentage < self.shrink_threshold:
            return 0

        # Only classes that actually carry signatures count from here on.
        signed_classes = [
            name for name in package_classes
            if self._classes_signatures[name]
        ]
        denominator = min(len(signed_classes), lib_class_num)

        if denominator:
            similarity = len(matched_app_classes) / float(denominator)
        else:
            similarity = 0

        self._lib_shrink_percentage[lib_name] = self._get_shrink_percentage(
            signed_classes, lib_signature_num)

        log_values = (self.filename, lib_name, len(matched_app_classes),
                      lib_class_num, len(signed_classes), similarity)
        LOGGER.debug("matching info: %s -> %s: %d, %d, %d, %f", *log_values)
        FILE_LOGGER.debug("%s -> %s: %d, %d, %d, %f", *log_values)

        return similarity
Esempio n. 9
0
    def _get_raw_classes_matches(self, lsh, exclude_builtin):
        """Fill ``self._class_libs_matches`` for every class in the app.

        When ``exclude_builtin`` is truthy, classes under the Android support
        and Google Play services prefixes get an empty match set instead of
        being queried, which speeds up matching.
        """
        builtin_prefixes = ("Landroid/support", "Lcom/google/android/gms")

        start_time = time.time()
        LOGGER.info("Start matching classes ...")

        for class_name in tqdm(self.classes_names):
            skip_lookup = exclude_builtin and class_name.startswith(
                builtin_prefixes)
            if skip_lookup:
                found = set()
            else:
                found = self._get_raw_class_matches(class_name, lsh)
            self._class_libs_matches[class_name] = found

        elapsed = time.time() - start_time

        LOGGER.info("Classes matching finished. Duration: %fs", elapsed)
Esempio n. 10
0
def _profiling_binary(profiling_info):
    """Profile a single binary and store the result as a JSON file.

    The profile is written to ``<output_dir>/<profile_type>/<name>.json``,
    where ``<name>`` is the binary's file name without its extension. An
    existing profile is left alone unless ``overwrite`` is set.

    Args:
        profiling_info (tuple): ``(file_path, output_dir, profile_type,
            overwrite)``. A ``profile_type`` of "app" selects the app
            profile; any other value selects the library profile.

    Returns:
        str or None: ``file_path`` when profiling raised an exception,
        otherwise None (also when the binary was skipped).
    """
    (file_path, output_dir, profile_type, overwrite) = profiling_info
    name = path.splitext(path.basename(file_path))[0] + ".json"

    json_file_path = path.join(output_dir, profile_type, name)
    if not overwrite and path.exists(json_file_path):
        return None

    try:
        analyzer = LibAnalyzer(file_path)

        if profile_type == "app":
            json_info = analyzer.get_classes_signatures_json_info()
        else:
            json_info = analyzer.get_lib_classes_signatures_json_info()

        write_to_json(json_file_path, json_info)
        LOGGER.info("The binary profile is stored at %s", json_file_path)
    except Exception as e:
        # Best effort: report the failure to the caller rather than abort
        # the whole batch.
        LOGGER.error("error: %s", e)
        return file_path

    return None
Esempio n. 11
0
def parallel_load_libs_profile(lib_profiles,
                               mode=MODE.SCALABLE,
                               repackage=False,
                               processes=1):
    """Load library profiles as a MinHash list and relation graphs.

    Each profile is handed to ``_load_lib_profile`` as a
    ``(profile, mode, repackage)`` tuple. Element 1 of every result is
    appended to the MinHash list; in MODE.ACCURATE element 2 is stored in the
    graph dictionary under element 0.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (MODE, optional): Defaults to MODE.SCALABLE. The detection mode,
            either MODE.ACCURATE or MODE.SCALABLE. See the paper for details.
        repackage (bool, optional): Defaults to False. Whether class
            repackaging should be considered. Only enable it when class
            repackaging is known to be applied.
        processes (int, optional): Defaults to 1. The number of processes to
            use. With 1 the profiles are loaded sequentially in the current
            process; any other value is passed to ``Pool``.

    Returns:
        tuple: (the minhash list, the relation graph dictionary). The
        dictionary stays empty unless ``mode`` is MODE.ACCURATE.
    """

    LOGGER.info("Loading %d library profiles ...", len(lib_profiles))

    start_time = time.time()

    jobs = izip(lib_profiles, repeat(mode), repeat(repackage))

    if processes == 1:
        # Built eagerly so the measured duration always covers the work.
        results = [_load_lib_profile(job) for job in jobs]
    else:
        pool = Pool(processes=processes)
        try:
            results = pool.map(_load_lib_profile, jobs)
        finally:
            # Release the worker processes even if a task raised.
            pool.close()
            pool.join()

    end_time = time.time()

    LOGGER.info("Library profiles loaded. Duration: %fs",
                end_time - start_time)

    minhash_list = []
    lib_relationship_graphs_dict = dict()
    keep_graphs = mode == MODE.ACCURATE

    for result in results:
        minhash_list.extend(result[1])
        if keep_graphs:
            lib_relationship_graphs_dict[result[0]] = result[2]

    return (minhash_list, lib_relationship_graphs_dict)
Esempio n. 12
0
def _search_libs_in_app(profile_n_mode_n_output_n_repackage_n_exclude):
    """Detect libraries in one app profile and export the result as JSON.

    The single argument is a tuple of ``(app_profile, mode, output_folder,
    repackage, exclude_builtin)``. The module-level ``LSH`` and
    ``LIB_RELATIONSHIP_GRAPHS`` are read, not modified. Any exception raised
    while analysing or exporting is logged with its traceback and swallowed.
    """
    (app_profile,
     mode,
     output_folder,
     repackage,
     exclude_builtin) = profile_n_mode_n_output_n_repackage_n_exclude

    output_path = _get_output_path(app_profile, output_folder)

    try:
        # The clock starts before the profile is loaded, so the exported
        # duration includes loading time.
        started_at = time.time()
        analyzer = LibAnalyzer(app_profile)
        analyzer.get_libraries(
            LSH,
            mode=mode,
            repackage=repackage,
            LIB_RELATIONSHIP_GRAPHS=LIB_RELATIONSHIP_GRAPHS,
            exclude_builtin=exclude_builtin)

        _export_result_to_json(analyzer, output_path, started_at)
    except Exception:
        LOGGER.exception("%s failed", app_profile)
Esempio n. 13
0
def _profile_libs(dex_files,
                  jar_files,
                  output_folder="profiles",
                  processes=None,
                  overwrite=False):
    """Convert jar files to dex files and profile all dex files as libraries.

    A jar is converted only when no dex file with the same base name exists
    next to it; a pre-existing dex is not added to ``dex_files`` here.

    Args:
        dex_files (list): Paths of dex files. Successfully converted files
            are appended to this list in place.
        jar_files (list): Paths of jar files to convert.
        output_folder (str, optional): Defaults to "profiles". The folder to
            store profiles.
        processes (int, optional): Defaults to None. The number of processes
            to use for profiling.
        overwrite (bool, optional): Defaults to False. Should an existing
            profile be overwritten?
    """
    # Convert jar file to dex file
    for f in jar_files:
        dex_file_path = path.join(path.dirname(f),
                                  path.basename(f)[:-4] + ".dex")

        if path.exists(dex_file_path):
            continue

        LOGGER.info("Converting %s to %s ...", path.basename(f),
                    path.basename(dex_file_path))
        # NOTE(review): the command is a shell string built from unquoted
        # paths, so paths with spaces or shell metacharacters will break or
        # be interpreted by the shell.
        cmd = "{} -o {} {}".format(DEX2JAR_PATH, dex_file_path, f)

        try:
            subprocess.check_output(cmd, shell=True)
        except Exception:
            # Best effort: skip this jar but keep the reason in the log, and
            # let KeyboardInterrupt / SystemExit propagate.
            LOGGER.exception("Conversion failed")
            continue

        LOGGER.info("Converted")
        dex_files.append(dex_file_path)

    if dex_files:
        profiler.parallel_profiling_binaries(dex_files,
                                             output_folder,
                                             "lib",
                                             processes=processes,
                                             overwrite=overwrite)
Esempio n. 14
0
    def _match_relationship_graph_for_lib(self, lib, lib_name, lib_class_num, assume_flattened_package=False):
        """Collect the matching inputs for one library and run ``match``.

        Gathers the potentially matching library and app classes recorded for
        ``lib``, their method-call, interface and superclass relationships,
        and a weight per app class, then returns whatever ``match`` returns.
        """
        LOGGER.debug("lib_name: %s", lib_name)

        lib_class_names = set(self._pmatch_lib_classes[lib])
        app_class_names = set(self._pmatch_app_classes[lib])
        potential_class_matches = set(self._pmatch_lib_app_classes[lib])

        (lib_method_calls,
         lib_interfaces,
         lib_superclasses) = self._get_relationship_between_classes(
             lib_class_names, lib_name)
        (app_method_calls,
         app_interfaces,
         app_superclasses) = self._get_relationship_between_classes(
             app_class_names)

        # Weight: a share of the library size plus a small per-signature term.
        app_class_weights = {
            name: 1.0 / lib_class_num +
            0.0001 * len(self._classes_signatures[name])
            for name in app_class_names
        }

        if self.consider_classes_repackaging:
            # Packages of the candidate app classes that have no subpackage.
            app_packages = (os.path.dirname(name) for name in app_class_names)
            childless_packages = set(
                pkg for pkg in app_packages
                if not self._check_package_has_subpackage(pkg))
        else:
            childless_packages = set()

        LOGGER.debug("potential matches: %d, lib calls: %d, method_calls: %d",
                     len(potential_class_matches), len(lib_method_calls),
                     len(app_method_calls))

        return match(
            lib_classnames=lib_class_names,
            app_classnames=app_class_names,
            potential_class_matches=potential_class_matches,
            lib_method_calls=lib_method_calls,
            app_method_calls=app_method_calls,
            app_class_weights=app_class_weights,
            lib_class_parents=lib_superclasses,
            app_class_parents=app_superclasses,
            lib_class_interfaces=lib_interfaces,
            app_class_interfaces=app_interfaces,
            use_pkg_hierarchy=not self.consider_classes_repackaging,
            assume_flattened_package=assume_flattened_package,
            flattened_app_pkgs_allowed=childless_packages)
Esempio n. 15
0
    def get_formatted_method_descriptor(self, encoded_method, class_descriptor, method_descriptor=None):
        """Replace all obfuscatable type names in a method descriptor with X.

        A type is replaced when it ends with ";" and is not listed in
        ``config.ANDROID_SDK_CLASSES``.

        Args:
            encoded_method (dvm.EncodedMethod): The encoded method parsed by
                Androguard; only queried when ``method_descriptor`` is falsy.
            class_descriptor (str): The class descriptor.
            method_descriptor (str, optional): Defaults to None. The method
                descriptor.

        Returns:
            str: ``class_descriptor`` followed by the formatted descriptor.
        """
        if method_descriptor:
            descriptor = method_descriptor
        else:
            descriptor = encoded_method.get_descriptor()
        LOGGER.debug("descriptor: %s", descriptor)

        # Splitting on the parentheses yields [prefix, inputs, return type].
        pieces = re.split(r"\(|\)", descriptor)
        argument_types = pieces[1].split(' ')
        result_types = pieces[2].split(' ')

        for type_name in set(argument_types).union(result_types):
            if not type_name:
                continue
            if type_name[-1] == ";" and type_name not in config.ANDROID_SDK_CLASSES:
                descriptor = descriptor.replace(type_name, "X")

        return "%s%s" % (class_descriptor, descriptor)
Esempio n. 16
0
    def _check_package_lib_match(self, lib_name, package, matched_classes_pairs, lib_class_num, lib_signature_num):
        """Decide whether ``lib_name`` matches ``package`` and record it.

        Returns False when the similarity does not exceed
        ``self.similarity_threshold``, True otherwise. Libraries whose name
        shares the part before the first "_" are treated as versions of the
        same library: a new version is bound next to the existing ones when
        its similarity is within 0.0001 of theirs, replaces them when it is
        higher, and is not bound when it is lower.
        """
        similarity = self._get_lib_match_similarity(
            matched_classes_pairs, lib_name, lib_class_num, lib_signature_num)

        LOGGER.debug("similarity: %s : %f", lib_name, similarity)

        if not similarity > self.similarity_threshold:
            return False

        # Nothing bound to this package yet.
        if package not in self._package_libs_matches:
            self._bind_lib_to_package(lib_name, similarity, package)
            return True

        name_prefix = lib_name.split("_")[0] + "_"
        same_name_libs = [
            bound for bound in self._package_libs_matches[package]
            if bound.startswith(name_prefix)
        ]

        # Other libraries are bound, but none sharing this library's name.
        if not same_name_libs:
            self._bind_lib_to_package(lib_name, similarity, package)
            return True

        existing_similarity = self._libs_matches[same_name_libs[0]]

        if abs(similarity - existing_similarity) < 0.0001:
            self._bind_lib_to_package(lib_name, similarity, package)
        elif similarity > existing_similarity:
            # Evict the weaker same-name matches from every package first.
            for stale in same_name_libs:
                del self._libs_matches[stale]
                for bound_package in self._lib_packages_matches[stale]:
                    self._package_libs_matches[bound_package].remove(stale)
                del self._lib_packages_matches[stale]

            self._bind_lib_to_package(lib_name, similarity, package)

        return True
Esempio n. 17
0
    def _match_libraries(self):
        """Try to match every potentially matching library against the app.

        For each library key, its root package and category are recorded in
        ``self._lib_info``. When the first matching attempt fails and
        ``self.consider_classes_repackaging`` is set, matching is retried
        with the flattened-package assumption enabled.
        """
        self._get_possible_matches()

        library_matching_start = time.time()
        LOGGER.info("Start matching libraries ...")

        for lib in tqdm(self._pmatch_app_classes):
            # lib = "lib_name|root_package|class_num|sig_num|category|"
            (lib_name, root_package, class_num,
             signature_num, category, _) = lib.split("|")
            self._lib_info[lib_name] = [root_package, category]

            if self._check_if_library_match(lib, lib_name, class_num, signature_num):
                continue
            if not self.consider_classes_repackaging:
                continue

            LOGGER.debug("Try matching considering class repackaging")
            LOGGER.debug("---------------------------------------------------")
            self._check_if_library_match(lib, lib_name, class_num, signature_num, True)

        elapsed = time.time() - library_matching_start

        LOGGER.info("Libraries matching finished. Duration: %fs", elapsed)
Esempio n. 18
0
def match(lib_classnames,
          app_classnames,
          potential_class_matches,
          lib_method_calls,
          app_method_calls,
          app_class_weights,
          lib_class_parents=None,
          app_class_parents=None,
          lib_class_interfaces=None,
          app_class_interfaces=None,
          use_pkg_hierarchy=True,
          assume_flattened_package=False,
          flattened_app_pkgs_allowed=None,
          use_call_graph_constraints=True):

    m = Model("")

    # If the log level is DEBUG
    if LOGGER.getEffectiveLevel() == 10:
        LOGGER.debug('%d lib classes, %d app classes', len(lib_classnames),
                     len(app_classnames))
        LOGGER.debug('%d lib methods, %d app methods', len(lib_method_calls),
                     len(app_method_calls))
    else:
        m.setParam('OutputFlag', False)

    class_match_vars = {}
    lib_class_match_count_exprs = {}
    app_class_match_count_exprs = {}
    for pcm in potential_class_matches:
        class_match_vars[pcm] = m.addVar(vtype=GRB.BINARY)
        (lib_class, app_class) = pcm

        if lib_class not in lib_class_match_count_exprs:
            lib_class_match_count_exprs[lib_class] = LinExpr(0)
        lib_class_match_count_exprs[lib_class] += class_match_vars[pcm]

        if app_class not in app_class_match_count_exprs:
            app_class_match_count_exprs[app_class] = LinExpr(0)
        app_class_match_count_exprs[app_class] += class_match_vars[pcm]

    for expr in lib_class_match_count_exprs.itervalues():
        m.addConstr(expr <= 1)

    for expr in app_class_match_count_exprs.itervalues():
        m.addConstr(expr <= 1)

    app_class_used_vars = {}
    for app_class in app_classnames:
        app_class_used_vars[app_class] = m.addVar(vtype=GRB.BINARY)
        if app_class in app_class_match_count_exprs:
            m.addConstr(app_class_used_vars[app_class] ==
                        app_class_match_count_exprs[app_class])
        else:
            m.addConstr(app_class_used_vars[app_class] == 0)

    LOGGER.debug('Method matching...')

    methods_matched_total_expr = LinExpr(0)
    if use_call_graph_constraints:
        method_matching_candidates = [
            cand for cand in get_method_matching_candidates(
                lib_method_calls, app_method_calls)
        ]

        lib_method_match_count_exprs = {}
        app_method_match_count_exprs = {}

        for lib_method_call in lib_method_calls:
            lib_method_match_count_exprs[lib_method_call] = LinExpr(0)

        for app_method_call in app_method_calls:
            app_method_match_count_exprs[app_method_call] = LinExpr(0)

        method_matching_vars = {}
        for mm in method_matching_candidates:
            lib_method_call = mm[0]
            app_method_call = mm[1]
            lib_app_class1 = (lib_method_call.class1, app_method_call.class1)
            lib_app_class2 = (lib_method_call.class2, app_method_call.class2)
            if lib_app_class1 in class_match_vars and lib_app_class2 in class_match_vars:
                method_matching_vars[mm] = m.addVar(vtype=GRB.BINARY)
                m.addConstr(method_matching_vars[mm] <=
                            class_match_vars[lib_app_class1])
                m.addConstr(method_matching_vars[mm] <=
                            class_match_vars[lib_app_class2])

                lib_method_match_count_exprs[
                    lib_method_call] += method_matching_vars[mm]
                app_method_match_count_exprs[
                    app_method_call] += method_matching_vars[mm]

                methods_matched_total_expr += 1 * method_matching_vars[mm]

        LOGGER.debug('Done')

        for expr in lib_method_match_count_exprs.itervalues():
            m.addConstr(expr <= 1)

        for app_method_call, expr in app_method_match_count_exprs.iteritems():
            app_method_class1 = app_method_call.class1
            app_method_class2 = app_method_call.class2
            tmp = m.addVar(vtype=GRB.BINARY)
            m.addConstr(tmp == and_(app_class_used_vars[app_method_class1],
                                    app_class_used_vars[app_method_class2]))
            m.addConstr(expr == tmp)

    if use_pkg_hierarchy:
        lib_pkg_parent_dict = {}
        lib_class_pkg_dict = {}
        app_pkg_parent_dict = {}
        app_class_pkg_dict = {}
        process_class_hierarchy(lib_classnames, lib_pkg_parent_dict,
                                lib_class_pkg_dict, ROOT_PKG)
        process_class_hierarchy(app_classnames, app_pkg_parent_dict,
                                app_class_pkg_dict, ROOT_PKG)

        LOGGER.debug(lib_pkg_parent_dict)
        LOGGER.debug(app_pkg_parent_dict)
        LOGGER.debug(lib_class_pkg_dict)
        LOGGER.debug(app_class_pkg_dict)

        lib_pkg_match_cnt_exprs = {}
        app_pkg_match_cnt_exprs = {}

        all_lib_pkgs = list(lib_pkg_parent_dict.keys()) + [ROOT_PKG]
        all_app_pkgs = list(app_pkg_parent_dict.keys()) + [ROOT_PKG]

        LOGGER.debug('All lib packages: %s', all_lib_pkgs)
        LOGGER.debug('All app packages: %s', all_app_pkgs)

        potential_package_matches = list(
            itertools.product(all_lib_pkgs, all_app_pkgs))

        package_matches_vars = {}
        for (lib_pkg, app_pkg) in potential_package_matches:
            match_var = m.addVar(vtype=GRB.BINARY,
                                 name=('%s/%s' % (lib_pkg, app_pkg)))
            package_matches_vars[(lib_pkg, app_pkg)] = match_var

            if lib_pkg not in lib_pkg_match_cnt_exprs:
                lib_pkg_match_cnt_exprs[lib_pkg] = LinExpr(0)
            lib_pkg_match_cnt_exprs[lib_pkg] += match_var

            if app_pkg not in app_pkg_match_cnt_exprs:
                app_pkg_match_cnt_exprs[app_pkg] = LinExpr(0)
            app_pkg_match_cnt_exprs[app_pkg] += match_var

        # Every lib package can be matched to at most one app package
        for expr in lib_pkg_match_cnt_exprs.itervalues():
            m.addConstr(expr <= 1)

        # Every app package can be matched to at most one lib package
        for expr in app_pkg_match_cnt_exprs.itervalues():
            m.addConstr(expr <= 1)

        # Packages can only match if their parent packages match too
        for (lib_pkg, app_pkg) in potential_package_matches:
            if lib_pkg == ROOT_PKG or app_pkg == ROOT_PKG:
                continue
            lib_parent_pkg = lib_pkg_parent_dict[lib_pkg]
            app_parent_pkg = app_pkg_parent_dict[app_pkg]
            match_var = package_matches_vars[(lib_pkg, app_pkg)]
            if (lib_parent_pkg, app_parent_pkg) in package_matches_vars:
                parent_match_var = package_matches_vars[(lib_parent_pkg,
                                                         app_parent_pkg)]
                m.addConstr(match_var <= parent_match_var)
            else:
                m.addConstr(match_var == 0)

        # Classes can only match if their packages also match
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            lib_class_pkg = lib_class_pkg_dict[lib_class]
            app_class_pkg = app_class_pkg_dict[app_class]
            ppm = (lib_class_pkg, app_class_pkg)

            if ppm in potential_package_matches:
                m.addConstr(class_match_vars[pcm] <= package_matches_vars[ppm])
            else:
                m.addConstr(class_match_vars[pcm] == 0)

    elif assume_flattened_package:

        app_pkg_parent_dict = {}
        app_class_pkg_dict = {}
        process_class_hierarchy(app_classnames, app_pkg_parent_dict,
                                app_class_pkg_dict, ROOT_PKG)

        app_pkg_active_vars = {}
        active_pkgs_cnt_expr = LinExpr(0)

        if flattened_app_pkgs_allowed is None:
            flattened_app_pkgs_allowed = app_pkg_parent_dict.keys()
        else:
            flattened_app_pkgs_allowed = [
                '/' + pkg for pkg in flattened_app_pkgs_allowed
            ]

        for pkg in flattened_app_pkgs_allowed:
            app_pkg_active_vars[pkg] = m.addVar(vtype=GRB.BINARY,
                                                name=('%s' % pkg))
            active_pkgs_cnt_expr += app_pkg_active_vars[pkg]

        m.addConstr(active_pkgs_cnt_expr <= 1)

        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            app_class_pkg = app_class_pkg_dict[app_class]

            if app_class_pkg in app_pkg_active_vars:
                m.addConstr(
                    class_match_vars[pcm] <= app_pkg_active_vars[app_class_pkg]
                )
            else:
                m.addConstr(class_match_vars[pcm] == 0)

    app_parents_and_interf_matched_expr = LinExpr(0)

    # Superclass matching
    if lib_class_parents:
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            parent_lib = lib_class_parents[
                lib_class] if lib_class in lib_class_parents else None
            parent_app = app_class_parents[
                app_class] if app_class in app_class_parents else None
            if parent_lib:
                if parent_app:
                    parents_match = (parent_lib, parent_app)
                    if parents_match in class_match_vars.keys():
                        if not assume_flattened_package or (
                                basename(lib_class) == basename(parent_lib) and
                                basename(app_class) == basename(parent_app)):
                            m.addConstr(class_match_vars[pcm] <=
                                        class_match_vars[parents_match])
                    else:
                        m.addConstr(class_match_vars[pcm] == 0)
                else:
                    m.addConstr(class_match_vars[pcm] == 0)
            else:
                if parent_app:
                    m.addConstr(1 - class_match_vars[pcm] >=
                                app_class_match_count_exprs[parent_app])

        for app_class, app_class_parent in app_class_parents.iteritems():
            if app_class in app_class_used_vars and app_class_parent in app_class_used_vars:
                app_class_and_parent_matched = m.addVar(vtype=GRB.BINARY)
                m.addConstr(app_class_used_vars[app_class] >=
                            app_class_and_parent_matched)
                m.addConstr(app_class_used_vars[app_class_parent] >=
                            app_class_and_parent_matched)
                app_parents_and_interf_matched_expr += app_class_and_parent_matched

    # Interface matching
    if lib_class_interfaces:
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            interfaces_lib_class = lib_class_interfaces[
                lib_class] if lib_class in lib_class_interfaces else []
            interfaces_app_class = app_class_interfaces[
                app_class] if app_class in app_class_interfaces else []

            matched_interfaces_expr = LinExpr(0)
            for lib_interface in interfaces_lib_class:
                for app_interface in interfaces_app_class:
                    interfaces_match = (lib_interface, app_interface)
                    if interfaces_match in class_match_vars:
                        if not assume_flattened_package or (
                                basename(lib_class) == basename(lib_interface)
                                and basename(app_class)
                                == basename(app_interface)):
                            matched_interfaces_expr += class_match_vars[
                                interfaces_match]

            matched_lib_interfaces_expr = LinExpr(0)
            matched_app_interfaces_expr = LinExpr(0)
            for lib_interface in interfaces_lib_class:
                if lib_interface in lib_class_match_count_exprs:
                    matched_lib_interfaces_expr += lib_class_match_count_exprs[
                        lib_interface]
            for app_interface in interfaces_app_class:
                if app_interface in app_class_match_count_exprs:
                    matched_app_interfaces_expr += app_class_match_count_exprs[
                        app_interface]

            m.addConstr(
                2 * matched_interfaces_expr == matched_app_interfaces_expr +
                matched_lib_interfaces_expr)

        for app_class, app_class_interfaces in app_class_interfaces.iteritems(
        ):
            for interface in app_class_interfaces:
                if app_class in app_class_used_vars and interface in app_class_used_vars:
                    app_class_and_interface_matched = m.addVar(
                        vtype=GRB.BINARY)
                    m.addConstr(app_class_used_vars[app_class] >=
                                app_class_and_interface_matched)
                    m.addConstr(app_class_used_vars[interface] >=
                                app_class_and_interface_matched)
                    app_parents_and_interf_matched_expr += app_class_and_interface_matched

    objective_expr = LinExpr(0)

    if use_call_graph_constraints:
        objective_expr += 0.0001 * methods_matched_total_expr + 0.0001 * app_parents_and_interf_matched_expr

    for app_class in app_classnames:
        weight = app_class_weights[app_class]
        objective_expr += weight * app_class_used_vars[app_class]
    m.setObjective(objective_expr, GRB.MAXIMIZE)

    LOGGER.debug('Optimizing...')

    m.optimize()

    matched_app_classes = set()
    class_matches = set()
    for pcm in potential_class_matches:
        if class_match_vars[pcm].x > 0.5:
            class_matches.add(pcm)
            matched_app_classes.add(pcm[1])

    LOGGER.debug('Done')
    LOGGER.debug('Class matches: %s', class_matches)

    # If the log level is DEBUG
    if LOGGER.getEffectiveLevel() == 10:

        unmatched_lib_classes = set(lib_classnames)
        unmatched_app_classes = set(app_classnames)

        class_match_cnt = 0
        for pcm in potential_class_matches:
            if class_match_vars[pcm].x > 0.5:
                class_match_cnt += 1
                if pcm[0] != pcm[1]:
                    LOGGER.debug('Potentially wrong match: %s / %s' % pcm)
                    LOGGER.debug('Lib class methods: ')
                    for lm in lib_method_calls:
                        if lm[0] == pcm[0] or lm[1] == pcm[0]:
                            LOGGER.debug(lm)
                    LOGGER.debug('App class methods: ')
                    for am in app_method_calls:
                        if am[0] == pcm[1] or am[1] == pcm[1]:
                            LOGGER.debug(am)

                if pcm[0] in lib_classnames:
                    unmatched_lib_classes.remove(pcm[0])
                else:
                    LOGGER.debug('Missing lib class: %s' % pcm[0])
                if pcm[1] in app_classnames:
                    unmatched_app_classes.remove(pcm[1])
                else:
                    LOGGER.debug('Missing lib class: %s' % pcm[1])
        LOGGER.debug('%d classes matched', class_match_cnt)
        LOGGER.debug('Unmatched lib classes:')
        for cl in unmatched_lib_classes:
            LOGGER.debug(cl)

        LOGGER.debug('Unmatched app classes:')
        for cl in unmatched_app_classes:
            LOGGER.debug(cl)

        if use_call_graph_constraints:
            LOGGER.debug('Method matches:')
            method_match_cnt = 0
            for mm in method_matching_vars.keys():
                if method_matching_vars[mm].x > 0.5:
                    LOGGER.debug(mm)
                    method_match_cnt += 1

            LOGGER.debug('%d methods matched', method_match_cnt)

        if use_pkg_hierarchy:
            LOGGER.debug('Package matches:')
            package_match_cnt = 0
            for pm in package_matches_vars.keys():
                if package_matches_vars[pm].x > 0.5:
                    LOGGER.debug(pm)
                    package_match_cnt += 1
            LOGGER.debug('%d packages matched', package_match_cnt)

        LOGGER.debug('Active packages:')
        if assume_flattened_package:
            for pkg in flattened_app_pkgs_allowed:
                LOGGER.debug('%s: %s', pkg, app_pkg_active_vars[pkg].x)

        LOGGER.debug('Objective value: %0.4f', m.objval)

    return (m.objval, class_matches)
Esempio n. 19
0
    def _get_package_classes_within_call_graph(self, matched_classes_pairs, lib_name):
        """Collect the app classes attributed to a library from its class matches.

        Every class that shares a package with a matched app class becomes a
        candidate. Outside MODE.ACCURATE those candidates are returned as-is.
        In MODE.ACCURATE they are narrowed using the call, interface and
        superclass graphs: "ghost" neighbours of matched classes are removed
        from the combined undirected graph, and only the connected components
        that still contain a matched class are kept.

        Args:
            matched_classes_pairs: Collection of (lib_class, app_class) name
                pairs. It is iterated twice, so it must not be a one-shot
                iterator.
            lib_name: Key into self.LIB_RELATIONSHIP_GRAPHS; item 3 of that
                entry is used as the library's ghost-relation graph.

        Returns:
            tuple: (matched_app_classes, classes), both sets of app class
            names. `classes` is the component-based set in MODE.ACCURATE and
            the package-based candidate set otherwise.
        """
        package_classes = set()
        matched_app_classes = set(pair[1] for pair in matched_classes_pairs)

        for class_name in matched_app_classes:
            # package could be '' if the root package is /
            package_name = os.path.dirname(class_name)
            if package_name:
                package_classes.update(self._package_classes[package_name])
            else:
                # Add the class name itself. set.update() would iterate the
                # string and insert its individual characters instead.
                package_classes.add(class_name)

        if self.mode == MODE.ACCURATE:
            # Order matters: the "type" attribute of a ghost edge is used as
            # an index into this list (0 is the call graph).
            graphs = [
                self._call_graph.subgraph(package_classes),
                self._interface_graph.subgraph(package_classes),
                self._superclass_graph.subgraph(package_classes),
            ]
            USG = nx.compose_all(graphs).to_undirected()

            LOGGER.debug("Before removing ghost: %d", len(USG.nodes()))

            # NOTE(review): ghost edges appear to describe relations from a
            # library class to a class that is not part of the library
            # profile - confirm against the profiler.
            lib_ghost_graph = self.LIB_RELATIONSHIP_GRAPHS[lib_name][3]
            for lib_class, app_class in matched_classes_pairs:
                if lib_class not in lib_ghost_graph:
                    continue

                ghost_relations = lib_ghost_graph.out_edges(lib_class, data=True)
                for _, ghost_lib_class, info in ghost_relations:
                    relation_type = info["type"]
                    relation_graph = graphs[relation_type]
                    if app_class not in relation_graph:
                        continue

                    # Unmatched neighbours of the matched app class are the
                    # candidates for the app-side counterpart of the ghost.
                    ghost_app_classes = set(
                        relation_graph.neighbors(app_class)) - matched_app_classes

                    if not self.consider_classes_repackaging:
                        # Keep only candidates whose package-depth offset from
                        # app_class equals the offset of the ghost lib class
                        # from lib_class.
                        depth_offset = ghost_lib_class.count("/") - lib_class.count("/")
                        ghost_app_classes = set(
                            c for c in ghost_app_classes
                            if c.count("/") - app_class.count("/") == depth_offset)

                    if relation_type == 0:
                        # Call graph: a candidate is a ghost only when every
                        # call from app_class to it also appears on the
                        # library's ghost edge.
                        for ghost_app_class in ghost_app_classes:
                            app_call_descriptors = set(
                                m[:2] for m in relation_graph[app_class][ghost_app_class]["method"])
                            lib_call_descriptors = set(info["method"])

                            if ghost_app_class in USG and app_call_descriptors <= lib_call_descriptors:
                                LOGGER.debug("Ghost app class found: [%d] %s, %s, %s, %s", 0, lib_class, app_class, ghost_lib_class, ghost_app_class)
                                USG.remove_node(ghost_app_class)
                    elif ghost_app_classes:
                        # Inheritance/Interface graph
                        LOGGER.debug("Ghost app classes found: [%d] %s, %s, %s, %s", relation_type, lib_class, app_class, ghost_lib_class, ghost_app_classes)
                        USG.remove_nodes_from(ghost_app_classes)

            LOGGER.debug("After removing ghost: %d", len(USG.nodes()))

            # Minimum fraction of matched classes a component needs in order
            # to be kept. With 0, any component holding at least one matched
            # class qualifies. If classes repackaging is considered it is very
            # possible to mismatch other classes inside the package; a value
            # such as 0.05 could be used in that case to remove the influence.
            threshold = 0

            ingraph_classes = set()
            # Only the node sets are needed, so connected_components() is
            # enough; connected_component_subgraphs() is gone from newer
            # NetworkX releases.
            for nodes in nx.connected_components(USG):
                matched_nodes = set(nodes).intersection(matched_app_classes)

                if len(matched_nodes) > len(nodes) * threshold:
                    ingraph_classes.update(nodes)
                else:
                    matched_app_classes -= matched_nodes

            # Some matched_app_classes may not exist in call graph
            ingraph_classes.update(matched_app_classes)

            LOGGER.debug("matched_app_classes (after): %d", len(matched_app_classes))

            return matched_app_classes, ingraph_classes
        else:
            return matched_app_classes, package_classes
Esempio n. 20
0
    json_file_path = path.join(output_dir, profile_type, name)
    if overwrite or not path.exists(json_file_path):
        try:
            analyzer = LibAnalyzer(file_path)

            json_info = analyzer.get_classes_signatures_json_info(
            ) if profile_type == "app" else analyzer.get_lib_classes_signatures_json_info(
            )
            write_to_json(json_file_path, json_info)
            LOGGER.info("The binary profile is stored at %s", json_file_path)
        except Exception, e:
            LOGGER.error("error: %s", e)
            return file_path
    else:
        LOGGER.error("The %s profile (%s) already exists. Use -w to overwrite",
                     profile_type, json_file_path)
        return file_path


def parallel_profiling_binaries(paths,
                                output_folder,
                                profile_type,
                                processes=1,
                                overwrite=False):
    """Profiling Android app/library binaries to JSON files.
    
    Args:
        paths (list): The list of binaries.
        output_folder (str): The folder to store profiles.
        profile_type (str): Either 'app' or 'lib'.
        processes (int, optional): Defaults to 1. The number of processes to use.
Esempio n. 21
0
def search_libs_in_apps(lib_folder=None,
                        lib_profiles=None,
                        app_folder=None,
                        app_profiles=None,
                        mode=MODE.SCALABLE,
                        overwrite=False,
                        output_folder='outputs',
                        repackage=False,
                        processes=None,
                        exclude_builtin=True):
    """Find if specified libraries are used in specified apps. Results will be stored in the `output_folder` as JSON files.

    Must provide either `lib_folder` or `lib_profiles`.

    Must provide either `app_folder` or `app_profiles`.

    If no app profile or no library profile is left to process, the function
    returns without doing any work.

    Args:
        lib_folder (str, optional): Defaults to None. The folder that contains library profiles (searched recursively for `*.json`). Only used when `lib_profiles` is empty.
        lib_profiles (list, optional): Defaults to None. The list of library profiles.
        app_folder (str, optional): Defaults to None. The folder that contains app profiles (searched recursively for `*.json`). Only used when `app_profiles` is empty.
        app_profiles (list, optional): Defaults to None. The list of app profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode. Either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite the output file if it exists? When False, apps whose output file already exists are skipped.
        output_folder (str, optional): Defaults to 'outputs'. The folder to store results.
        repackage (bool, optional): Defaults to False. Should LibID consider classes repackaging? This should only be enabled if it is already known that classes repackaging is applied.
        processes (int, optional): Defaults to None. The number of processes to use. If processes is None then the number returned by cpu_count() is used. With 1, the apps are processed sequentially in the current process.
        exclude_builtin (bool, optional): Defaults to True. Should LibID exclude builtin Android libraries (e.g., Android Support V14)? Enabling this option can speed up the detection process.
    """

    if not app_profiles and app_folder:
        app_profiles = glob2.glob(path.join(app_folder, "**/*.json"))

    if not lib_profiles and lib_folder:
        lib_profiles = glob2.glob(path.join(lib_folder, "**/*.json"))

    # Neither an explicit list nor a folder may have been supplied; use an
    # empty list so the filtering below cannot fail on None.
    app_profiles = app_profiles or []

    if not overwrite:
        original_profile_num = len(app_profiles)
        app_profiles = [
            fp for fp in app_profiles
            if not path.exists(_get_output_path(fp, output_folder))
        ]

        ignored_profile_num = original_profile_num - len(app_profiles)
        if ignored_profile_num:
            LOGGER.warning(
                "Ignored %i app profiles because the output files already exist. Use -w to overwrite",
                ignored_profile_num)

    if not (app_profiles and lib_profiles):
        return

    start_time = time.time()
    load_LSH(lib_profiles,
             mode=mode,
             repackage=repackage,
             processes=processes)

    # One argument tuple per app; _search_libs_in_app takes a single tuple.
    tasks = [(app_profile, mode, output_folder, repackage, exclude_builtin)
             for app_profile in app_profiles]

    if processes == 1:
        # An explicit loop runs every task regardless of whether the map()
        # builtin is eager or lazy.
        for task in tasks:
            _search_libs_in_app(task)
    else:
        # The pool is created after load_LSH().
        # NOTE(review): workers presumably rely on the module-level LSH state
        # being inherited from this process - confirm before reordering.
        pool = Pool(processes=processes)
        try:
            pool.map(_search_libs_in_app, tasks)
        finally:
            # Always release the worker processes, even when a task raises.
            pool.close()
            pool.join()

    end_time = time.time()

    LOGGER.info("Finished. Numer of apps: %d, date: %s, duration: %fs",
                len(app_profiles),
                datetime.datetime.now().ctime(), end_time - start_time)
Esempio n. 22
0
                       nargs='+',
                       help='the library profiles')
    group.add_argument('-ld',
                       metavar='FOLDER',
                       type=str,
                       help='the folder that contains library profiles')

    return parser.parse_args()


if __name__ == '__main__':

    args = parse_arguments()

    if args.v:
        LOGGER.setLevel('DEBUG')
    else:
        LOGGER.setLevel('INFO')

    LOGGER.debug("args: %s", args)

    if args.subparser_name == 'profile':
        profile_binaries(base_path=args.d,
                         file_paths=args.f,
                         output_folder=args.o,
                         processes=args.p,
                         overwrite=args.w)
    else:
        search_libs_in_apps(lib_folder=args.ld,
                            lib_profiles=args.lf,
                            app_folder=args.ad,