Esempio n. 1
0
def load_LSH(lib_profiles,
             mode=MODE.SCALABLE,
             repackage=False,
             processes=None):
    """Build the global LSH index from the given library profiles.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The
            detection mode, either MODE.ACCURATE or MODE.SCALABLE.
        repackage (bool, optional): Defaults to False. Should LibID consider
            classes repackaging? Enable only if repackaging is known to apply.
        processes (int, optional): Defaults to None. The number of processes
            to use; None means the number returned by cpu_count().
    """

    global LSH, LIB_RELATIONSHIP_GRAPHS

    # Repackaging shifts the containment/similarity weighting.
    if repackage:
        weights = (0.5, 0.5)
    else:
        weights = (0.1, 0.9)

    LSH = MinHashLSHEnsemble(threshold=LSH_THRESHOLD,
                             num_perm=LSH_PERM_NUM,
                             num_part=32,
                             weights=weights)

    minhash_list, LIB_RELATIONSHIP_GRAPHS = \
        profiler.parallel_load_libs_profile(lib_profiles=lib_profiles,
                                            mode=mode,
                                            repackage=repackage,
                                            processes=processes)

    LOGGER.info("Start indexing LSH (this could take a while) ...")

    indexing_start = time.time()
    LSH.index(minhash_list)
    indexing_end = time.time()

    LOGGER.info("LSH indexed. Duration: %fs", indexing_end - indexing_start)
Esempio n. 2
0
def parallel_profiling_binaries(paths,
                                output_folder,
                                profile_type,
                                processes=1,
                                overwrite=False):
    """Profiling Android app/library binaries to JSON files.

    Args:
        paths (list): The list of binaries.
        output_folder (str): The folder to store profiles.
        profile_type (str): Either 'app' or 'lib'.
        processes (int, optional): Defaults to 1. The number of processes to use.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite
            the binary profile if it exists?
    """

    start_time = time.time()

    # Each worker gets a (path, output_folder, profile_type, overwrite) tuple.
    worker_args = izip(paths, repeat(output_folder), repeat(profile_type),
                       repeat(overwrite))

    if processes == 1:
        failed_binaries = map(_profiling_binary, worker_args)
    else:
        pool = Pool(processes=processes)
        try:
            failed_binaries = pool.map(_profiling_binary, worker_args)
        finally:
            # BUG FIX: the pool was never closed/joined, leaking worker
            # processes on every call.
            pool.close()
            pool.join()

    end_time = time.time()

    # _profiling_binary returns the binary path on failure, None on success.
    failed_binaries = [b for b in failed_binaries if b]

    LOGGER.info("Profiling time: %fs", end_time - start_time)
    LOGGER.info("Failed binaries: %s", failed_binaries)
Esempio n. 3
0
def _profile_libs(dex_files,
                  jar_files,
                  output_folder="profiles",
                  processes=None,
                  overwrite=False):
    """Profile library binaries, converting .jar files to .dex first.

    Successfully converted .dex paths are appended to `dex_files` (the
    caller's list is mutated, preserving the original behavior).

    Args:
        dex_files (list): The list of .dex library binaries.
        jar_files (list): The list of .jar library binaries; each is
            converted to a .dex file next to it unless one already exists.
        output_folder (str, optional): Defaults to "profiles". The folder to
            store profiles.
        processes (int, optional): Defaults to None. The number of processes
            to use.
        overwrite (bool, optional): Defaults to False. Should existing
            profiles be overwritten?
    """

    # Convert jar file to dex file
    for jar_file in jar_files:
        # FIX: use splitext instead of slicing off the last 4 characters,
        # which silently mangled any name without a 3-letter extension.
        base_name = path.splitext(path.basename(jar_file))[0]
        dex_file_path = path.join(path.dirname(jar_file), base_name + ".dex")

        if not path.exists(dex_file_path):
            LOGGER.info("Converting %s to %s ...", path.basename(jar_file),
                        path.basename(dex_file_path))
            cmd = "{} -o {} {}".format(DEX2JAR_PATH, dex_file_path, jar_file)

            try:
                # NOTE(review): shell=True with interpolated paths is
                # injection-prone if paths can be untrusted; consider an
                # argv list with shell=False.
                subprocess.check_output(cmd, shell=True)
                LOGGER.info("Converted")

                dex_files.append(dex_file_path)
            except (subprocess.CalledProcessError, OSError):
                # FIX: the bare "except:" also swallowed KeyboardInterrupt
                # and SystemExit; only conversion failures are expected here.
                LOGGER.error("Conversion failed")

    if dex_files:
        profiler.parallel_profiling_binaries(dex_files,
                                             output_folder,
                                             "lib",
                                             processes=processes,
                                             overwrite=overwrite)
Esempio n. 4
0
def _export_result_to_json(analyzer, output_path, start_time):
    """Write the analyzer's matched-library result to a JSON file.

    Args:
        analyzer: The analyzer holding the matched-library information.
        output_path (str): Destination path of the JSON result file.
        start_time (float): Epoch time when the analysis started; the total
            elapsed time is recorded under the "time" key.
    """
    result = analyzer.get_matched_libs_json_info()
    result["time"] = time.time() - start_time

    profiler.write_to_json(output_path, result)

    LOGGER.info("The result of %s is stored at %s",
                path.basename(output_path), output_path)
Esempio n. 5
0
    def __init__(self, file_path):
        """Initialize analyzer state and load the given binary.

        Args:
            file_path (str): Path to the app/library binary to analyze.
        """

        # Binary handles; NOTE(review): presumably populated by _load_file
        # below (APK object, dex files, analysis objects) — confirm there.
        self.a = None
        self.d = []
        self.dx = []

        self.classes_names = []
        self._class_dex = dict()

        # Per-class structural information, keyed by class name.
        self._classes_signatures = dict()
        self._classes_xref_tos = dict()
        self._classes_interfaces = dict()
        self._classes_superclass = dict()

        self.LIB_RELATIONSHIP_GRAPHS = dict()

        LOGGER.info("Start loading %s ...", os.path.basename(file_path))
        self._load_file(file_path)
        LOGGER.info("%s loaded", os.path.basename(file_path))

        # packages = [1st_level_package, ..., last_level_dir]
        # package_contents[package] = [package_contents]
        # Example:
        # packages = [("Landroid"), ..., ("Landroid/support/v4/app/demo")]
        # package_contents["Landroid/support"] = ["Landroid/support/v4",
        # "Landroid/support/v6"]
        self._package_classes = dict()
        self._signature_weight = dict()

        # Relationship graphs over the loaded classes; built lazily
        # (None until first needed).
        self._call_graph = None
        self._interface_graph = None
        self._superclass_graph = None

        # Classes that been called but not exist in the package
        self._ghost_graph = nx.MultiDiGraph()

        # Library identification related variables
        self._libs_matches = dict()
        self._package_libs_matches = dict()
        self._class_libs_matches = dict()
        self._lib_packages_matches = dict()
        self._lib_info = dict()
        self._lib_shrink_percentage = dict()
        self._lsh_classes = set()

        # Partial-match bookkeeping between app and library classes.
        self._pmatch_app_classes = dict()
        self._pmatch_lib_classes = dict()
        self._pmatch_lib_app_classes = dict()

        # Detection configuration; NOTE(review): apparently set by the
        # caller after construction — confirm against call sites.
        self.mode = None
        self.consider_classes_repackaging = True
        self.shrink_threshold = None
        self.similarity_threshold = None
Esempio n. 6
0
    def _get_raw_classes_matches(self, lsh, exclude_builtin):
        """Compute candidate library matches for every app class.

        Args:
            lsh: The LSH index queried for similar library classes.
            exclude_builtin (bool): If True, classes from built-in Android
                libraries are skipped to speed up matching.
        """
        matching_start = time.time()
        LOGGER.info("Start matching classes ...")

        builtin_prefixes = ("Landroid/support", "Lcom/google/android/gms")

        for name in tqdm(self.classes_names):
            # Exclude builtin libraries can speed up the matching
            if exclude_builtin and name.startswith(builtin_prefixes):
                self._class_libs_matches[name] = set()
            else:
                self._class_libs_matches[name] = \
                    self._get_raw_class_matches(name, lsh)

        matching_end = time.time()

        LOGGER.info("Classes matching finished. Duration: %fs",
                    matching_end - matching_start)
Esempio n. 7
0
def _profiling_binary(profiling_info):
    """Profile a single binary into a JSON file.

    Args:
        profiling_info (tuple): (file_path, output_dir, profile_type,
            overwrite), as built by parallel_profiling_binaries.

    Returns:
        str or None: The binary path on failure, None on success (or when
        the profile already exists and overwrite is False).
    """
    (file_path, output_dir, profile_type, overwrite) = profiling_info
    name = path.splitext(path.basename(file_path))[0] + ".json"

    json_file_path = path.join(output_dir, profile_type, name)
    if overwrite or not path.exists(json_file_path):
        try:
            analyzer = LibAnalyzer(file_path)

            # FIX: replaced the line-wrapped inline conditional call with an
            # explicit branch for readability; behavior is unchanged.
            if profile_type == "app":
                json_info = analyzer.get_classes_signatures_json_info()
            else:
                json_info = analyzer.get_lib_classes_signatures_json_info()

            write_to_json(json_file_path, json_info)
            LOGGER.info("The binary profile is stored at %s", json_file_path)
        except Exception as e:
            # FIX: "except Exception, e" is Python-2-only syntax; "as e"
            # works on both Python 2.6+ and Python 3.
            LOGGER.error("error: %s", e)
            return file_path
Esempio n. 8
0
def parallel_load_libs_profile(lib_profiles,
                               mode=MODE.SCALABLE,
                               repackage=False,
                               processes=1):
    """Loading library profiles as a MinHash list and relation graphs.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The
            detection mode, either MODE.ACCURATE or MODE.SCALABLE.
        repackage (bool, optional): Defaults to False. Should LibID consider
            classes repackaging?
        processes (int, optional): Defaults to 1. The number of processes to use.

    Returns:
        tuple: (the minhash list, the relation graph dictionary)
    """

    LOGGER.info("Loading %d library profiles ...", len(lib_profiles))

    start_time = time.time()

    worker_args = izip(lib_profiles, repeat(mode), repeat(repackage))

    if processes == 1:
        results = map(_load_lib_profile, worker_args)
    else:
        pool = Pool(processes=processes)
        try:
            results = pool.map(_load_lib_profile, worker_args)
        finally:
            # BUG FIX: the pool was never closed/joined, leaking worker
            # processes on every call.
            pool.close()
            pool.join()

    end_time = time.time()

    LOGGER.info("Library profiles loaded. Duration: %fs",
                end_time - start_time)

    minhash_list = []
    lib_relationship_graphs_dict = dict()

    # Each result appears to be (library key, minhashes, relation graph) —
    # see _load_lib_profile for the exact tuple layout.
    for result in results:
        minhash_list += result[1]
        if mode == MODE.ACCURATE:
            # Relation graphs are only collected in ACCURATE mode.
            lib_relationship_graphs_dict[result[0]] = result[2]

    return (minhash_list, lib_relationship_graphs_dict)
Esempio n. 9
0
    def _match_libraries(self):
        """Match candidate libraries against the app, retrying with
        classes-repackaging handling when the strict pass fails and
        repackaging is enabled."""
        self._get_possible_matches()

        match_start = time.time()
        LOGGER.info("Start matching libraries ...")

        for lib in tqdm(self._pmatch_app_classes):
            # Library key format:
            # "lib_name|root_package|class_num|sig_num|category|"
            lib_name, root_package, class_num, signature_num, category, _ = \
                lib.split("|")
            self._lib_info[lib_name] = [root_package, category]

            matched = self._check_if_library_match(lib, lib_name, class_num,
                                                   signature_num)

            if not matched and self.consider_classes_repackaging:
                LOGGER.debug("Try matching considering class repackaging")
                LOGGER.debug(
                    "---------------------------------------------------")
                self._check_if_library_match(lib, lib_name, class_num,
                                             signature_num, True)

        match_end = time.time()

        LOGGER.info("Libraries matching finished. Duration: %fs",
                    match_end - match_start)
Esempio n. 10
0
def search_libs_in_apps(lib_folder=None,
                        lib_profiles=None,
                        app_folder=None,
                        app_profiles=None,
                        mode=MODE.SCALABLE,
                        overwrite=False,
                        output_folder='outputs',
                        repackage=False,
                        processes=None,
                        exclude_builtin=True):
    """Find if specified libraries are used in specified apps. Results will be stored in the `output_folder` as JSON files.

    Must provide either `lib_folder` or `lib_profiles`.

    Must provide either `app_folder` or `app_profiles`.

    Args:
        lib_folder (str, optional): Defaults to None. The folder that contains library binaries.
        lib_profiles (list, optional): Defaults to None. The list of library profiles.
        app_folder (str, optional): Defaults to None. The folder that contains app binaries.
        app_profiles (list, optional): Defaults to None. The list of app profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode. Either MODE.ACCURATE or MODE.SCALABLE.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite the output file if it exists?
        output_folder (str, optional): Defaults to 'outputs'. The folder to store results.
        repackage (bool, optional): Defaults to False. Should LibID consider classes repackaging?
        processes (int, optional): Defaults to None. The number of processes to use. If processes is None then the number returned by cpu_count() is used.
        exclude_builtin (bool, optional): Defaults to True. Should LibID exclude builtin Android libraries (e.g., Android Support V14)?
    """

    # FIX: normalize both profile arguments to lists so the code below can
    # rely on them (previously len(app_profiles) raised TypeError when
    # neither app_profiles nor app_folder was given).
    if not app_profiles:
        app_profiles = (glob2.glob(path.join(app_folder, "**/*.json"))
                        if app_folder else [])

    if not lib_profiles:
        lib_profiles = (glob2.glob(path.join(lib_folder, "**/*.json"))
                        if lib_folder else [])

    if not overwrite:
        original_profile_num = len(app_profiles)
        # Skip apps whose result file already exists.
        app_profiles = [
            fp for fp in app_profiles
            if not path.exists(_get_output_path(fp, output_folder))
        ]

        ignored_profile_num = original_profile_num - len(app_profiles)
        if ignored_profile_num:
            LOGGER.warning(
                "Ignored %i app profiles because the output files already exist. Use -w to overwrite",
                ignored_profile_num)

    if app_profiles and lib_profiles:
        start_time = time.time()
        load_LSH(lib_profiles,
                 mode=mode,
                 repackage=repackage,
                 processes=processes)

        worker_args = izip(app_profiles, repeat(mode), repeat(output_folder),
                           repeat(repackage), repeat(exclude_builtin))

        if processes == 1:
            map(_search_libs_in_app, worker_args)
        else:
            # BUG FIX: was Pool(processes=None), silently ignoring the
            # user-supplied process count; also close/join the pool so
            # worker processes are not leaked.
            pool = Pool(processes=processes)
            try:
                pool.map(_search_libs_in_app, worker_args)
            finally:
                pool.close()
                pool.join()

        end_time = time.time()

        # FIX: corrected "Numer" typo in the log message.
        LOGGER.info("Finished. Number of apps: %d, date: %s, duration: %fs",
                    len(app_profiles),
                    datetime.datetime.now().ctime(), end_time - start_time)