def load_LSH(lib_profiles, mode=MODE.SCALABLE, repackage=False, processes=None):
    """Load library profiles to an LSH object.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode.
            Either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        repackage (bool, optional): Defaults to False. Should LibID consider classes
            repackaging? This should only be enabled if already know classes
            repackaging is applied.
        processes (int, optional): Defaults to None. The number of processes to use.
            If processes is None then the number returned by cpu_count() is used.
    """
    global LSH, LIB_RELATIONSHIP_GRAPHS

    # NOTE(review): weights trade containment vs. similarity in the ensemble;
    # equal weighting appears to be intended for the repackaging case — confirm
    # against the datasketch MinHashLSHEnsemble documentation.
    lsh_weights = (0.5, 0.5) if repackage else (0.1, 0.9)
    LSH = MinHashLSHEnsemble(
        threshold=LSH_THRESHOLD,
        num_perm=LSH_PERM_NUM,
        num_part=32,
        weights=lsh_weights)

    minhash_list, LIB_RELATIONSHIP_GRAPHS = profiler.parallel_load_libs_profile(
        lib_profiles=lib_profiles,
        mode=mode,
        repackage=repackage,
        processes=processes)

    LOGGER.info("Start indexing LSH (this could take a while) ...")
    indexing_started = time.time()
    LSH.index(minhash_list)
    LOGGER.info("LSH indexed. Duration: %fs", time.time() - indexing_started)
def parallel_profiling_binaries(paths, output_folder, profile_type, processes=1, overwrite=False):
    """Profiling Android app/library binaries to JSON files.

    Args:
        paths (list): The list of binaries.
        output_folder (str): The folder to store profiles.
        profile_type (str): Either 'app' or 'lib'.
        processes (int, optional): Defaults to 1. The number of processes to use.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite the
            binary profile if it exists?
    """
    start_time = time.time()

    if processes == 1:
        failed_binaries = map(
            _profiling_binary,
            izip(paths, repeat(output_folder), repeat(profile_type),
                 repeat(overwrite)))
    else:
        pool = Pool(processes=processes)
        try:
            failed_binaries = pool.map(
                _profiling_binary,
                izip(paths, repeat(output_folder), repeat(profile_type),
                     repeat(overwrite)))
        finally:
            # BUGFIX: the pool was never closed/joined, leaking worker
            # processes on every call.
            pool.close()
            pool.join()

    end_time = time.time()

    # _profiling_binary returns the path on failure, None on success.
    failed_binaries = [b for b in failed_binaries if b]
    LOGGER.info("Profiling time: %fs", end_time - start_time)
    LOGGER.info("Failed binaries: %s", failed_binaries)
def _profile_libs(dex_files, jar_files, output_folder="profiles", processes=None, overwrite=False):
    """Profile library binaries, converting .jar files to .dex files first.

    Args:
        dex_files (list): Library .dex files; successfully converted jars are
            appended to this list in place.
        jar_files (list): Library .jar files to convert and profile.
        output_folder (str, optional): Defaults to "profiles". Where profiles go.
        processes (int, optional): Defaults to None. Number of worker processes.
        overwrite (bool, optional): Defaults to False. Overwrite existing profiles?
    """
    # Convert each jar file to a dex file (skipped if already converted).
    for jar_file in jar_files:
        dex_file_path = path.join(
            path.dirname(jar_file),
            path.splitext(path.basename(jar_file))[0] + ".dex")
        if not path.exists(dex_file_path):
            LOGGER.info("Converting %s to %s ...", path.basename(jar_file),
                        path.basename(dex_file_path))
            # NOTE(review): shell=True with an unquoted path breaks on paths
            # containing spaces/shell metacharacters; consider
            # subprocess.check_output([...], shell=False) with an argument list.
            cmd = "{} -o {} {}".format(DEX2JAR_PATH, dex_file_path, jar_file)
            try:
                subprocess.check_output(cmd, shell=True)
            # BUGFIX: was a bare `except:` that swallowed every exception
            # (including KeyboardInterrupt) and hid the failure reason.
            except subprocess.CalledProcessError as e:
                LOGGER.error("Conversion failed: %s", e)
                continue
            LOGGER.info("Converted")
            dex_files.append(dex_file_path)

    if dex_files:
        profiler.parallel_profiling_binaries(
            dex_files, output_folder, "lib",
            processes=processes, overwrite=overwrite)
def _export_result_to_json(analyzer, output_path, start_time):
    """Write the analyzer's matched-library report to a JSON file.

    The elapsed wall-clock time since ``start_time`` is recorded under the
    "time" key before the report is written.
    """
    report = analyzer.get_matched_libs_json_info()
    report["time"] = time.time() - start_time
    profiler.write_to_json(output_path, report)
    LOGGER.info("The result of %s is stored at %s",
                path.basename(output_path), output_path)
def __init__(self, file_path):
    """Load an app/library binary and initialize all analysis state."""
    # Handles to the parsed binary (filled in by _load_file).
    self.a = None
    self.d = []
    self.dx = []
    self.classes_names = []
    self._class_dex = {}
    self._classes_signatures = {}
    self._classes_xref_tos = {}
    self._classes_interfaces = {}
    self._classes_superclass = {}
    self.LIB_RELATIONSHIP_GRAPHS = {}

    LOGGER.info("Start loading %s ...", os.path.basename(file_path))
    self._load_file(file_path)
    LOGGER.info("%s loaded", os.path.basename(file_path))

    # packages = [1st_level_package, ..., last_level_dir]
    # package_contents[package] = [package_contents]
    # Example:
    #   packages = [("Landroid"), ..., ("Landroid/support/v4/app/demo")]
    #   package_contents["Landroid/support"] = ["Landroid/support/v4",
    #                                           "Landroid/support/v6"]
    self._package_classes = {}
    self._signature_weight = {}

    # Structural graphs (built on demand elsewhere).
    self._call_graph = None
    self._interface_graph = None
    self._superclass_graph = None
    # Classes that been called but not exist in the package
    self._ghost_graph = nx.MultiDiGraph()

    # Library identification related variables
    self._libs_matches = {}
    self._package_libs_matches = {}
    self._class_libs_matches = {}
    self._lib_packages_matches = {}
    self._lib_info = {}
    self._lib_shrink_percentage = {}
    self._lsh_classes = set()
    self._pmatch_app_classes = {}
    self._pmatch_lib_classes = {}
    self._pmatch_lib_app_classes = {}

    # Detection configuration (set by the caller before matching).
    self.mode = None
    self.consider_classes_repackaging = True
    self.shrink_threshold = None
    self.similarity_threshold = None
def _get_raw_classes_matches(self, lsh, exclude_builtin):
    """Query the LSH index for candidate library matches of every app class."""
    started = time.time()
    LOGGER.info("Start matching classes ...")

    # Excluding built-in Android libraries speeds up the matching.
    builtin_prefixes = ("Landroid/support", "Lcom/google/android/gms")
    for class_name in tqdm(self.classes_names):
        if exclude_builtin and class_name.startswith(builtin_prefixes):
            self._class_libs_matches[class_name] = set()
        else:
            self._class_libs_matches[class_name] = \
                self._get_raw_class_matches(class_name, lsh)

    LOGGER.info("Classes matching finished. Duration: %fs",
                time.time() - started)
def _profiling_binary(profiling_info):
    """Profile a single binary to a JSON file.

    Args:
        profiling_info (tuple): (file_path, output_dir, profile_type, overwrite),
            as packed by parallel_profiling_binaries.

    Returns:
        str or None: the binary path if profiling failed, otherwise None.
    """
    (file_path, output_dir, profile_type, overwrite) = profiling_info

    name = path.splitext(path.basename(file_path))[0] + ".json"
    json_file_path = path.join(output_dir, profile_type, name)

    if overwrite or not path.exists(json_file_path):
        try:
            analyzer = LibAnalyzer(file_path)
            if profile_type == "app":
                json_info = analyzer.get_classes_signatures_json_info()
            else:
                json_info = analyzer.get_lib_classes_signatures_json_info()
            write_to_json(json_file_path, json_info)
            LOGGER.info("The binary profile is stored at %s", json_file_path)
        # BUGFIX: was the Python-2-only `except Exception, e` form; the `as`
        # form below is valid on Python 2.6+ and Python 3.
        except Exception as e:
            LOGGER.error("error: %s", e)
            return file_path
def parallel_load_libs_profile(lib_profiles, mode=MODE.SCALABLE, repackage=False, processes=1):
    """Loading library profiles as a MinHash list and relation graphs.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode.
            Either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        repackage (bool, optional): Defaults to False. Should LibID consider classes
            repackaging? This should only be enabled if already know classes
            repackaging is applied.
        processes (int, optional): Defaults to 1. The number of processes to use.

    Returns:
        tuple: (the minhash list, the relation graph dictionary)
    """
    LOGGER.info("Loading %d library profiles ...", len(lib_profiles))

    start_time = time.time()
    if processes == 1:
        results = map(_load_lib_profile,
                      izip(lib_profiles, repeat(mode), repeat(repackage)))
    else:
        pool = Pool(processes=processes)
        try:
            results = pool.map(_load_lib_profile,
                               izip(lib_profiles, repeat(mode), repeat(repackage)))
        finally:
            # BUGFIX: the pool was never closed/joined, leaking worker
            # processes on every call.
            pool.close()
            pool.join()
    end_time = time.time()

    LOGGER.info("Library profiles loaded. Duration: %fs", end_time - start_time)

    # Each result is (profile_path, minhash_list, relation_graph); relation
    # graphs are only produced/needed in ACCURATE mode.
    minhash_list = []
    lib_relationship_graphs_dict = dict()
    for result in results:
        minhash_list += result[1]
        if mode == MODE.ACCURATE:
            lib_relationship_graphs_dict[result[0]] = result[2]

    return (minhash_list, lib_relationship_graphs_dict)
def _match_libraries(self):
    """Match every candidate library against the app, retrying with class
    repackaging enabled when a plain match fails (if configured)."""
    self._get_possible_matches()

    matching_started = time.time()
    LOGGER.info("Start matching libraries ...")

    for lib_key in tqdm(self._pmatch_app_classes):
        # lib_key = "lib_name|root_package|class_num|sig_num|category|"
        lib_name, root_package, class_num, signature_num, category, _ = \
            lib_key.split("|")
        self._lib_info[lib_name] = [root_package, category]

        matched = self._check_if_library_match(lib_key, lib_name, class_num,
                                               signature_num)
        if not matched and self.consider_classes_repackaging:
            LOGGER.debug("Try matching considering class repackaging")
            LOGGER.debug("---------------------------------------------------")
            self._check_if_library_match(lib_key, lib_name, class_num,
                                         signature_num, True)

    LOGGER.info("Libraries matching finished. Duration: %fs",
                time.time() - matching_started)
def search_libs_in_apps(lib_folder=None, lib_profiles=None, app_folder=None, app_profiles=None, mode=MODE.SCALABLE, overwrite=False, output_folder='outputs', repackage=False, processes=None, exclude_builtin=True):
    """Find if specified libraries are used in specified apps. Results will be
    stored in the `output_folder` as JSON files.

    Must provide either `lib_folder` or `lib_profiles`.
    Must provide either `app_folder` or `app_profiles`.

    Args:
        lib_folder (str, optional): Defaults to None. The folder that contains
            library binaries.
        lib_profiles (list, optional): Defaults to None. The list of library profiles.
        app_folder (str, optional): Defaults to None. The folder that contains
            app binaries.
        app_profiles (list, optional): Defaults to None. The list of app profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode.
            Either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        overwrite (bool, optional): Defaults to False. Should LibID overwrite the
            output file if it exists?
        output_folder (str, optional): Defaults to 'outputs'. The folder to store
            results.
        repackage (bool, optional): Defaults to False. Should LibID consider classes
            repackaging? This should only be enabled if already know classes
            repackaging is applied.
        processes (int, optional): Defaults to None. The number of processes to use.
            If processes is None then the number returned by cpu_count() is used.
        exclude_builtin (bool, optional): Defaults to True. Should LibID exclude
            builtin Android libraries (e.g., Android Support V14)? Enable this
            option can speed up the detection process.
    """
    # ROBUSTNESS FIX: default to empty lists so len()/iteration below cannot
    # raise TypeError when neither a folder nor a profile list is supplied.
    if not app_profiles:
        app_profiles = glob2.glob(path.join(app_folder, "**/*.json")) if app_folder else []
    if not lib_profiles:
        lib_profiles = glob2.glob(path.join(lib_folder, "**/*.json")) if lib_folder else []

    if not overwrite:
        original_profile_num = len(app_profiles)
        app_profiles = [
            fp for fp in app_profiles
            if not path.exists(_get_output_path(fp, output_folder))
        ]
        ignored_profile_num = original_profile_num - len(app_profiles)
        if ignored_profile_num:
            LOGGER.warning(
                "Ignored %i app profiles because the output files already exist. Use -w to overwrite",
                ignored_profile_num)

    if app_profiles and lib_profiles:
        start_time = time.time()

        load_LSH(lib_profiles, mode=mode, repackage=repackage,
                 processes=processes)

        if processes == 1:
            map(
                _search_libs_in_app,
                izip(app_profiles, repeat(mode), repeat(output_folder),
                     repeat(repackage), repeat(exclude_builtin)))
        else:
            # BUGFIX: was Pool(processes=None), which ignored the `processes`
            # argument and always used cpu_count() workers.
            pool = Pool(processes=processes)
            try:
                pool.map(
                    _search_libs_in_app,
                    izip(app_profiles, repeat(mode), repeat(output_folder),
                         repeat(repackage), repeat(exclude_builtin)))
            finally:
                # Release worker processes; the original leaked the pool.
                pool.close()
                pool.join()

        end_time = time.time()
        # BUGFIX: "Numer" -> "Number" in the log message.
        LOGGER.info("Finished. Number of apps: %d, date: %s, duration: %fs",
                    len(app_profiles),
                    datetime.datetime.now().ctime(),
                    end_time - start_time)