def _is_pmatch_reach_threshold(self, lib):
    """Check whether the pre-match candidates of *lib* reach the shrink threshold.

    Args:
        lib (str): Library key made of "|"-separated fields; the field at
            index 3 is parsed as the library's signature count.

    Returns:
        bool: False when the shrink percentage computed for
        ``self._pmatch_app_classes[lib]`` is below ``self.shrink_threshold``,
        True otherwise.
    """
    lib_signature_num = int(lib.split("|")[3])
    # Compute once: the same value is both logged and compared.
    shrink_percentage = self._get_shrink_percentage(
        self._pmatch_app_classes[lib], lib_signature_num)
    LOGGER.debug("shrink percentage (before matching): %s, %f",
                 lib, shrink_percentage)
    return not shrink_percentage < self.shrink_threshold
def _check_if_library_match(self, lib, lib_name, class_num, signature_num,
                            assume_flattened_package=False):
    """Run graph matching for one library and register the match if it holds.

    Args:
        lib (str): Library key used to look up the pre-match candidates.
        lib_name (str): Name of the library being matched.
        class_num (str or int): Number of classes in the library.
        signature_num (str or int): Number of signatures in the library.
        assume_flattened_package (bool, optional): Defaults to False.
            Forwarded to the relationship-graph matching step.

    Returns:
        bool: False when the shrink percentage of the matched app classes
        does not exceed ``self.shrink_threshold``; otherwise the result of
        ``self._check_package_lib_match``.
    """
    # Normalise the counts once so every helper receives the same (int)
    # type, whether the caller passed split strings or integers.
    lib_class_num = int(class_num)
    lib_signature_num = int(signature_num)

    start_time = time.time()
    weight, matched_classes_pairs = self._match_relationship_graph_for_lib(
        lib, lib_name, lib_class_num, assume_flattened_package)
    end_time = time.time()
    LOGGER.debug("graph matching time: %fs", end_time - start_time)

    # The app class is the second element of each matched pair.
    matched_app_classes = set(pair[1] for pair in matched_classes_pairs)
    shrink_percentage = self._get_shrink_percentage(
        matched_app_classes, lib_signature_num)

    LOGGER.debug("matched weight: %f", weight)
    LOGGER.debug("shrink percentage: %f", shrink_percentage)
    LOGGER.debug("matched classes pairs: %s", matched_classes_pairs)

    # NOTE(review): this comparison is strict (">"), whereas the other
    # threshold checks in this file only reject values strictly below the
    # threshold -- confirm whether the boundary case is intended.
    if not shrink_percentage > self.shrink_threshold:
        return False

    matched_root_package = self._get_root_package(matched_app_classes)
    return self._check_package_lib_match(
        lib_name, matched_root_package, matched_classes_pairs,
        lib_class_num, lib_signature_num)
def _get_lib_match_similarity(self, matched_classes_pairs, lib_name,
                              lib_class_num, lib_signature_num):
    """Compute the similarity score of a library against its matched classes.

    Args:
        matched_classes_pairs: Matched (lib_class, app_class) pairs.
        lib_name (str): Name of the library.
        lib_class_num (int): Number of classes in the library.
        lib_signature_num: Number of signatures in the library.

    Returns:
        0 when the shrink percentage of the matched app classes is below
        ``self.shrink_threshold``; otherwise the number of matched app
        classes divided by the smaller of the library class count and the
        number of package classes that have signatures (0 if that divisor
        is 0).
    """
    LOGGER.debug("matched_app_classes: %d", len(matched_classes_pairs))
    LOGGER.debug(matched_classes_pairs)

    matched, candidates = self._get_package_classes_within_call_graph(
        matched_classes_pairs, lib_name)

    ratio = self._get_shrink_percentage(matched, lib_signature_num)
    LOGGER.debug("shrink percentage (after): %f", ratio)
    if ratio < self.shrink_threshold:
        return 0

    # Only classes that actually carry signatures take part in the ratio.
    with_signatures = [
        c for c in candidates if self._classes_signatures[c]]
    denominator = min(len(with_signatures), lib_class_num)
    if denominator:
        similarity = len(matched) / float(denominator)
    else:
        similarity = 0

    # Side effect: record the shrink percentage of the retained classes.
    self._lib_shrink_percentage[lib_name] = self._get_shrink_percentage(
        with_signatures, lib_signature_num)

    report = (self.filename, lib_name, len(matched), lib_class_num,
              len(with_signatures), similarity)
    LOGGER.debug("matching info: %s -> %s: %d, %d, %d, %f", *report)
    FILE_LOGGER.debug("%s -> %s: %d, %d, %d, %f", *report)
    return similarity
def _match_libraries(self):
    """Try to match every pre-matched candidate library and log the duration.

    For each key of ``self._pmatch_app_classes`` the library info is
    recorded in ``self._lib_info`` and a match is attempted. When that
    attempt fails and ``self.consider_classes_repackaging`` is set, a second
    attempt is made with the flattened-package assumption enabled.
    """
    self._get_possible_matches()

    started_at = time.time()
    LOGGER.info("Start matching libraries ...")

    for lib in tqdm(self._pmatch_app_classes):
        # lib = "lib_name|root_package|class_num|sig_num|category|"
        fields = lib.split("|")
        lib_name, root_package, class_num, signature_num, category, _ = fields
        self._lib_info[lib_name] = [root_package, category]

        if self._check_if_library_match(lib, lib_name, class_num,
                                        signature_num):
            continue

        if self.consider_classes_repackaging:
            LOGGER.debug("Try matching considering class repackaging")
            LOGGER.debug("---------------------------------------------------")
            self._check_if_library_match(lib, lib_name, class_num,
                                         signature_num, True)

    finished_at = time.time()
    LOGGER.info("Libraries matching finished. Duration: %fs",
                finished_at - started_at)
def _match_relationship_graph_for_lib(self, lib, lib_name, lib_class_num,
                                      assume_flattened_package=False):
    """Collect the inputs for ``match`` for one library and run it.

    Args:
        lib (str): Library key into the pre-match dictionaries.
        lib_name (str): Name of the library.
        lib_class_num (int): Number of classes in the library; used as the
            divisor of the base per-class weight.
        assume_flattened_package (bool, optional): Defaults to False.
            Forwarded unchanged to ``match``.

    Returns:
        Whatever ``match`` returns for the assembled arguments.
    """
    LOGGER.debug("lib_name: %s", lib_name)

    lib_classes = set(self._pmatch_lib_classes[lib])
    app_classes = set(self._pmatch_app_classes[lib])
    candidate_pairs = set(self._pmatch_lib_app_classes[lib])

    lib_relations = self._get_relationship_between_classes(
        lib_classes, lib_name)
    lib_calls, lib_interfaces, lib_parents = lib_relations
    app_relations = self._get_relationship_between_classes(app_classes)
    app_calls, app_interfaces, app_parents = app_relations

    # Base weight 1/lib_class_num plus a small bonus per signature.
    weights = {
        name: 1.0 / lib_class_num +
        0.0001 * len(self._classes_signatures[name])
        for name in app_classes
    }

    # Packages without sub-packages; only collected for repackaging mode.
    leaf_packages = set()
    if self.consider_classes_repackaging:
        for name in app_classes:
            package = os.path.dirname(name)
            if not self._check_package_has_subpackage(package):
                leaf_packages.add(package)

    LOGGER.debug("potential matches: %d, lib calls: %d, method_calls: %d",
                 len(candidate_pairs), len(lib_calls), len(app_calls))

    return match(
        lib_classnames=lib_classes,
        app_classnames=app_classes,
        potential_class_matches=candidate_pairs,
        lib_method_calls=lib_calls,
        app_method_calls=app_calls,
        app_class_weights=weights,
        lib_class_parents=lib_parents,
        app_class_parents=app_parents,
        lib_class_interfaces=lib_interfaces,
        app_class_interfaces=app_interfaces,
        use_pkg_hierarchy=not self.consider_classes_repackaging,
        assume_flattened_package=assume_flattened_package,
        flattened_app_pkgs_allowed=leaf_packages)
def get_formatted_method_descriptor(self, encoded_method, class_descriptor,
                                    method_descriptor=None):
    """Build a method descriptor in which obfuscatable type names become X.

    Args:
        encoded_method: Object providing ``get_descriptor()``; only
            consulted when ``method_descriptor`` is empty or None.
        class_descriptor (str): The class descriptor, used as a prefix.
        method_descriptor (str, optional): Defaults to None. The method
            descriptor to format.

    Returns:
        str: ``class_descriptor`` followed by the rewritten descriptor.
    """
    descriptor = method_descriptor or encoded_method.get_descriptor()
    LOGGER.debug("descriptor: %s", descriptor)

    # Index 1 holds the text between the parentheses, index 2 what follows.
    pieces = re.split(r"\(|\)", descriptor)
    parameter_tokens = pieces[1].split(' ')
    return_tokens = pieces[2].split(' ')

    # NOTE(review): replace() works on substrings, so when one token
    # contains another the result may depend on set iteration order --
    # confirm whether that matters for stored signatures.
    for token in filter(None, set(parameter_tokens).union(return_tokens)):
        if not token.endswith(";"):
            continue
        if token in config.ANDROID_SDK_CLASSES:
            continue
        descriptor = descriptor.replace(token, "X")

    return "%s%s" % (class_descriptor, descriptor)
def _check_package_lib_match(self, lib_name, package, matched_classes_pairs,
                             lib_class_num, lib_signature_num):
    """Decide whether a library matches a package and record the binding.

    Args:
        lib_name (str): Name of the library; the part before the first "_"
            identifies libraries sharing the same base name.
        package: The app package the matched classes belong to.
        matched_classes_pairs: Matched (lib_class, app_class) pairs.
        lib_class_num (int): Number of classes in the library.
        lib_signature_num (int): Number of signatures in the library.

    Returns:
        bool: True when the similarity exceeds ``self.similarity_threshold``
        (even if an already-bound library with the same base name and a
        higher similarity keeps its place), False otherwise.
    """
    similarity = self._get_lib_match_similarity(
        matched_classes_pairs, lib_name, lib_class_num, lib_signature_num)
    LOGGER.debug("similarity: %s : %f", lib_name, similarity)

    if not similarity > self.similarity_threshold:
        return False

    prefix = lib_name.split("_")[0] + "_"

    # Libraries with the same base name already bound to this package.
    same_name_libs = []
    if package in self._package_libs_matches:
        same_name_libs = [
            bound for bound in self._package_libs_matches[package]
            if bound.startswith(prefix)]

    if not same_name_libs:
        self._bind_lib_to_package(lib_name, similarity, package)
        return True

    known_similarity = self._libs_matches[same_name_libs[0]]
    if abs(similarity - known_similarity) < 0.0001:
        # Same score as the existing match: keep both.
        self._bind_lib_to_package(lib_name, similarity, package)
    elif similarity > known_similarity:
        # Better score: drop every previous same-name binding first.
        for stale in same_name_libs:
            del self._libs_matches[stale]
            for bound_package in self._lib_packages_matches[stale]:
                self._package_libs_matches[bound_package].remove(stale)
            del self._lib_packages_matches[stale]
        self._bind_lib_to_package(lib_name, similarity, package)

    return True
def match(lib_classnames,
          app_classnames,
          potential_class_matches,
          lib_method_calls,
          app_method_calls,
          app_class_weights,
          lib_class_parents=None,
          app_class_parents=None,
          lib_class_interfaces=None,
          app_class_interfaces=None,
          use_pkg_hierarchy=True,
          assume_flattened_package=False,
          flattened_app_pkgs_allowed=None,
          use_call_graph_constraints=True):
    """Match library classes to app classes by solving a 0/1 program.

    One binary variable is created per pair in ``potential_class_matches``;
    every lib class and every app class can take part in at most one match.
    Further constraint families are added depending on the arguments
    (method calls, package hierarchy or flattened package, superclasses,
    interfaces). The objective maximizes the summed weight of used app
    classes, plus, when ``use_call_graph_constraints`` is set, 0.0001 per
    matched method call and per class/parent or class/interface pair whose
    two app classes are both used.

    Args:
        lib_classnames: Collection of library class names.
        app_classnames: Collection of app class names.
        potential_class_matches: Iterable of (lib_class, app_class) pairs.
        lib_method_calls: Hashable library call records exposing
            ``class1``/``class2``.
        app_method_calls: Hashable app call records exposing
            ``class1``/``class2``.
        app_class_weights (dict): Objective weight for each app class.
        lib_class_parents (dict, optional): lib class -> parent class.
            Superclass constraints are skipped when empty or None.
        app_class_parents (dict, optional): app class -> parent class.
            None is treated as empty.
        lib_class_interfaces (dict, optional): lib class -> interfaces.
            Interface constraints are skipped when empty or None.
        app_class_interfaces (dict, optional): app class -> interfaces.
            None is treated as empty.
        use_pkg_hierarchy (bool, optional): Defaults to True. Require
            matched classes to live in matched packages.
        assume_flattened_package (bool, optional): Defaults to False. Only
            consulted for package constraints when ``use_pkg_hierarchy`` is
            False: all matches must fall in a single app package.
        flattened_app_pkgs_allowed (optional): Packages allowed to be that
            single package; each entry is prefixed with '/'. None means all
            packages found among the app classes.
        use_call_graph_constraints (bool, optional): Defaults to True.

    Returns:
        tuple: (objective value, set of selected (lib_class, app_class)
        pairs).
    """
    import logging  # local import: only needed to name the DEBUG level

    # A missing app-side map means "no relationships"; without this the
    # membership tests below would fail on None.
    app_class_parents = app_class_parents or {}
    app_class_interfaces = app_class_interfaces or {}

    m = Model("")

    # Verbose diagnostics (and solver output) only at exactly DEBUG level.
    debug_enabled = LOGGER.getEffectiveLevel() == logging.DEBUG
    if debug_enabled:
        LOGGER.debug('%d lib classes, %d app classes', len(lib_classnames),
                     len(app_classnames))
        LOGGER.debug('%d lib methods, %d app methods', len(lib_method_calls),
                     len(app_method_calls))
    else:
        m.setParam('OutputFlag', False)

    # ------------------------------------------------------------------
    # Class match variables: each class takes part in at most one match.
    # ------------------------------------------------------------------
    class_match_vars = {}
    lib_class_match_count_exprs = {}
    app_class_match_count_exprs = {}

    for pcm in potential_class_matches:
        class_match_vars[pcm] = m.addVar(vtype=GRB.BINARY)
        (lib_class, app_class) = pcm

        if lib_class not in lib_class_match_count_exprs:
            lib_class_match_count_exprs[lib_class] = LinExpr(0)
        lib_class_match_count_exprs[lib_class] += class_match_vars[pcm]

        if app_class not in app_class_match_count_exprs:
            app_class_match_count_exprs[app_class] = LinExpr(0)
        app_class_match_count_exprs[app_class] += class_match_vars[pcm]

    for expr in lib_class_match_count_exprs.itervalues():
        m.addConstr(expr <= 1)
    for expr in app_class_match_count_exprs.itervalues():
        m.addConstr(expr <= 1)

    # "Used" flag per app class: equals its match count (0 or 1).
    app_class_used_vars = {}
    for app_class in app_classnames:
        app_class_used_vars[app_class] = m.addVar(vtype=GRB.BINARY)
        if app_class in app_class_match_count_exprs:
            m.addConstr(app_class_used_vars[app_class] ==
                        app_class_match_count_exprs[app_class])
        else:
            m.addConstr(app_class_used_vars[app_class] == 0)

    # ------------------------------------------------------------------
    # Method call matching.
    # ------------------------------------------------------------------
    LOGGER.debug('Method matching...')
    methods_matched_total_expr = LinExpr(0)
    method_matching_vars = {}

    if use_call_graph_constraints:
        method_matching_candidates = list(
            get_method_matching_candidates(lib_method_calls,
                                           app_method_calls))

        lib_method_match_count_exprs = {}
        app_method_match_count_exprs = {}
        for lib_method_call in lib_method_calls:
            lib_method_match_count_exprs[lib_method_call] = LinExpr(0)
        for app_method_call in app_method_calls:
            app_method_match_count_exprs[app_method_call] = LinExpr(0)

        for mm in method_matching_candidates:
            lib_method_call = mm[0]
            app_method_call = mm[1]
            lib_app_class1 = (lib_method_call.class1, app_method_call.class1)
            lib_app_class2 = (lib_method_call.class2, app_method_call.class2)

            # Two calls can only match if both endpoint classes match.
            if (lib_app_class1 in class_match_vars and
                    lib_app_class2 in class_match_vars):
                method_matching_vars[mm] = m.addVar(vtype=GRB.BINARY)
                m.addConstr(method_matching_vars[mm] <=
                            class_match_vars[lib_app_class1])
                m.addConstr(method_matching_vars[mm] <=
                            class_match_vars[lib_app_class2])
                lib_method_match_count_exprs[
                    lib_method_call] += method_matching_vars[mm]
                app_method_match_count_exprs[
                    app_method_call] += method_matching_vars[mm]
                methods_matched_total_expr += 1 * method_matching_vars[mm]
        LOGGER.debug('Done')

        for expr in lib_method_match_count_exprs.itervalues():
            m.addConstr(expr <= 1)

        # An app call is matched exactly when both of its classes are used.
        for app_method_call, expr in app_method_match_count_exprs.iteritems():
            both_classes_used = m.addVar(vtype=GRB.BINARY)
            m.addConstr(both_classes_used == and_(
                app_class_used_vars[app_method_call.class1],
                app_class_used_vars[app_method_call.class2]))
            m.addConstr(expr == both_classes_used)

    # ------------------------------------------------------------------
    # Package constraints.
    # ------------------------------------------------------------------
    package_matches_vars = {}
    app_pkg_active_vars = {}

    if use_pkg_hierarchy:
        lib_pkg_parent_dict = {}
        lib_class_pkg_dict = {}
        app_pkg_parent_dict = {}
        app_class_pkg_dict = {}

        process_class_hierarchy(lib_classnames, lib_pkg_parent_dict,
                                lib_class_pkg_dict, ROOT_PKG)
        process_class_hierarchy(app_classnames, app_pkg_parent_dict,
                                app_class_pkg_dict, ROOT_PKG)

        LOGGER.debug(lib_pkg_parent_dict)
        LOGGER.debug(app_pkg_parent_dict)
        LOGGER.debug(lib_class_pkg_dict)
        LOGGER.debug(app_class_pkg_dict)

        lib_pkg_match_cnt_exprs = {}
        app_pkg_match_cnt_exprs = {}

        all_lib_pkgs = list(lib_pkg_parent_dict.keys()) + [ROOT_PKG]
        all_app_pkgs = list(app_pkg_parent_dict.keys()) + [ROOT_PKG]

        LOGGER.debug('All lib packages: %s', all_lib_pkgs)
        LOGGER.debug('All app packages: %s', all_app_pkgs)

        potential_package_matches = list(
            itertools.product(all_lib_pkgs, all_app_pkgs))

        for (lib_pkg, app_pkg) in potential_package_matches:
            match_var = m.addVar(vtype=GRB.BINARY,
                                 name=('%s/%s' % (lib_pkg, app_pkg)))
            package_matches_vars[(lib_pkg, app_pkg)] = match_var

            if lib_pkg not in lib_pkg_match_cnt_exprs:
                lib_pkg_match_cnt_exprs[lib_pkg] = LinExpr(0)
            lib_pkg_match_cnt_exprs[lib_pkg] += match_var

            if app_pkg not in app_pkg_match_cnt_exprs:
                app_pkg_match_cnt_exprs[app_pkg] = LinExpr(0)
            app_pkg_match_cnt_exprs[app_pkg] += match_var

        # Every lib package can be matched to at most one app package
        for expr in lib_pkg_match_cnt_exprs.itervalues():
            m.addConstr(expr <= 1)

        # Every app package can be matched to at most one lib package
        for expr in app_pkg_match_cnt_exprs.itervalues():
            m.addConstr(expr <= 1)

        # Packages can only match if their parent packages match too
        for (lib_pkg, app_pkg) in potential_package_matches:
            if lib_pkg == ROOT_PKG or app_pkg == ROOT_PKG:
                continue
            parent_pair = (lib_pkg_parent_dict[lib_pkg],
                           app_pkg_parent_dict[app_pkg])
            match_var = package_matches_vars[(lib_pkg, app_pkg)]
            if parent_pair in package_matches_vars:
                m.addConstr(match_var <= package_matches_vars[parent_pair])
            else:
                m.addConstr(match_var == 0)

        # Classes can only match if their packages also match
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            ppm = (lib_class_pkg_dict[lib_class],
                   app_class_pkg_dict[app_class])
            # package_matches_vars holds exactly the candidate package
            # pairs, so the dict gives a constant-time membership test.
            if ppm in package_matches_vars:
                m.addConstr(class_match_vars[pcm] <=
                            package_matches_vars[ppm])
            else:
                m.addConstr(class_match_vars[pcm] == 0)

    elif assume_flattened_package:
        app_pkg_parent_dict = {}
        app_class_pkg_dict = {}
        process_class_hierarchy(app_classnames, app_pkg_parent_dict,
                                app_class_pkg_dict, ROOT_PKG)

        active_pkgs_cnt_expr = LinExpr(0)

        if flattened_app_pkgs_allowed is None:
            flattened_app_pkgs_allowed = app_pkg_parent_dict.keys()
        else:
            flattened_app_pkgs_allowed = [
                '/' + pkg for pkg in flattened_app_pkgs_allowed
            ]

        # At most one app package may be "active".
        for pkg in flattened_app_pkgs_allowed:
            app_pkg_active_vars[pkg] = m.addVar(vtype=GRB.BINARY,
                                                name=('%s' % pkg))
            active_pkgs_cnt_expr += app_pkg_active_vars[pkg]
        m.addConstr(active_pkgs_cnt_expr <= 1)

        # A class can only match if its app package is the active one.
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            app_class_pkg = app_class_pkg_dict[app_class]
            if app_class_pkg in app_pkg_active_vars:
                m.addConstr(class_match_vars[pcm] <=
                            app_pkg_active_vars[app_class_pkg])
            else:
                m.addConstr(class_match_vars[pcm] == 0)

    app_parents_and_interf_matched_expr = LinExpr(0)

    # ------------------------------------------------------------------
    # Superclass matching
    # ------------------------------------------------------------------
    if lib_class_parents:
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            parent_lib = lib_class_parents[
                lib_class] if lib_class in lib_class_parents else None
            parent_app = app_class_parents[
                app_class] if app_class in app_class_parents else None

            if parent_lib:
                if parent_app:
                    parents_match = (parent_lib, parent_app)
                    # Dict membership, not a scan over a list of keys.
                    if parents_match in class_match_vars:
                        if not assume_flattened_package or (
                                basename(lib_class) == basename(parent_lib)
                                and basename(app_class) ==
                                basename(parent_app)):
                            m.addConstr(class_match_vars[pcm] <=
                                        class_match_vars[parents_match])
                    else:
                        # The parents can never match, so neither can
                        # the classes.
                        m.addConstr(class_match_vars[pcm] == 0)
                else:
                    m.addConstr(class_match_vars[pcm] == 0)
            else:
                if parent_app:
                    # The lib class has no parent: it may only match if
                    # the app class's parent stays unmatched.
                    m.addConstr(1 - class_match_vars[pcm] >=
                                app_class_match_count_exprs[parent_app])

        # Bonus variable for each app class used together with its parent.
        for app_class, app_class_parent in app_class_parents.iteritems():
            if (app_class in app_class_used_vars and
                    app_class_parent in app_class_used_vars):
                app_class_and_parent_matched = m.addVar(vtype=GRB.BINARY)
                m.addConstr(app_class_used_vars[app_class] >=
                            app_class_and_parent_matched)
                m.addConstr(app_class_used_vars[app_class_parent] >=
                            app_class_and_parent_matched)
                app_parents_and_interf_matched_expr += \
                    app_class_and_parent_matched

    # ------------------------------------------------------------------
    # Interface matching
    # ------------------------------------------------------------------
    if lib_class_interfaces:
        for pcm in potential_class_matches:
            (lib_class, app_class) = pcm
            interfaces_lib_class = lib_class_interfaces[
                lib_class] if lib_class in lib_class_interfaces else []
            interfaces_app_class = app_class_interfaces[
                app_class] if app_class in app_class_interfaces else []

            matched_interfaces_expr = LinExpr(0)
            for lib_interface in interfaces_lib_class:
                for app_interface in interfaces_app_class:
                    interfaces_match = (lib_interface, app_interface)
                    if interfaces_match in class_match_vars:
                        if not assume_flattened_package or (
                                basename(lib_class) ==
                                basename(lib_interface) and
                                basename(app_class) ==
                                basename(app_interface)):
                            matched_interfaces_expr += class_match_vars[
                                interfaces_match]

            matched_lib_interfaces_expr = LinExpr(0)
            matched_app_interfaces_expr = LinExpr(0)
            for lib_interface in interfaces_lib_class:
                if lib_interface in lib_class_match_count_exprs:
                    matched_lib_interfaces_expr += \
                        lib_class_match_count_exprs[lib_interface]
            for app_interface in interfaces_app_class:
                if app_interface in app_class_match_count_exprs:
                    matched_app_interfaces_expr += \
                        app_class_match_count_exprs[app_interface]

            m.addConstr(
                2 * matched_interfaces_expr == matched_app_interfaces_expr +
                matched_lib_interfaces_expr)

        # Bonus variable for each app class used together with one of its
        # interfaces. The loop variable no longer shadows the parameter.
        for app_class, implemented in app_class_interfaces.iteritems():
            for interface in implemented:
                if (app_class in app_class_used_vars and
                        interface in app_class_used_vars):
                    app_class_and_interface_matched = m.addVar(
                        vtype=GRB.BINARY)
                    m.addConstr(app_class_used_vars[app_class] >=
                                app_class_and_interface_matched)
                    m.addConstr(app_class_used_vars[interface] >=
                                app_class_and_interface_matched)
                    app_parents_and_interf_matched_expr += \
                        app_class_and_interface_matched

    # ------------------------------------------------------------------
    # Objective and solve.
    # ------------------------------------------------------------------
    objective_expr = LinExpr(0)
    if use_call_graph_constraints:
        objective_expr += (0.0001 * methods_matched_total_expr +
                           0.0001 * app_parents_and_interf_matched_expr)
    for app_class in app_classnames:
        weight = app_class_weights[app_class]
        objective_expr += weight * app_class_used_vars[app_class]

    m.setObjective(objective_expr, GRB.MAXIMIZE)
    LOGGER.debug('Optimizing...')
    m.optimize()

    # Binary variables come back as floats; > 0.5 means "selected".
    class_matches = set(pcm for pcm in potential_class_matches
                        if class_match_vars[pcm].x > 0.5)
    LOGGER.debug('Done')
    LOGGER.debug('Class matches: %s', class_matches)

    if debug_enabled:
        unmatched_lib_classes = set(lib_classnames)
        unmatched_app_classes = set(app_classnames)
        class_match_cnt = 0

        for pcm in potential_class_matches:
            if class_match_vars[pcm].x > 0.5:
                class_match_cnt += 1
                if pcm[0] != pcm[1]:
                    LOGGER.debug('Potentially wrong match: %s / %s',
                                 pcm[0], pcm[1])
                    LOGGER.debug('Lib class methods: ')
                    for lm in lib_method_calls:
                        if lm[0] == pcm[0] or lm[1] == pcm[0]:
                            LOGGER.debug(lm)
                    LOGGER.debug('App class methods: ')
                    for am in app_method_calls:
                        if am[0] == pcm[1] or am[1] == pcm[1]:
                            LOGGER.debug(am)

                if pcm[0] in lib_classnames:
                    unmatched_lib_classes.discard(pcm[0])
                else:
                    LOGGER.debug('Missing lib class: %s', pcm[0])
                if pcm[1] in app_classnames:
                    unmatched_app_classes.discard(pcm[1])
                else:
                    LOGGER.debug('Missing app class: %s', pcm[1])

        LOGGER.debug('%d classes matched', class_match_cnt)
        LOGGER.debug('Unmatched lib classes:')
        for cl in unmatched_lib_classes:
            LOGGER.debug(cl)
        LOGGER.debug('Unmatched app classes:')
        for cl in unmatched_app_classes:
            LOGGER.debug(cl)

        if use_call_graph_constraints:
            LOGGER.debug('Method matches:')
            method_match_cnt = 0
            for mm, method_var in method_matching_vars.iteritems():
                if method_var.x > 0.5:
                    LOGGER.debug(mm)
                    method_match_cnt += 1
            LOGGER.debug('%d methods matched', method_match_cnt)

        if use_pkg_hierarchy:
            LOGGER.debug('Package matches:')
            package_match_cnt = 0
            for pm, package_var in package_matches_vars.iteritems():
                if package_var.x > 0.5:
                    LOGGER.debug(pm)
                    package_match_cnt += 1
            LOGGER.debug('%d packages matched', package_match_cnt)

        LOGGER.debug('Active packages:')
        # Active-package variables only exist when the flattened branch
        # actually ran (i.e. the package hierarchy was not used).
        if assume_flattened_package and not use_pkg_hierarchy:
            for pkg in flattened_app_pkgs_allowed:
                LOGGER.debug('%s: %s', pkg, app_pkg_active_vars[pkg].x)

        LOGGER.debug('Objective value: %0.4f', m.objval)

    return (m.objval, class_matches)
type=str, help='the folder that contains library profiles') return parser.parse_args() if __name__ == '__main__': args = parse_arguments() if args.v: LOGGER.setLevel('DEBUG') else: LOGGER.setLevel('INFO') LOGGER.debug("args: %s", args) if args.subparser_name == 'profile': profile_binaries(base_path=args.d, file_paths=args.f, output_folder=args.o, processes=args.p, overwrite=args.w) else: search_libs_in_apps(lib_folder=args.ld, lib_profiles=args.lf, app_folder=args.ad, app_profiles=args.af, mode=MODE.ACCURATE if args.A else MODE.SCALABLE, overwrite=args.w, output_folder=args.o,
def _get_package_classes_within_call_graph(self, matched_classes_pairs,
                                           lib_name):
    """Collect the app classes that belong with the matched classes.

    First gathers every class that shares a package with a matched app
    class. In ``MODE.ACCURATE`` the result is narrowed to the connected
    components (over the call, interface and superclass graphs) that contain
    at least one matched class, after removing "ghost" classes, i.e. app
    neighbours that correspond to edges of the library's ghost graph.

    Args:
        matched_classes_pairs: Matched (lib_class, app_class) pairs.
        lib_name (str): Library name, used to look up the ghost graph in
            ``self.LIB_RELATIONSHIP_GRAPHS``.

    Returns:
        tuple: (matched app classes, related classes). The second element is
        the set of package classes outside ``MODE.ACCURATE``, and the
        in-graph classes plus the matched classes inside it.
    """
    matched_app_classes = set(pair[1] for pair in matched_classes_pairs)

    package_classes = set()
    for class_name in matched_app_classes:
        package_name = os.path.dirname(class_name)
        if package_name:
            package_classes.update(self._package_classes[package_name])
        else:
            # package could be '' if the root package is /
            # add() rather than update(): update() on a string would
            # insert its individual characters instead of the class name.
            package_classes.add(class_name)

    if self.mode != MODE.ACCURATE:
        return matched_app_classes, package_classes

    # Index in this list is the relation type stored on ghost edges:
    # 0 = call graph, then interface graph, then superclass graph.
    graphs = [
        self._call_graph.subgraph(package_classes),
        self._interface_graph.subgraph(package_classes),
        self._superclass_graph.subgraph(package_classes),
    ]
    USG = nx.compose_all(graphs).to_undirected()
    LOGGER.debug("Before removing ghost: %d", len(USG.nodes()))

    lib_ghost_graph = self.LIB_RELATIONSHIP_GRAPHS[lib_name][3]
    for (lib_class, app_class) in matched_classes_pairs:
        if lib_class not in lib_ghost_graph:
            continue

        ghost_relations = lib_ghost_graph.out_edges(lib_class, data=True)
        for _, ghost_lib_class, info in ghost_relations:
            relation_type = info["type"]
            relation_graph = graphs[relation_type]
            if app_class not in relation_graph:
                continue

            ghost_app_classes = set(
                relation_graph.neighbors(app_class)) - matched_app_classes
            if not self.consider_classes_repackaging:
                # Keep only neighbours at the same relative package depth
                # as the ghost lib class has to the lib class.
                lib_depth_delta = (ghost_lib_class.count("/") -
                                   lib_class.count("/"))
                ghost_app_classes = set(
                    c for c in ghost_app_classes
                    if c.count("/") - app_class.count("/") ==
                    lib_depth_delta)
            if not ghost_app_classes:
                continue

            if relation_type == 0:  # Call graph
                lib_call_descriptors = set(info["method"])
                for ghost_app_class in ghost_app_classes:
                    app_call_descriptors = set(
                        call[:2] for call in
                        relation_graph[app_class][ghost_app_class]["method"])
                    if (ghost_app_class in USG and
                            app_call_descriptors <= lib_call_descriptors):
                        LOGGER.debug(
                            "Ghost app class found: [%d] %s, %s, %s, %s",
                            0, lib_class, app_class, ghost_lib_class,
                            ghost_app_class)
                        USG.remove_node(ghost_app_class)
            else:  # Inheritance/Interface graph
                LOGGER.debug(
                    "Ghost app classes found: [%d] %s, %s, %s, %s",
                    relation_type, lib_class, app_class, ghost_lib_class,
                    ghost_app_classes)
                USG.remove_nodes_from(ghost_app_classes)

    LOGGER.debug("After removing ghost: %d", len(USG.nodes()))

    # If classes repackaging is considered, it is very possible to mismatch
    # other classes inside the package. A non-zero threshold here would
    # discard components with too few matched classes; it is disabled (0).
    threshold = 0
    ingraph_classes = set()
    # Only node sets are needed, so iterate components directly instead of
    # building a subgraph per component.
    for component in nx.connected_components(USG):
        nodes = set(component)
        matched_nodes = nodes & matched_app_classes
        if len(matched_nodes) > len(nodes) * threshold:
            ingraph_classes.update(nodes)
        else:
            matched_app_classes -= matched_nodes

    # Some matched_app_classes may not exist in call graph
    ingraph_classes.update(matched_app_classes)
    LOGGER.debug("matched_app_classes (after): %d", len(matched_app_classes))
    return matched_app_classes, ingraph_classes