def bucket_create(self, key, val, metadata_dict = None):
    '''
    Create an object in the bucket, but only if not yet present (save traffic).

    Parameters
    ----------
    key : str
    val : file-like object
    metadata_dict : dict

    Returns
    -------
    Key
    '''
    if metadata_dict is None:
        metadata_dict = {}

    s3_key = Key(self.apk_bucket)
    s3_key.key = key

    # important: set metadata before actual upload
    s3_key.metadata = metadata_dict
    s3_key.content_type = 'application/vnd.android.package-archive'

    # upload
    log.debug("uploading %s", s3_key.key)
    s3_key.set_contents_from_file(val, replace = False)

    return s3_key
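# Hedged usage sketch (added for illustration, not part of the original code base):
# shows how `bucket_create` above might be called. It assumes a storage object exposing
# `bucket_create` with an already opened boto `apk_bucket`; the key, file name and
# metadata are hypothetical.
def _example_bucket_create_usage(storage):
    ''' Upload a local .apk once; the transfer is skipped if the key already exists. '''
    with open("example.apk", "rb") as apk_file:
        s3_key = storage.bucket_create("com.example.app/example.apk", apk_file,
                                       metadata_dict = {"package_name": "com.example.app"})
    return s3_key.key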
def get_hash(self):
    '''
    Get the sha256 message digest of the file and store it.

    Returns
    -------
    str
        sha256 message digest as hexstring
    None
        If path is None

    Raises
    ------
    OSError
        If the file could not be opened
    '''
    if self._get_hash() is None:
        if self.path is None:
            # cannot calculate message digest from file
            return None
        else:
            with open(self.path, "rb") as apkf:
                self.hash = sha256(apkf.read()).hexdigest()
                log.debug("Calculated hash for %s by reading file %s", self, self.path)
    return self._get_hash()
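# Hedged usage sketch (illustration only): `get_hash` above is assumed to be a method
# of an Apk-like model object that carries an optional `path` attribute.
def _example_get_hash_usage(apk):
    ''' Return the cached or freshly computed sha256 hexdigest, or None if no path is set. '''
    digest = apk.get_hash()
    if digest is None:
        log.debug("no file path available for %s, cannot compute hash", apk)
    return digest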
def delete_results(self, where = None, non_document = False, **kwargs):
    ''' See doc of :py:meth:`.ResultStorageInterface.delete_results` '''
    coll = self.__get_collection(gridfs_obj = non_document)

    if where is None:
        where = {}

    where.update(self.create_where_clause(kwargs, from_gridfs = non_document))

    n = 0
    try:
        # do the query
        log.debug("mongodb remove(%s)", where)

        # gridfs
        if non_document:
            # get ids and delete
            for _id in self.get_ids(where = where, non_document = non_document):
                coll.delete(_id)
                log.debug("Deleted element with id: %s from mongodb gridfs!", _id)
                n += 1
        # normal collection
        else:
            write_result = coll.remove(where, getLastError=True)
            if write_result is not None:
                n = write_result["n"]
        return n
    except PyMongoError as e:
        log.exception(DatabaseDeleteException(self, where, e))
        return n
def __init__(self, config_filename, import_db = None):
    '''
    Parameters
    ----------
    config_filename : str, optional (default is `settings.CONFIG_PATH`)
        The path to the config to load.
    import_db : str, optional (default is read from config file)
        Path to the import db.
    '''
    # type: Settings
    if config_filename is None:
        config_filename = settings.CONFIG_PATH

    # create settings variable
    self.__settings = Settings(config_filename, default_path = settings.DEFAULTS_PATH)
    log.debug("config file settings: %s\n\tCLI options may overwrite them!", self.__settings)

    # load and set androguard path from configs
    Util.set_androguard_path(self.settings)

    # type: str
    import_db = self._get_import_db(import_db = import_db) #self.args.import_database
    log.info("Using import database: %s", import_db)

    # load a few other settings
    self.__storage = self._create_storage(import_db)
def __init__(self, *args, **kwargs):
    ''' A task will be initialized for every process, but not for every task! '''
    Task.__init__(self, *args, **kwargs)

    self.__result_database_storage = None
    self.__apk_storage = None
    self.__script_hashes = None
    self.__androscripts = None

    # register signal to prefetch apks
    task_prerun.connect(self.prefetch_apk)

    log.debug("%s init", self)
def store_result_for_apk(self, apk, script):
    '''
    See doc of :py:meth:`.ResultWritingInterface.store_result_for_apk`.

    Returns
    -------
    tuple<str, bool>
        First component is the id of the entry,
        the second a boolean indicating whether the result has been stored in gridfs.
    None
        If an error occurred.
    '''
    try:
        # escape keys for mongodb insert
        res_obj_dict = escape_keys(script.result_dict(gen_id = False))

        _id = script.gen_unique_id()
        # if data is too big or custom result object used -> store with gridfs
        if script.uses_custom_result_object() or script.is_big_res():
            log.debug("storing results for %s, %s in %s (id: %s)", apk.short_description(), script, self.grid_fs, _id)

            result = self.get_custom_res_obj_representation(script)

            gridfs = self.grid_fs
            # gridfs doesn't have an update method -> delete and insert
            if gridfs.exists(**{RESOBJ_ID : _id}):
                # delete by _id
                gridfs.delete(_id)

            # store file together with metadata from `ResultObject`
            gridfs.put(result, metadata = res_obj_dict, filename = script.get_file_name(), _id = _id)

            # return id
            return _id, True
        # normal json data
        else:
            log.debug("storing results for %s, %s in %s db(id: %s)", apk.short_description(), script, self.res_coll, _id)

            # set id so we don't have multiple results for same script and apk
            res_obj_dict[RESOBJ_ID] = _id

            # update or insert if not existing
            self.res_coll.update({RESOBJ_ID : _id}, res_obj_dict, upsert = True)

            # return id
            return _id, False
    except (PyMongoError, BSONError) as e:
        raise DatabaseStoreException(self, "script: %s" % script, caused_by = e), None, sys.exc_info()[2]
def copy_apk(self, apk, file_like_obj, **kwargs):
    '''
    Copy the `apk` to the file system (path specified through `store_root_dir`).

    See also: :py:meth:`.ApkCopyInterface.copy_apk`.

    Parameters
    ----------
    apk : Apk
        Holds meta information needed to create the subdirectory names.
    file_like_obj
        A file-like object which holds the .apk data.

    Raises
    ------
    IOError
    FileSysCreateStorageStructureException

    Returns
    -------
    str
        The path where the apk file has been copied.
    '''
    apk_file_path = self.get_apk_import_file_name(apk)
    log.debug("copying %s to %s", apk.short_description(), apk_file_path)

    # create path for apk if not existing
    apk_import_path = None
    try:
        apk_import_path = self.get_apk_import_path(apk)
        if not exists(apk_import_path):
            makedirs(apk_import_path)
    except OSError as e:
        raise FileSysCreateStorageStructureException(apk_import_path, self, e), None, sys.exc_info()[2]

    # copy apk
    with open(apk_file_path, "wb") as apk_copy:
        file_like_obj.seek(0)
        apk_copy.write(file_like_obj.read())

    return apk_file_path
def callback(task_id, result_dict):
    '''
    Parameters
    ----------
    task_id : str
        UUID of task.
    result_dict : dict
        Dictionary holding the meta infos about the task as well as the result.
        See `CeleryConstants.CELERY_RESULT_BACKEND_*` for some available keys.
    '''
    log.debug("Task %s finished", task_id)

    result = result_dict[CeleryConstants.CELERY_RESULT_BACKEND_KEY_RESULT]
    traceback = result_dict[CeleryConstants.CELERY_RESULT_BACKEND_KEY_TRACEBACK]
    state = result_dict[CeleryConstants.CELERY_RESULT_BACKEND_KEY_STATUS]

    task_failed = state in states.EXCEPTION_STATES

    # show exceptions
    if task_failed:
        # handle error
        if handle_error is not None:
            handle_error(task_id, result, state, traceback)
        # we need locking here because operation is not atomic
        with self.lock:
            self.analyze_stats_view.failed_tasks += 1
    else:
        if handle_success is not None:
            handle_success(task_id, result)
        # we need locking here because operation is not atomic
        with self.lock:
            self.analyze_stats_view.successful_tasks += 1
def store_result_for_apk(self, apk, script):
    '''
    Store the results in the file system.

    If a custom result object is used in `script` and it's not a `ResultObject`,
    str(custom res object) will be used for writing to disk.

    Parameters
    ----------
    apk : Apk
    script : AndroScript

    Raises
    ------
    FileSysStoreException

    Returns
    -------
    str
        Path to result file.
    '''
    try:
        res_filename = self.get_apk_res_filename(apk, script)
        with open(res_filename, "w") as f:
            log.debug("storing results for %s, %s to %s", apk.short_description(), script, res_filename)
            if not script.uses_custom_result_object():
                f.write(script.res.write_to_json())
            else:
                res = self.get_custom_res_obj_representation(script)

                # log json if custom res obj is a `ResultObject`
                if ScriptUtil.is_result_object(res):
                    res = res.write_to_json()

                f.write(res)
        return res_filename
    except IOError as e:
        raise FileSysStoreException(res_filename, str(apk), self, e)
def before_task_publish_action(self, *args, **kwargs):
    ''' Collect task ids before they get published '''
    task_id = kwargs["body"]["id"]
    log.debug("will publish task %s", task_id)
    self.task_collection.task_ids.append(task_id)
def _analyze(self):
    ''' See doc of :py:meth:`.BaseAnalyzer.analyze`. '''

    # try to get registered workers
    # if network fails at this point -> stop analysis
    try:
        clilog.info(CeleryUtil.get_workers_and_check_network())
    except NetworkError as e:
        log.critical(e)
        return 0

    # storage objects
    storage = self.storage

    clilog.info("Number of apks to analyze: %d", self._cnt_apks)

    try:
        # get analyze task
        analyze_task = tasks[CeleryConstants.get_analyze_task_name()]

        # create storage
        storage.create_or_open_sub_storages()

        # send tasks
        start = time()

        # apk generator over .apk or apk hashes
        apk_gen = AnalyzeUtil.apk_id_or_raw_data_gen(self.apks, force_raw_data = self.serialize_apks)

        clilog.info("Task publishing progress:")

        # send and serialize .apks
        # if analysis via path serialize them!
        if self.serialize_apks:
            log.info("sending .apks to message broker")
            self.group_result = group_result = GroupResult(results = [])

            for args in self.send_apk_args_generator(apk_gen):
                task = analyze_task.delay(*args)
                group_result.add(task)

        # send only apk id and let fetch via mongodb
        else:
            log.info("sending ids of apks")

            task_group = group((analyze_task.s(*args) for args in self.send_id_args_generator(apk_gen)))

            # publish tasks
            self.group_result = task_group()

        log.info("sending took %ss", (time() - start))
        sys.stderr.write("\nAnalysis progress:\n")

        # start showing analysis progress
        self.analyze_stats_view.start()

        # wait for results
        log.debug("joining on ResultGroup ... ")

        # setup callback
        callback_func = self.get_callback_func(self.success_handler, self.error_handler)
        CeleryUtil.join_native(self.group_result, propagate = False, callback = callback_func)

        clilog.info("\nanalysis done ... ")
        log.info("distributed analysis took %ss", (time() - start))

        return self.stop_analysis_view()
    except DatabaseOpenError as e:
        log.critical(e)
        return 0

    except (KeyboardInterrupt, Exception) as e:
        if not isinstance(e, KeyboardInterrupt):
            log.exception(e)
        log.warn("Interrupting distributed analysis ... Please wait a moment!")
        log.warn("revoking tasks on all workers ...")

        if celerysettings.CELERY_TASK_REVOCATION_ENABLED:
            # revoke tasks
            if self.group_result is None:
                # revoke via task ids
                log.debug("revoking while publishing tasks ...")
                self.task_collection.revoke_all(terminate = True, signal = 'SIGKILL')
            else:
                # revoke via GroupResult if yet available/created
                # first available after all tasks have been sent
                self.group_result.revoke(terminate = True, signal = 'SIGKILL')
            log.warn("revoked tasks and killed workers ...")

        # return number of analyzed apks
        return self.stop_analysis_view()
def _analyze(self):
    ''' See doc of :py:meth:`.BaseAnalyzer.analyze`. '''
    try:
        work_queue = self.work_queue

        # create worker pool
        log.debug("starting %s workers ...", self.concurrency)
        for _ in range(self.concurrency):
            p = Worker(self.script_list, self.script_hashes, self.min_script_needs,
                       work_queue, self.storage,
                       self.cnt_analyzed_apks, self.analyzed_apks, self.storage_results)
            self.workers.append(p)
            p.daemon = True

        # start workers
        for p in self.workers:
            p.start()

        # queue has size limit -> start workers first then enqueue items
        log.info("Loading apk paths into work queue ...")
        for apk_stuff in AnalyzeUtil.apk_gen(self.apks_or_paths):
            # task is apk with all scripts
            work_queue.put(apk_stuff)

        for _ in range(self.concurrency):
            # signal end-of-work
            work_queue.put(STOP_SENTINEL)

        # progress view for cli
        av = AnalysisStatsView(self.cnt_analyzed_apks, self._cnt_apks, self.analyzed_apks)
        av.daemon = True
        av.start()

        # block until workers finished
        work_queue.join()
        av.terminate()
        log.debug("joined on work queue ...")

        return self.cnt_analyzed_apks.value

    # try hot shutdown first
    except KeyboardInterrupt:
        log.warn("Hot shutdown ... ")
        try:
            log.warn("clearing work queue ... ")
            Util.clear_queue(work_queue)
            log.warn("cleared work queue ... ")

            for _ in range(self.concurrency):
                # signal end-of-work
                work_queue.put(STOP_SENTINEL)

            for worker in self.workers:
                worker.join()
            log.warn("waited for all workers ... ")

            return self.cnt_analyzed_apks.value

        # if the user really wants a cold shutdown -> kill processes
        except KeyboardInterrupt:
            log.warn("Cold shutdown ... ")
            log.warn("Hard shutdown wanted! Killing all workers!")

            # kill processes via SIGINT -> send CTRL-C
            for w in self.workers:
                try:
                    os.kill(w.pid, signal.SIGINT)
                except:
                    pass

            return self.cnt_analyzed_apks.value
def __recreate_collections(self, gridfs = False, res_collection = False):
    '''
    Drop and recreate collections.

    Parameters
    ----------
    gridfs : bool, optional (default is False)
        Recreate gridfs.
    res_collection : bool, optional (default is False)
        Recreate results collection.
    '''
    try:
        if gridfs:
            log.debug("dropping collection %s", GRIDFS_COLLS_PREFIX)

            log.debug("dropping collection %s", FILES_COLL_NAME)
            self.db.drop_collection(FILES_COLL_NAME)

            log.debug("dropping collection %s", CHUNKS_COLL_NAME)
            self.db.drop_collection(CHUNKS_COLL_NAME)

            log.debug("recreating collection %s", GRIDFS_COLLS_PREFIX)
            self._open_gridfs()
            self._create_idx_for_colls()
    except PyMongoError as e:
        log.critical(e)

    try:
        if res_collection:
            log.debug("dropping collection %s", RESULT_DOCUMENTS_COLLECTION_NAME)
            self.db.drop_collection(RESULT_DOCUMENTS_COLLECTION_NAME)

            self._open_res_coll()
            log.debug("recreating collection %s", RESULT_DOCUMENTS_COLLECTION_NAME)
    except PyMongoError as e:
        log.critical(e)
def __init__(self, db_name = None, dest_addr = None, dest_port = None,
             # auth
             username = None, passwd = None,
             # ssl
             use_ssl = False, ssl_ca_certs = None,
             ):
    '''
    Create (if not existing) and open the database and collections.

    Parameters
    ----------
    db_name : str, optional (default is "res")
        The name of the database to use.
        Will be created if not already existing.
    dest_addr : str, optional (default is '127.0.0.1')
        Address of mongodb database server.
    dest_port : int, optional (default is 27017)
        Port of mongodb database server.
    username : str, optional (default is None)
        No authentication at all.
    passwd : str, optional (default is None)
        No authentication at all.
    use_ssl : bool, optional (default is False)
        Use ssl for the connection.
    ssl_ca_certs : str, optional (default is None)
        The CA certificate.

    Raises
    ------
    DatabaseOpenError
    '''
    # db name not allowed
    if db_name == APK_DB_NAME:
        raise DatabaseOpenError(db_name, msg = 'Database name "%s" reserved for apk storage!' % db_name), None, sys.exc_info()[2]

    # set default values
    if db_name is None:
        db_name = 'res'
    if dest_addr is None:
        dest_addr = '127.0.0.1'
    if dest_port is None:
        dest_port = 27017

    try:
        self.__db_name = db_name
        self.__dest_addr = dest_addr
        self.__dest_port = dest_port
        self.__use_ssl = use_ssl

        # only pass ssl parameters if ssl enabled
        ssl_params = dict(ssl = use_ssl, ssl_cert_reqs = ssl.CERT_NONE) if use_ssl else {}

        # set to None because if the connection cannot be initiated, the conn variable would not be in scope
        self.conn = None
        self.__conn = conn = pymongo.MongoClient(host = dest_addr, port = dest_port, **ssl_params)

        # authentication is per database!
        # do auth before probable db creation etc.
        if None not in (username, passwd):
            # authenticate if credentials given
            log.debug("authenticating with mongodb ...")
            conn["admin"].authenticate(username, passwd)
        else:
            log.debug("not authenticating with mongodb ... no credentials supplied!")

        self.__db = conn[self.db_name]

        # apk db
        self.__apk_db = conn[APK_DB_NAME]

        self.__apk_coll = gridfs.GridFS(self.__apk_db, GRIDFS_COLLS_PREFIX)

        # create/open collections
        self.__res_coll = self._open_res_coll()
        self.__files_coll = self.__db[GRIDFS_COLLS_PREFIX][GRIDFS_FILES]

        # grid fs for binary files, supports files > 16 mb
        self.__grid_fs = self._open_gridfs()

        # create indexes
        self._create_idx_for_colls()

        log.info("Opened database: %s", self)
        log.debug("CA certificate: %s", ssl_ca_certs)
    except PyMongoError as e:
        raise DatabaseOpenError(str(self), caused_by = e), None, sys.exc_info()[2]
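# Hedged usage sketch (illustration only): opening the result storage whose constructor
# is shown above. The surrounding class is assumed to be ResultDatabaseStorage
# (cf. `fetch_results_from_mongodb` below); host, credentials and CA path are hypothetical.
def _example_open_result_storage():
    ''' Open a local default connection and an authenticated SSL connection. '''
    # defaults: database "res" on 127.0.0.1:27017, no auth, no ssl
    local_storage = ResultDatabaseStorage()
    # remote server with authentication and ssl
    remote_storage = ResultDatabaseStorage(db_name = "res",
                                           dest_addr = "10.0.0.5", dest_port = 27017,
                                           username = "androlyze", passwd = "secret",
                                           use_ssl = True, ssl_ca_certs = "/etc/ssl/mongodb-ca.pem")
    return local_storage, remote_storage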
def get_results(self,
                include_fields = None, exclude_fields = None,
                where = None, distinct_key = None,
                n = None, sort = True, latest = False,
                non_document = False, non_document_raw = False,
                remove_id_field = True,
                **kwargs):
    ''' See doc of :py:meth:`.ResultStorageInterface.get_results` '''

    if include_fields is not None and exclude_fields is not None:
        raise ValueError("include_fields and exclude_fields are mutually exclusive!")

    if include_fields is None:
        include_fields = []
    if exclude_fields is None:
        exclude_fields = []
    if where is None:
        where = {}

    # latest means enable sorting and only return one result
    if latest:
        sort = True
        n = 1

    # create projection dict
    fields = [(p, 0) for p in exclude_fields] + [(p, 1) for p in include_fields]
    if remove_id_field:
        # we don't want the id field
        fields += [(RESOBJ_ID, 0)]

    select = dict(fields)

    # no projection criteria given, disable!
    # because empty dict means only id
    if not select:
        select = None

    where.update(self.create_where_clause(kwargs, from_gridfs = non_document))

    try:
        res_cursor = None

        # get appropriate collection
        coll = self.__get_collection(gridfs_files_coll = non_document and not non_document_raw,
                                     gridfs_obj = non_document and non_document_raw)

        # pymongo 3.0 removed the as_class option in the collection.find method
        # this is the fix
        find_kwargs = {}
        if int(pymongo.version[0]) < 3:
            find_kwargs['as_class'] = OrderedDict

        # grid fs
        if non_document:
            if non_document_raw:
                log.debug("mongodb query: find(%s) on gridfs", where)
                res_cursor = coll.find(where)
            else:
                # using the gridfs files collection directly enables projection on attributes
                log.debug("mongodb query: find(%s, %s) ", where, select)
                res_cursor = coll.find(where, select, **find_kwargs)
        # normal collection
        else:
            res_cursor = coll.find(where, select, **find_kwargs)
            log.debug("mongodb query: find(%s, %s) ", where, select)

        # enable sorting if wanted
        if sort:
            # construct sorting criteria structure, structure is different if using gridfs
            sort_crit = [(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs = non_document), -1)]
            res_cursor = res_cursor.sort(sort_crit)

        # limit results if wanted
        if n is not None:
            res_cursor = res_cursor.limit(n)

        # generator that abstracts if normal collection or is gridfs
        if non_document:
            if non_document_raw:
                return res_cursor

        if distinct_key is not None:
            res_cursor = res_cursor.distinct(distinct_key)

        return res_cursor
    except PyMongoError as e:
        raise DatabaseLoadException(self, "find(%s, %s)" % (where, select), caused_by = e), None, sys.exc_info()[2]
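# Hedged usage sketch (illustration only): a few calls against `get_results` above,
# using only the documented keyword arguments. The storage object is assumed to be
# an instance of the result storage class defined in this file.
def _example_get_results_usage(storage):
    ''' Query document results and gridfs-backed results. '''
    # latest document result (sorting enabled, limited to a single entry)
    latest = storage.get_results(latest = True)
    # up to 10 gridfs results, queried via the gridfs files collection
    gridfs_meta = storage.get_results(non_document = True, n = 10)
    # raw gridfs objects (the gridfs cursor is returned directly)
    gridfs_raw = storage.get_results(non_document = True, non_document_raw = True)
    return latest, gridfs_meta, gridfs_raw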
def __del__(self):
    ''' Close db connection '''
    if self.conn is not None:
        log.debug("Closing db connection ... ")
        self.conn.close()
def fetch_results_from_mongodb(self, rds, results, wait_for_db = True,
                               # progress
                               nice_progess = False, synced_entries = None, total_sync_entries = None):
    '''
    Fetch some results from the result database and write them to disk.

    If data cannot be loaded from db, try until it can be.

    Parameters
    ----------
    rds : ResultDatabaseStorage
        The database to query for the results.
    results : list< tuple<id, gridfs (bool)> >
        Define which results shall be fetched.
    wait_for_db : bool, optional (default is True)
        Wait until data could be fetched from db.
    nice_progess : bool, optional (default is False)
        If enabled, show a nice progress bar on the cli.
    synced_entries : multiprocessing.Value<int>, optional (default is None)
        If supplied, store the number of already synced entries.
    total_sync_entries : multiprocessing.Value<int>, optional (default is None)
        If supplied, store the number of total entries to sync.

    Raises
    ------
    DatabaseLoadException
        If `wait_for_db` is False and an error occurred.
    '''
    # retry in ... seconds
    DATABASE_RETRY_TIME = 5

    # if true assume both counts are shared memory (Value)
    use_shared_memory = synced_entries is not None and total_sync_entries is not None

    if results is not None:
        results_stored = False
        while not results_stored:
            try:
                # get ids
                non_gridfs_ids, gridfs_ids = MongoUtil.split_result_ids(results)

                # counts
                cnt_non_gridfs_ids = len(non_gridfs_ids)
                cnt_gridfs_ids = len(gridfs_ids)

                if use_shared_memory:
                    total_sync_entries.value = cnt_gridfs_ids + cnt_non_gridfs_ids

                # gridfs raw data as well as metadata
                gridfs_entries_raw = []
                if gridfs_ids:
                    gridfs_entries_raw = rds.get_results_for_ids(gridfs_ids, non_document = True, non_document_raw = True)

                # regular documents (non gridfs)
                non_gridfs_entries = []
                if non_gridfs_ids:
                    non_gridfs_entries = rds.get_results_for_ids(non_gridfs_ids, non_document = False, non_document_raw = True)

                if not nice_progess:
                    log.debug("fetching %d non-documents (gridfs) ... ", cnt_gridfs_ids)

                for i, gridfs_entry_raw in enumerate(gridfs_entries_raw, 1):

                    # get our stored metadata (for script and apk)
                    gridfs_entry_meta = gridfs_entry_raw.metadata

                    if not nice_progess:
                        log.debug("getting results for %s", gridfs_entry_meta[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                    else:
                        Util.print_dyn_progress(Util.format_progress(i, cnt_gridfs_ids))

                    # use apk to extract data from dict
                    fastapk = FastApk.load_from_result_dict(gridfs_entry_meta)

                    # get filename
                    file_name = gridfs_entry_raw.filename

                    # write results to disk
                    try:
                        self.store_custom_data(fastapk.package_name, fastapk.version_name, fastapk.hash, file_name, gridfs_entry_raw.read())
                    except FileSysStoreException as e:
                        log.exception(e)

                    # update shared memory progress indicator
                    if use_shared_memory:
                        with synced_entries.get_lock():
                            synced_entries.value += 1

                if not nice_progess:
                    log.debug("fetching %d documents (non-gridfs) ... ", cnt_non_gridfs_ids)

                for i, non_gridfs_entry in enumerate(non_gridfs_entries, 1):
                    if not nice_progess:
                        clilog.debug("getting results for %s" % non_gridfs_entry[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                    else:
                        Util.print_dyn_progress(Util.format_progress(i, cnt_non_gridfs_ids))

                    # write results to disk
                    self.store_result_dict(non_gridfs_entry)

                    # update shared memory progress indicator
                    if use_shared_memory:
                        with synced_entries.get_lock():
                            synced_entries.value += 1

                # everything fetched and stored -> stop the retry loop
                results_stored = True

            except (DatabaseLoadException, PyMongoError) as e:
                if not wait_for_db:
                    raise
                log.warn(e)
                Util.log_will_retry(DATABASE_RETRY_TIME, exc = e)
                sleep(DATABASE_RETRY_TIME)
def import_scripts(script_list, via_package = False, _reload = False, clazz_name = None):
    '''
    Import the scripts (via file path or package name - configurable via `via_package`).

    Parameters
    ----------
    script_list : list<str>
        List of script names (absolute path) or package names.
    via_package : bool, optional (default is False)
        If true, assume package names are given instead of file paths.
    _reload : bool, optional (default is False)
        Reload scripts and delete them from internal cache.
        Only possible if `via_package`.
    clazz_name : optional (default is None)
        The name of the class to import.
        If None, use the name of the module.

    Returns
    -------
    list<type<AndroScript>>
        List of uninstantiated AndroScript classes.

    Raises
    ------
    AnalyzeError
        If a NoAndroScriptSubclass, IOError or ModuleNotSameClassNameException has been raised.
    ImportError
    '''
    # late import -> prevent recursive import
    from androlyze.model.script.AndroScript import AndroScript
    from androlyze.analyze.exception import AnalyzeError

    androscripts = []

    # reload scripts if wanted
    if via_package and _reload:
        for script_package in script_list:
            log.debug("deleting %s from system modules", script_package)
            try:
                del sys.modules[script_package]
                log.debug("deleted")
            except KeyError:
                pass

    for script in script_list:
        class_name = clazz_name
        if not class_name:
            if via_package:
                class_name = script.split(".")[-1]
            else:
                class_name = basename(script.split(".py")[0])

        # class name must be equivalent to the module name!
        try:
            module_package = script
            # get package name from path and cut off file extension
            if not via_package:
                module_package = Util.path_2_package_name(script)

            module = importlib.import_module(module_package)
            clazz = getattr(module, class_name)

            # check if class is derived from AndroScript
            if isinstance(clazz, AndroScript.__class__):
                androscripts.append(clazz)
            else:
                raise NoAndroScriptSubclass(clazz), None, sys.exc_info()[2]

        except AttributeError as e:
            raise ModuleNotSameClassNameException(script, class_name), None, sys.exc_info()[2]
        except IOError as e:
            e.filename = script
            raise
        except (NoAndroScriptSubclass, ModuleNotSameClassNameException, IOError) as e:
            raise AnalyzeError(e), None, sys.exc_info()[2]

    return androscripts
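# Hedged usage sketch (illustration only): loading AndroScript subclasses with
# `import_scripts` above. The script path and package name are hypothetical.
def _example_import_scripts_usage():
    ''' Import scripts via absolute file path and via package name. '''
    # via file path: the module name must match the class name
    path_scripts = import_scripts(["/opt/androlyze/scripts/ApkInfo.py"])
    # via package name, forcing a reload and naming the class explicitly
    pkg_scripts = import_scripts(["myscripts.ApkInfo"], via_package = True,
                                 _reload = True, clazz_name = "ApkInfo")
    return path_scripts + pkg_scripts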
def run_action(self, cmd):
    ''' Run an action specified by `cmd` (see COMMAND_ prefixed variables). '''
    parser = self.parser
    args = self.args

    # check which command has been used
    if cmd is None:
        # no command specified through program name -> get it from argparser
        cmd = args.command

    if cmd in COMMANDS_ALL:
        hashes, package_names, tags = CLIUtil.get_filter_options_from_cli(args)
        yes = args.yes

        if cmd == COMMAND_QUERY:
            self.action_query(hashes, package_names, tags, yes)

        # dblyze -> do the analysis results evaluation
        elif cmd == COMMAND_EVAL:
            dblyze_scripts = ScriptUtil.import_scripts(args.scripts, clazz_name = "Eval")
            for dblyze_script in dblyze_scripts:
                dblyze_script().evaluate(self.storage)

        # sync from result db to file sys
        elif cmd == COMMAND_SYNC:
            total_entries = androlyze.action_sync_fs(self.storage, lambda _ : False)
            CLIUtil.cli_check_n_exec(androlyze.action_sync_fs,
                                     prompt_prefix = "Will download %d entries from result database!" % total_entries,
                                     circumvent_check = args.yes,
                                     args = (self.storage, lambda _ : True))

        else:
            # print welcome message
            clilog.info("Welcome to %s!\n" % PROJECT_NAME)

            # import command
            if cmd == COMMAND_IMPORT:
                apks_or_paths, _ = self.get_apks_or_paths_from_cli()
                tag = args.tag
                copy2disk, copy2db, update, concurrency = args.copy_disk, args.copy_db, args.update, args.concurrency
                if not update:
                    log.warn('''--update not supplied. No update of already present apks in database will be done!''')
                androlyze.action_import_apks(self.storage, apks_or_paths, copy2disk, copy2db, update, tag, concurrency = concurrency)

            # analyze command
            elif cmd == COMMAND_ANALYZE:
                # androguard path has to be set before
                from androlyze import action_analyze

                # sort apks ?
                get_apks_kwargs = {}
                no_sort_by_code_size = args.no_sort_code_size
                if not no_sort_by_code_size:
                    # sort apks by app code size for better scheduling
                    get_apks_kwargs = dict(order_by = TABLE_APK_IMPORT_KEY_SIZE_APP_CODE, ascending = False)
                apks_or_paths, _ = self.get_apks_or_paths_from_cli(**get_apks_kwargs)

                # debug infos
                if not no_sort_by_code_size and not args.apks:
                    apks_or_paths, _it = itertools.tee(apks_or_paths)
                    clilog.info('Using Code Size Scheduling for faster analysis!')
                    log.debug('\n'.join(('%s: %s' % (x.package_name, x.size_app_code) for x in _it)))

                scripts = args.scripts

                parallel_mode, concurrency, send_id = self.__load_parallel_settings()

                # get analysis mode
                analyze_mode = None
                if parallel_mode == PARALLELIZATION_MODE_DISTRIBUTED:
                    analyze_mode = ANALYZE_MODE_DISTRIBUTED
                elif parallel_mode == PARALLELIZATION_MODE_NON_PARALLEL:
                    analyze_mode = ANALYZE_MODE_NON_PARALLEL
                else:
                    analyze_mode = ANALYZE_MODE_PARALLEL

                action_analyze(self.storage, scripts, apks_or_paths,
                               mode = analyze_mode, concurrency = concurrency,
                               serialize_apks = not send_id)

            # delete command
            elif cmd == COMMAND_DELETE:
                self.action_delete(parser, hashes, package_names, tags, yes)

            clilog.info("done")
def action_import_apks(storage, apk_paths,
                       copy_apk = False, copy_to_mongodb = False,
                       update = False, tag = None,
                       # shared memory
                       cnt_imported_apks = None, total_apk_count = None, import_finished = None,
                       # concurrent settings
                       concurrency = None
                       ):
    '''
    Import the apks from the `apk_paths` and create the file system structure
    where the results will be kept, specified by `storage`.

    Parameters
    ----------
    storage : RedundantStorage
        The store to use.
    apk_paths : iterable<str>
        The apk files and/or directories.
    copy_apk : bool
        Import the apk file to the `import_dir` (copy it).
    copy_to_mongodb : bool, optional (default is False)
        Also import into MongoDB. Useful for the distributed analysis.
    update : bool
        Update apks that have already been imported.
    tag : str, optional (default is None)
        Some tag.
    cnt_imported_apks : multiprocessing.Value<int>, optional (default is None)
        If given, use for progress updating.
    total_apk_count : multiprocessing.Value<int>, optional (default is None)
        If given, use for total count of apks.
    import_finished : multiprocessing.Value<byte>, optional (default is None)
        If given, use to signal that the import has been completed.
    concurrency : int, optional (default is number of cpus)
        Number of processes to use for the import.
    '''
    from androlyze.loader.ApkImporter import ApkImporter

    # get single paths to apks so we get the correct total count of apks
    clilog.info("looking for apks in given paths ... ")
    apk_paths = ApkImporter.get_apks_from_list_or_dir(apk_paths)

    if total_apk_count is not None:
        # may be time consuming for recursive lookup
        apk_paths, total_apk_count.value = Util.count_iterable_n_clone(apk_paths)

    # create count if not given
    if cnt_imported_apks is None:
        cnt_imported_apks = Value('i', 0, lock = RLock())

    # set concurrency
    if concurrency is None:
        concurrency = cpu_count()
    log.warn("Using %d processes", concurrency)

    clilog.info("Storage dir is %s" % storage.fs_storage.store_root_dir)
    if copy_apk:
        clilog.info("Copying APKs to %s ..." % storage.fs_storage.store_root_dir)

    def import_apks(apk_paths):
        apk_importer = ApkImporter(apk_paths, storage)
        for apk in apk_importer.import_apks(copy_apk = copy_apk, copy_to_mongodb = copy_to_mongodb,
                                            update = update, tag = tag):
            clilog.info("imported %s", apk.short_description())

            # use shared memory counter if given
            if cnt_imported_apks is not None:
                with cnt_imported_apks.get_lock():
                    cnt_imported_apks.value += 1

    pool = []

    # don't convert generator to list if only 1 process wanted
    apk_paths = [apk_paths] if concurrency == 1 else Util.split_n_uniform_distri(list(apk_paths), concurrency)

    # start parallel import
    # multiprocessing's pool causes pickle errors
    for i in range(concurrency):
        p = Process(target = import_apks, args = (apk_paths[i], ))
        log.debug("starting process %s", p)
        pool.append(p)
        p.start()

    for it in pool:
        it.join()
        log.debug("joined on process %s", it)

    apks_imported = cnt_imported_apks.value != 0
    # show some message that no APK has been imported
    if not apks_imported:
        log.warn("No .apk file has been imported! This means no .apk file has been found or they already have been imported.")
    else:
        clilog.info("done")

    # because not all apks may be importable, we cannot rely on the count to signal that the import is done
    if import_finished is not None:
        import_finished.value = 1

    clilog.info("Imported %d apks", cnt_imported_apks.value)
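# Hedged usage sketch (illustration only): importing a directory of apks with
# `action_import_apks` above. The storage object, directory and tag are hypothetical.
def _example_action_import_apks_usage(storage):
    ''' Import every apk below a directory, copying to disk and mongodb with 4 processes. '''
    action_import_apks(storage, ["/data/apks/"],
                       copy_apk = True, copy_to_mongodb = True,
                       update = False, tag = "playstore",
                       concurrency = 4)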