def create_where_clause(self, kwargs, from_gridfs = False):
    ''' Create where clause from `kwargs`.

    Parameters
    ----------
    kwargs : dict
        Filter arguments (see Other Parameters).
    from_gridfs : bool, optional (default is False)
        Whether to build where clause for gridfs.

    Other Parameters
    ----------------
    package_name : str, optional (default is None)
    apk_hash : str, optional (default is None)
    version_name : str, optional (default is None)
    tag : str, optional (default is None)
    script_hash : str, optional (default is None)
    script_name : str, optional (default is None)
    script_version : str, optional (default is None)

    Notes
    -----
    If any of the other parameters is None it won't be used for filtering.

    Returns
    -------
    dict
        The assembled filter dict.
    '''
    # collect (key, value) filter pairs for both metadata groups,
    # then turn them into a single filter dict
    clauses = []
    clauses.extend(MongoUtil.build_apk_meta_where(kwargs, gridfs = from_gridfs))
    clauses.extend(MongoUtil.build_script_meta_where(kwargs, gridfs = from_gridfs))
    return dict(clauses)
def create_idx(coll):
    ''' Create indexes (descending) on a single collection.

    Indexes the apk meta fields (package name, build date) and the
    script meta fields (analysis date, script name) used for filtering
    and sorting.

    Parameters
    ----------
    coll : pymongo.collection.Collection
    '''
    # apk meta
    # FIX: these two previously used RESOBJ_SCRIPT_META as prefix, which
    # indexed non-existent "script meta.<apk field>" paths -- the package
    # name / build date live under the apk meta key (cf. result dict
    # access result[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME]
    # elsewhere in this project).
    coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_APK_META, RESOBJ_APK_META_PACKAGE_NAME, gridfs = False), -1)])
    coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_APK_META, RESOBJ_APK_META_BUILD_DATE, gridfs = False), -1)])
    # script meta
    coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs = False), -1)])
    coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_NAME, gridfs = False), -1)])
def action_query(self, hashes, package_names, tags, yes):
    ''' Query the database.

    Dispatches on the `query_dst` CLI argument: queries either the
    import db (forwarding `hashes`, `package_names`, `tags`) or the
    result db (filters built from the argparse namespace) and prints
    the results.

    Parameters
    ----------
    hashes : iterable<str>
        Apk hashes; forwarded to the import-db query.
        NOTE(review): not used in the result-db branch.
    package_names : iterable<str>
        Package names; forwarded to the import-db query.
    tags : iterable<str>
        Tags; forwarded to the import-db query.
    yes : bool
        If true, skip the interactive confirmation before dumping the
        whole results db.

    Raises
    ------
    CLIError
        If no filter arguments were supplied and --all was not given
        (result-db branch only).
    '''
    args = self.args
    parser = self.parser
    # check on which database to query
    # get from argparser
    query_dst = args.query_dst
    if query_dst == SUBCOMMAND_QUERY_IMPORT:
        clilog.info('\n'.join(androlyze.action_query_import_db(self.storage, args.query_import_cmd, hashes, package_names, tags)))
    elif query_dst == SUBCOMMAND_QUERY_RESULT:
        kwargs = CLIUtil.get_result_db_filter_args_from_argparser(args)
        # only set remove_id_field when --show-id was given; it then
        # becomes False (keep the _id field in the output)
        if args.show_id:
            kwargs["remove_id_field"] = not args.show_id
        distinct_key = None
        if args.distinct is not None:
            distinct_key = args.distinct
        # get distinct values for script name
        elif args.list_ran_scripts:
            distinct_key = MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_NAME, args.non_document)
        # check for "no filter supplied" BEFORE kwargs is extended with
        # the generic query options below
        no_args_supplied = len(kwargs) == 0 and not args.latest and not args.count and distinct_key is None
        whole_db = args.all
        raw = args.raw
        # update with basic result query options
        kwargs.update(CLIUtil.get_basic_result_query_options(args))
        kwargs.update(dict(include_fields=args.include_fields, exclude_fields=args.exclude_fields, non_document_raw=raw, distinct_key = distinct_key))
        if no_args_supplied and not whole_db:
            raise CLIError('Not enough arguments supplied!\nIf you want to dump the whole db, use the --all switch!', parser)
        # ask for confirmation unless a filter was supplied or --yes given
        res = cli_check_n_exec(
            androlyze.action_query_result_db,
            prompt_prefix='Will print whole results db!',
            circumvent_check=not no_args_supplied or yes,
            args=(self.storage, CLIUtil.get_checks_from_cli(args)),
            kwargs=kwargs)
        # log results
        print_query_result_db(res, distict_generator=distinct_key is not None, count=args.count, raw=raw, interactive = not args.not_interactive)
def action_query_result_db(storage, checks = None, **kwargs):
    ''' Get results from the database.

    Parameters
    ----------
    storage : ResultsStorageInterface
        The store to use.
    checks : dict, optional (default is None)
        Dictionary describing the checks to perform on some values.
        None or an empty dict means no checks.
        Will be passed to :py:method:`.MongoUtil.build_checks_filter` (as keyword arguments)

        checks_non_empty_list : iterable<str>, optional (default is ())
            Check the keys against a non empty list.
        checks_empty_list : iterable<str>, optional (default is ())
            Check the keys against an empty list.
        checks_true : iterable<str>, optional (default is ())
            Check if the values of the given keys are true.
        checks_false : iterable<str>, optional (default is ())
            Check if the values of the given keys are false.
        checks_not_null : iterable<str>, optional (default is ())
            Check if the values of the given keys are null (python None).
        checks_null : iterable<str>, optional (default is ())
            Check if the values of the given keys are not null (python None).
        conjunction : str, optional (default is 'or')
            Choose between 'or' and 'and'.
            Specifies how to to link the checks together.

    Other Parameters
    ----------------
    include_fields : list<str>, optional (default is [])
        List of fields to include in the result.
        Mutually exclusive with `exclude_fields`.
    exclude_fields : list<str>, optional (default is [])
        List of fields to exclude from the result.
        Mutually exclusive with `include_fields`.
    where : dict, optional (default is {})
        A filter.
    remove_id_field : bool, optional (default is True)
        Will remove the `_id` field by default.
    distinct_key : str, optional (default is None)
        If given, list the distinct values for the `distinct_key`.
    list_ran_scripts : bool, optional (default is False)
        List all scripts that have been run on the given selection.
        Normally you want to supply the `package_name`.
        Overrides `distinct_key`.
    sort : bool, optional (default is True)
        If true sort by analysis date.
    latest : bool, optional (default is False)
        Get the result of the latest script run.
        Will only return one result.
    n : int, optional (default is None)
        Number of results to return.
        None means no limit.
    non_document : bool, optional (default is False)
        Get custom data from mongodb's gridfs.
    non_document_raw : bool, optional (default is False)
        Get the raw data from the database. Otherwise meta infos will be returned.
        Only interesting if `non_document`.
    package_name : str, optional (default is None)
    apk_hash : str, optional (default is None)
    version_name : str, optional (default is None)
    tag : str, optional (default is None)
    script_hash : str, optional (default is None)
    script_name : str, optional (default is None)
    script_version : str, optional (default is None)

    Notes
    -----
    If any of the other parameters is None it won't be used for filtering.

    Returns
    -------
    gridfs.grid_file.GridOutCursor
        If non_document and non_document_raw.
    pymongo.cursor.Cursor
        Otherwise

    Raises
    ------
    DatabaseLoadException

    Examples
    --------
    >>> import androlyzelab
    ... from androlyze.storage.resultdb.ResultDatabaseStorage import ResultDatabaseStorage
    ... from androlyze.model.script.ScriptUtil import dict2json
    ... storage = ResultDatabaseStorage('127.0.0.1', 27017)
    ... res = androlyze.action_query_result_db(storage, n = 2, script_name = "ChainedApkInfos", include_fields = ["apkinfo.components.activities"])
    ... for r in res:
    ...     # get dict
    ...     # print r
    ...     # get json
    ...     print dict2json(r)
    {
        "apkinfo": {
            "components": {
                "activities": {
                    "all": [
                        "cn.wps.impress.test.selfvalidate.lmj.TestServiceActivity",
    ...
    '''
    # build check filter dict if some checks are given which shall be done on some attributes
    if checks:
        checks = MongoUtil.build_checks_filter(**checks)
    else:
        # FIX: the default used to be a mutable `{}` argument; when no
        # `where` was supplied, that shared dict was installed as the
        # where clause and later mutated by get_results, polluting
        # subsequent calls. Always build a fresh dict instead.
        checks = {}

    # merge the checks into the user-supplied where clause (or install one)
    if 'where' in kwargs and kwargs['where'] is not None:
        kwargs['where'].update(checks)
    else:
        kwargs['where'] = checks

    non_document = kwargs.get("non_document", False)

    # list_ran_scripts overrides distinct_key with the script-name attribute
    if kwargs.get("list_ran_scripts", False):
        kwargs['distinct_key'] = MongoUtil.get_attr_str(StaticResultKeys.RESOBJ_SCRIPT_META,
                                                        StaticResultKeys.RESOBJ_SCRIPT_META_NAME,
                                                        non_document)

    return storage.get_results(**kwargs)
def get_results(self, include_fields = None, exclude_fields = None, where = None, distinct_key = None,
                n = None, sort = True, latest = False, non_document = False, non_document_raw = False,
                remove_id_field = True, **kwargs):
    ''' See doc of :py:meth:`.ResultStorageInterface.get_results`

    Builds projection and filter dicts, picks the proper collection
    (normal vs. gridfs), runs the find and applies sort/limit/distinct.
    NOTE(review): the passed-in `where` dict is mutated in place by the
    update() below -- callers sharing a dict should pass a copy.
    '''
    # projection is either include OR exclude, never both
    if include_fields is not None and exclude_fields is not None:
        raise ValueError("include_fields and exclude_fields are mutually exclusive!")
    if include_fields is None:
        include_fields = []
    if exclude_fields is None:
        exclude_fields = []
    if where is None:
        where = {}
    # latest means enable sorting and only return one result
    if latest:
        sort = True
        n = 1
    # create projection dict (0 = exclude, 1 = include)
    fields = [(p, 0) for p in exclude_fields] + [(p, 1) for p in include_fields]
    if remove_id_field:
        # we don't want the id field
        fields += [(RESOBJ_ID, 0)]
    select = dict(fields)
    # no projection criteria given, disable!
    # because empty dict means only id
    if not select:
        select = None
    # remaining kwargs are the meta filter arguments (package_name, ...)
    where.update(self.create_where_clause(kwargs, from_gridfs = non_document))
    try:
        res_cursor = None
        # get appropriate collection
        coll = self.__get_collection(gridfs_files_coll = non_document and not non_document_raw,
                                     gridfs_obj = non_document and non_document_raw)
        # pymongo 3.0 removed the as_class option in the collection.find method
        # this is the fix
        find_kwargs = {}
        if int(pymongo.version[0]) < 3:
            find_kwargs['as_class'] = OrderedDict
        # grid fs
        if non_document:
            if non_document_raw:
                # raw gridfs access: no projection possible
                log.debug("mongodb query: find(%s) on gridfs", where)
                res_cursor = coll.find(where)
            else:
                # using the gridfs files collection directly enables us to project on attributes
                log.debug("mongodb query: find(%s, %s) ", where, select)
                res_cursor = coll.find(where, select, **find_kwargs)
        # normal collection
        else:
            res_cursor = coll.find(where, select, **find_kwargs)
            log.debug("mongodb query: find(%s, %s) ", where, select)
        # enable sorting if wanted
        if sort:
            # construct sorting criteria structure, structure is different if using gridfs
            sort_crit = [(
                MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs=non_document)
                , -1)]
            res_cursor = res_cursor.sort(sort_crit)
        # limit results if wanted
        if n is not None:
            res_cursor = res_cursor.limit(n)
        # generator that abstracts if normal collection or is gridfs
        if non_document:
            if non_document_raw:
                # raw gridfs cursor is returned as-is (no distinct support)
                return res_cursor
        if distinct_key is not None:
            res_cursor = res_cursor.distinct(distinct_key)
        return res_cursor
    except PyMongoError as e:
        # python 2 three-arg raise: re-raise with original traceback
        raise DatabaseLoadException(self, "find(%s, %s)", where, select, caused_by = e), None, sys.exc_info()[2]
def fetch_results_from_mongodb(self, rds, results, wait_for_db = True,
                               # progress
                               nice_progess = False,
                               synced_entries = None,
                               total_sync_entries = None):
    ''' Fetch some results from the result database and write them to disk.

    If data cannot be loaded from db, try until it can be.

    Parameters
    ----------
    rds : ResultDatabaseStorage
        The database to query for the results.
    results : list< tuple<id, gridfs (bool)> >
        Define which results shall be fetched.
        If None, nothing is done.
    wait_for_db : bool, optional (default is True)
        Wait until data could be fetched from db.
    nice_progess : bool, optional (default is False)
        If enabled show some nice progress bar on the cli.
    synced_entries : multiprocessing.Value<int>, optional (default is None)
        If supplied store number of already synced entries.
    total_sync_entries : multiprocessing.Value<int>, optional (default is None)
        If supplied store number of total entries to sync.

    Raises
    ------
    DatabaseLoadException
        If `wait_for_db` is False and an error occurred.
    '''
    # retry in ... seconds
    DATABASE_RETRY_TIME = 5

    # progress is only reported if BOTH shared counters were supplied
    use_shared_memory = synced_entries is not None and total_sync_entries is not None

    if results is not None:
        results_stored = False
        while not results_stored:
            try:
                # split ids into gridfs (custom data) and regular document ids
                non_gridfs_ids, gridfs_ids = MongoUtil.split_result_ids(results)

                # counts
                cnt_non_gridfs_ids = len(non_gridfs_ids)
                cnt_gridfs_ids = len(gridfs_ids)

                if use_shared_memory:
                    total_sync_entries.value = cnt_gridfs_ids + cnt_non_gridfs_ids

                # gridfs raw data as well as metadata
                gridfs_entries_raw = []
                if gridfs_ids:
                    gridfs_entries_raw = rds.get_results_for_ids(gridfs_ids, non_document = True, non_document_raw = True)

                # regular documents (non gridfs)
                non_gridfs_entries = []
                if non_gridfs_ids:
                    non_gridfs_entries = rds.get_results_for_ids(non_gridfs_ids, non_document = False, non_document_raw = True)

                if not nice_progess:
                    log.debug("fetching %d non-documents (gridfs) ... ", cnt_gridfs_ids)

                for i, gridfs_entry_raw in enumerate(gridfs_entries_raw, 1):
                    # get our stored metadata (for script and apk)
                    gridfs_entry_meta = gridfs_entry_raw.metadata

                    if not nice_progess:
                        log.debug("getting results for %s", gridfs_entry_meta[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                    else:
                        Util.print_dyn_progress(Util.format_progress(i, cnt_gridfs_ids))

                    # use apk to extract data from dict
                    fastapk = FastApk.load_from_result_dict(gridfs_entry_meta)

                    # get filename
                    file_name = gridfs_entry_raw.filename

                    # write results to disk
                    try:
                        self.store_custom_data(fastapk.package_name, fastapk.version_name, fastapk.hash, file_name, gridfs_entry_raw.read())
                    except FileSysStoreException as e:
                        # best effort: log and continue with the next entry
                        log.exception(e)

                    # update shared memory progress indicator
                    if use_shared_memory:
                        with synced_entries.get_lock():
                            synced_entries.value += 1

                if not nice_progess:
                    log.debug("fetching %d documents (non-gridfs) ... ", cnt_non_gridfs_ids)

                for i, non_gridfs_entry in enumerate(non_gridfs_entries, 1):
                    if not nice_progess:
                        clilog.debug("getting results for %s" % non_gridfs_entry[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                    else:
                        Util.print_dyn_progress(Util.format_progress(i, cnt_non_gridfs_ids))

                    # write results to disk
                    self.store_result_dict(non_gridfs_entry)

                    # update shared memory progress indicator
                    if use_shared_memory:
                        with synced_entries.get_lock():
                            synced_entries.value += 1

                # everything stored -> leave the retry loop
                # FIX: was `True or not wait_for_db`, whose second operand
                # is dead code (the expression always evaluates to True)
                results_stored = True

            except (DatabaseLoadException, PyMongoError) as e:
                # caller doesn't want to wait for the db -> propagate
                if not wait_for_db:
                    raise
                log.warn(e)
                Util.log_will_retry(DATABASE_RETRY_TIME, exc = e)
                sleep(DATABASE_RETRY_TIME)