def create_where_clause(self, kwargs, from_gridfs = False):
        '''
        Build a mongodb filter dict from the given `kwargs`.

        Parameters
        ----------
        kwargs : dict
            Filter arguments (see other parameters) to translate
            into where conditions.
        from_gridfs : bool, optional (default is False)
            If True, build the where clause against the gridfs key layout.

        Other Parameters
        ----------------
        package_name : str, optional (default is None)
        apk_hash : str, optional (default is None)
        version_name : str, optional (default is None)
        tag : str, optional (default is None)

        script_hash : str, optional (default is None)
        script_name : str, optional (default is None)
        script_version : str, optional (default is None)

        Notes
        -----
        Any of the other parameters that is None is not used for filtering.

        Returns
        -------
        dict
        '''
        # collect the (key, value) conditions from both meta builders,
        # then fold them into a single filter dict
        conditions = []
        conditions.extend(MongoUtil.build_apk_meta_where(kwargs, gridfs = from_gridfs))
        conditions.extend(MongoUtil.build_script_meta_where(kwargs, gridfs = from_gridfs))
        return dict(conditions)
 def create_idx(coll):
     '''
     Create the query indexes on a single collection.

     Descending indexes are created for the apk meta fields
     (package name, build date) and the script meta fields
     (analysis date, script name).

     Parameters
     ----------
     coll : pymongo.collection.Collection
         The collection to create the indexes on.
     '''
     # `ensure_index` was deprecated in PyMongo 3.0; `create_index` is
     # available in both PyMongo 2.x and 3.x and index creation is
     # idempotent on the server side anyway.
     # NOTE(review): the "apk meta" attribute strings below are built with
     # the RESOBJ_SCRIPT_META prefix -- verify this matches the result
     # document layout.
     # apk meta
     coll.create_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_APK_META_PACKAGE_NAME, gridfs = False), -1)])
     coll.create_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_APK_META_BUILD_DATE, gridfs = False), -1)])
     # script meta
     coll.create_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs = False), -1)])
     coll.create_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_NAME, gridfs = False), -1)])
Exemple #3
0
    def action_query(self, hashes, package_names, tags, yes):
        '''
        Query the database chosen via `args.query_dst`
        (import db or result db).

        Parameters
        ----------
        hashes : iterable<str>
            Apk hashes, forwarded to the import db query.
        package_names : iterable<str>
            Package names, forwarded to the import db query.
        tags : iterable<str>
            Tags, forwarded to the import db query.
        yes : bool
            If True, skip the confirmation prompt before printing
            the whole results db.

        Raises
        ------
        CLIError
            If a result db query is requested without any filter
            arguments and without the --all switch.
        '''
        args = self.args
        parser = self.parser

        # check on which database to query
        # get from argparser
        query_dst = args.query_dst
        if query_dst == SUBCOMMAND_QUERY_IMPORT:
            clilog.info('\n'.join(androlyze.action_query_import_db(self.storage, args.query_import_cmd, hashes, package_names, tags)))
        elif query_dst == SUBCOMMAND_QUERY_RESULT:
            kwargs = CLIUtil.get_result_db_filter_args_from_argparser(args)
            if args.show_id:
                # only set when --show-id was given: leaving kwargs untouched
                # otherwise keeps the `no_args_supplied` length check below
                # meaningful
                kwargs["remove_id_field"] = not args.show_id

            distinct_key = None
            if args.distinct is not None:
                distinct_key = args.distinct
            # get distinct values for script name
            elif args.list_ran_scripts:
                distinct_key = MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_NAME, args.non_document)

            # "no arguments" = no filters, no --latest, no --count, no distinct
            no_args_supplied = len(kwargs) == 0 and not args.latest and not args.count and distinct_key is None
            whole_db = args.all
            raw = args.raw

            # update with basic result query options
            kwargs.update(CLIUtil.get_basic_result_query_options(args))

            kwargs.update(dict(include_fields=args.include_fields, exclude_fields=args.exclude_fields, non_document_raw=raw, distinct_key = distinct_key))

            if no_args_supplied and not whole_db:
                raise CLIError('Not enough arguments supplied!\nIf you want to dump the whole db, use the --all switch!', parser)

            # confirmation check is skipped if any filter was supplied or
            # the caller passed `yes`
            res = cli_check_n_exec(
                androlyze.action_query_result_db,
                prompt_prefix='Will print whole results db!',
                circumvent_check=not no_args_supplied or yes,
                args=(self.storage, CLIUtil.get_checks_from_cli(args)),
                kwargs=kwargs)

            # log results
            print_query_result_db(res, distict_generator=distinct_key is not None, count=args.count, raw=raw, interactive = not args.not_interactive)
Exemple #4
0
def action_query_result_db(storage, checks = None, **kwargs):
    '''
    Get results from the database.

    Parameters
    ----------
    storage : ResultsStorageInterface
        The store to use.
    checks : dict, optional (default is None)
        Dictionary describing the checks to perform on some values.
        Will be passed to :py:method:`.MongoUtil.build_checks_filter` (as keyword arguments)
        checks_non_empty_list : iterable<str>, optional (default is ())
            Check the keys against a non empty list.
        checks_empty_list : iterable<str>, optional (default is ())
            Check the keys against an empty list.
        checks_true : iterable<str>, optional (default is ())
            Check if the values of the given keys are true.
        checks_false : iterable<str>, optional (default is ())
            Check if the values of the given keys are false.
        checks_not_null : iterable<str>, optional (default is ())
            Check if the values of the given keys are not null (python None).
        checks_null : iterable<str>, optional (default is ())
            Check if the values of the given keys are null (python None).
        conjunction : str, optional (default is 'or')
            Choose between 'or' and 'and'.
            Specifies how to to link the checks together.

    Other Parameters
    ----------------
    include_fields : list<str>, optional (default is [])
        List of fields to include in the result.
        Mutually exclusive with `exclude_fields`.
    exclude_fields : list<str>, optional (default is [])
        List of fields to exclude from the result.
        Mutually exclusive with `include_fields`.

    where : dict, optional (default is {})
        A filter.
    remove_id_field : bool, optional (default is True)
        Will remove the `_id` field by default.

    distinct_key : str, optional (default is None)
        If given, list the distinct values for the `distinct_key`.
    list_ran_scripts: bool, optional (default is False)
        List all scripts that have been run on the given selection.
        Normally you want to supply the `package_name`.
        Overrides `distinct_key`.

    sort : bool, optional (default is True)
        If true sort by analysis date.
    latest : bool, optional (default is False)
        Get the result of the latest script run.
        Will only return one result.
    n : int, optional (default is None)
        Number of results to return.
        None means no limit.

    non_document : bool, optional (default is False)
        Get custom data from mongodb's gridfs.
    non_document_raw : bool, optional (default is False)
        Get the raw data from the database. Otherwise meta infos will be returned.
        Only interesting if `non_document`.

    package_name : str, optional (default is None)
    apk_hash : str, optional (default is None)
    version_name : str, optional (default is None)
    tag : str, optional (default is None)

    script_hash : str, optional (default is None)
    script_name : str, optional (default is None)
    script_version : str, optional (default is None)

    Notes
    -----
    If any of the other parameters is None it won't be used for filtering.

    Returns
    -------
    gridfs.grid_file.GridOutCursor
        If non_document and non_document_raw.
    pymongo.cursor.Cursor
        Otherwise

    Raises
    ------
    DatabaseLoadException

    Examples
    --------
    >>> import androlyzelab
    ... from androlyze.storage.resultdb.ResultDatabaseStorage import ResultDatabaseStorage
    ... from androlyze.model.script.ScriptUtil import dict2json
    ... storage = ResultDatabaseStorage('127.0.0.1', 27017)
    ... res = androlyze.action_query_result_db(storage, n = 2, script_name = "ChainedApkInfos", include_fields = ["apkinfo.components.activities"])
    ... for r in res:
    ...     # get dict
    ...     # print r
    ...     # get json
    ...     print dict2json(r)
    {
    "apkinfo": {
        "components": {
            "activities": {
                "all": [
                    "cn.wps.impress.test.selfvalidate.lmj.TestServiceActivity",
    ...
    '''
    # `checks` defaults to None instead of {} to avoid the shared
    # mutable default argument pitfall; semantics are unchanged.
    # build check filter dict if some checks are given which shall be done on some attributes
    if checks:
        checks = MongoUtil.build_checks_filter(**checks)
    else:
        checks = {}

    # merge the checks filter into any user supplied `where` clause
    # (or install it as the where clause if none was given)
    if kwargs.get('where') is not None:
        kwargs['where'].update(checks)
    else:
        kwargs['where'] = checks

    non_document = kwargs.get("non_document", False)
    # `list_ran_scripts` overrides `distinct_key`
    if kwargs.get("list_ran_scripts", False):
        kwargs['distinct_key'] = MongoUtil.get_attr_str(StaticResultKeys.RESOBJ_SCRIPT_META, StaticResultKeys.RESOBJ_SCRIPT_META_NAME, non_document)

    return storage.get_results(**kwargs)
    def get_results(self,
                    include_fields = None, exclude_fields = None,
                    where = None, distinct_key = None,
                    n = None, sort = True, latest = False,
                    non_document = False, non_document_raw = False,
                    remove_id_field = True,
                    **kwargs):
        '''
        See doc of :py:meth:`.ResultStorageInterface.get_results`.

        Parameters
        ----------
        include_fields : list<str>, optional (default is [])
            Fields to include in the result.
            Mutually exclusive with `exclude_fields`.
        exclude_fields : list<str>, optional (default is [])
            Fields to exclude from the result.
            Mutually exclusive with `include_fields`.
        where : dict, optional (default is {})
            A filter; extended in place with the clauses built from `kwargs`.
        distinct_key : str, optional (default is None)
            If given, return the distinct values for this key.
            Not applied when `non_document` and `non_document_raw`.
        n : int, optional (default is None)
            Number of results to return. None means no limit.
        sort : bool, optional (default is True)
            If True sort by analysis date (descending).
        latest : bool, optional (default is False)
            Only return the result of the latest script run
            (forces `sort` and n = 1).
        non_document : bool, optional (default is False)
            Query mongodb's gridfs instead of the regular collection.
        non_document_raw : bool, optional (default is False)
            Return raw gridfs data instead of the meta infos.
            Only relevant if `non_document`.
        remove_id_field : bool, optional (default is True)
            Exclude the `_id` field from the projection.
        kwargs : dict
            Further filter arguments, see `create_where_clause`.

        Returns
        -------
        gridfs.grid_file.GridOutCursor
            If non_document and non_document_raw.
        pymongo.cursor.Cursor
            Otherwise.

        Raises
        ------
        DatabaseLoadException
        '''

        if include_fields is not None and exclude_fields is not None:
            raise ValueError("include_fields and exclude_fields are mutually exclusive!")

        if include_fields is None:
            include_fields = []
        if exclude_fields is None:
            exclude_fields = []
        if where is None:
            where = {}

        # latest means enable sorting and only return one result
        if latest:
            sort = True
            n = 1

        # create projection dict (0 = exclude field, 1 = include field)
        fields = [(p, 0) for p in exclude_fields] + [(p, 1) for p in include_fields]

        if remove_id_field:
            # we don't want the id field
            fields += [(RESOBJ_ID, 0)]

        select = dict(fields)

        # no projection criteria given, disable!
            # because empty dict means only id
        if not select:
            select = None

        # merge the keyword based filters into the where clause
        where.update(self.create_where_clause(kwargs, from_gridfs = non_document))

        try:
            res_cursor = None
            # get appropriate collection
            coll = self.__get_collection(gridfs_files_coll = non_document and not non_document_raw,
                                         gridfs_obj = non_document and non_document_raw)

            # pymongo 3.0 removed the as_class option in the collection.find method
            # this is the fix
            find_kwargs = {}
            if int(pymongo.version[0]) < 3:
                find_kwargs['as_class'] = OrderedDict

            # grid fs
            if non_document:
                if non_document_raw:
                    log.debug("mongodb query: find(%s) on gridfs", where)
                    res_cursor = coll.find(where)
                else:
                    # using the gridfs files collection directly enables us projection an attributes
                    log.debug("mongodb query: find(%s, %s) ", where, select)
                    res_cursor = coll.find(where, select, **find_kwargs)

            # normal collection
            else:
                res_cursor = coll.find(where, select, **find_kwargs)
                log.debug("mongodb query: find(%s, %s) ", where, select)


            # enable sorting if wanted
            if sort:
                # construct sorting criteria structure, structure is different if using gridfs
                sort_crit = [(
                  MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs=non_document)
                  , -1)]
                res_cursor = res_cursor.sort(sort_crit)

            # limit results if wanted
            if n is not None:
                res_cursor = res_cursor.limit(n)

            # raw gridfs cursor is returned directly
            # (note: `distinct_key` is NOT applied in this case)
            if non_document:
                if non_document_raw:
                    return res_cursor

            if distinct_key is not None:
                # NOTE(review): pymongo's .distinct() returns a list,
                # not a cursor -- callers seem to handle both
                res_cursor = res_cursor.distinct(distinct_key)

            return res_cursor

        except PyMongoError as e:
            # python 2 three-expression raise: re-raise the wrapping
            # exception with the original traceback preserved
            raise DatabaseLoadException(self, "find(%s, %s)", where, select, caused_by = e), None, sys.exc_info()[2]
Exemple #6
0
    def fetch_results_from_mongodb(self, rds, results, wait_for_db = True,
                                   # progress
                                   nice_progess = False, synced_entries = None, total_sync_entries = None):
        '''
        Fetch some results from the result database and write them to disk.

        If data cannot be loaded from db, retry until it can be
        (unless `wait_for_db` is disabled).

        Parameters
        ----------
        rds : ResultDatabaseStorage
            The database to query for the results.
        results : list< tuple<id, gridfs (bool)> >
            Define which results shall be fetched.
            If None, nothing is done.
        wait_for_db : bool, optional (default is True)
            Wait until data could be fetched from db.
        nice_progess : bool, optional (default is False)
            If enabled show some nice progress on the cli instead of
            debug logging. (Parameter name kept misspelled for
            backwards compatibility.)
        synced_entries : multiprocessing.Value<int>, optional (default is None)
            If supplied store number of already synced entries.
        total_sync_entries : multiprocessing.Value<int>, optional (default is None)
            If supplied store number of total entries to sync.

        Raises
        ------
        DatabaseLoadException
            If `wait_for_db` is False and an error occurred.
        '''
        # retry in ... seconds
        DATABASE_RETRY_TIME = 5

        # if true assume both counts are shared memory (multiprocessing.Value)
        use_shared_memory = synced_entries is not None and total_sync_entries is not None

        if results is not None:
            results_stored = False
            while not results_stored:
                try:
                    # split into regular document ids and gridfs ids
                    non_gridfs_ids, gridfs_ids = MongoUtil.split_result_ids(results)

                    # counts
                    cnt_non_gridfs_ids = len(non_gridfs_ids)
                    cnt_gridfs_ids = len(gridfs_ids)

                    if use_shared_memory:
                        total_sync_entries.value = cnt_gridfs_ids + cnt_non_gridfs_ids

                    # gridfs raw data as well as metadata
                    gridfs_entries_raw = []
                    if gridfs_ids:
                        gridfs_entries_raw = rds.get_results_for_ids(gridfs_ids, non_document = True, non_document_raw = True)

                    # regular documents (non gridfs)
                    non_gridfs_entries = []
                    if non_gridfs_ids:
                        non_gridfs_entries = rds.get_results_for_ids(non_gridfs_ids, non_document = False, non_document_raw = True)

                    if not nice_progess:
                        log.debug("fetching %d non-documents (gridfs) ... ", cnt_gridfs_ids)

                    for i, gridfs_entry_raw in enumerate(gridfs_entries_raw, 1):

                        # get our stored metadata (for script and apk)
                        gridfs_entry_meta = gridfs_entry_raw.metadata

                        if not nice_progess:
                            log.debug("getting results for %s", gridfs_entry_meta[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                        else:
                            Util.print_dyn_progress(Util.format_progress(i, cnt_gridfs_ids))

                        # use apk to extract data from dict
                        fastapk = FastApk.load_from_result_dict(gridfs_entry_meta)
                        # get filename
                        file_name = gridfs_entry_raw.filename

                        # write results to disk; a storage error for a single
                        # entry is logged but does not abort the whole sync
                        try:
                            self.store_custom_data(fastapk.package_name, fastapk.version_name, fastapk.hash, file_name, gridfs_entry_raw.read())
                        except FileSysStoreException as e:
                            log.exception(e)

                        # update shared memory progress indicator
                        if use_shared_memory:
                            with synced_entries.get_lock():
                                synced_entries.value += 1

                    if not nice_progess:
                        log.debug("fetching %d documents (non-gridfs) ... ", cnt_non_gridfs_ids)

                    for i, non_gridfs_entry in enumerate(non_gridfs_entries, 1):
                        if not nice_progess:
                            clilog.debug("getting results for %s" % non_gridfs_entry[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                        else:
                            Util.print_dyn_progress(Util.format_progress(i, cnt_non_gridfs_ids))

                        # write results to disk
                        self.store_result_dict(non_gridfs_entry)

                        # update shared memory progress indicator
                        if use_shared_memory:
                            with synced_entries.get_lock():
                                synced_entries.value += 1

                    # everything stored -> leave the retry loop
                    # (was `True or not wait_for_db`; the second operand was
                    # dead code since `True or x` is always True)
                    results_stored = True

                except (DatabaseLoadException, PyMongoError) as e:
                    if not wait_for_db:
                        raise
                    log.warn(e)
                    Util.log_will_retry(DATABASE_RETRY_TIME, exc = e)
                    sleep(DATABASE_RETRY_TIME)