Example #1
 def bucket_create(self, key, val, metadata_dict = {}):
     '''
     Create an object in the bucket, but only if not yet present (save traffic).
     
     Parameters
     ----------
     key : str
     val : file-like object 
     metadata_dict : dict
     
     Returns
     -------
     Key
     '''
     s3_key = Key(self.apk_bucket)
     
     s3_key.key = key
     # important: set metadata before actual upload
     s3_key.metadata = metadata_dict
     s3_key.content_type = 'application/vnd.android.package-archive'
     # upload
     log.debug("uploading %s", s3_key.key)
     s3_key.set_contents_from_file(val, replace = False)
     
     return s3_key
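For context, a minimal standalone sketch of the same conditional-upload pattern with boto 2.x (which provides `Key` and `set_contents_from_file`); the bucket name, key, file path and metadata below are placeholders, not values taken from the project.

    # minimal sketch of the conditional upload with boto 2.x
    import boto
    from boto.s3.key import Key

    conn = boto.connect_s3()                           # credentials come from env/boto config
    bucket = conn.get_bucket("my-apk-bucket")          # placeholder bucket name

    k = Key(bucket)
    k.key = "com.example.app.apk"                      # placeholder key
    k.metadata = {"sha256": "<hexdigest>"}             # set metadata before the upload
    k.content_type = "application/vnd.android.package-archive"
    with open("com.example.app.apk", "rb") as f:
        # replace=False leaves an existing object untouched and saves traffic
        k.set_contents_from_file(f, replace=False)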
Example #2
 def get_hash(self):
     '''
     Get the sha256 message digest of the file
     and store it.
     
     Returns
     -------
     str
         sha256 message digest as hexstring
     None
         If path is None
         
     Raises
     ------
     OSError
         If the file could not be opened
     '''
     if self._get_hash() is None:
         if self.path is None:
             # cannot calculate message digest from file
             return None
         else:
             with open(self.path, "rb") as apkf:
                 self.hash = sha256(apkf.read())
                 log.debug("Calculated hash for %s by reading file %s",
                           self, self.path)
     return self._get_hash()
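The hashing step itself only needs the standard library; the following is a self-contained sketch, with `file_sha256` being a hypothetical helper rather than project code.

    from hashlib import sha256

    def file_sha256(path):
        ''' Return the sha256 hex digest of the file at `path` (hypothetical helper). '''
        with open(path, "rb") as f:
            return sha256(f.read()).hexdigest()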
Example #3
    def delete_results(self,
                       where = None, non_document = False, **kwargs):
        ''' See doc of :py:meth:`.ResultStorageInterface.delete_results` '''
        coll = self.__get_collection(gridfs_obj = non_document)

        if where is None:
            where = {}

        where.update(self.create_where_clause(kwargs, from_gridfs = non_document))

        n = 0
        try:
            # do the query
            log.debug("mongodb remove(%s)", where)

            # gridfs
            if non_document:
                # get ids and delete
                for _id in self.get_ids(where = where, non_document = non_document):
                    coll.delete(_id)
                    log.debug("Deleted element with id: %s from mongodb gridfs!", _id)
                    n += 1

            # normal collection
            else:
                write_result = coll.remove(where, getLastError=True)
                if write_result is not None:
                    n = write_result["n"]

            return n

        except PyMongoError as e:
            log.exception(DatabaseDeleteException(self, where, e))
            return n
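A rough standalone sketch of the gridfs branch above: collect the matching ids first, then delete them one by one, since gridfs deletions are always done by `_id`. The server address, database name, collection prefix and filter are assumptions.

    import gridfs
    from pymongo import MongoClient

    client = MongoClient("127.0.0.1", 27017)           # assumed local server
    db = client["res"]                                 # assumed database name
    fs = gridfs.GridFS(db, "fs")                       # assumed collection prefix

    where = {"filename": "report.json"}                # hypothetical filter
    deleted = 0
    for doc in db["fs"]["files"].find(where, {"_id": 1}):
        fs.delete(doc["_id"])                          # gridfs deletes are always by _id
        deleted += 1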
Example #4
 def get_hash(self):
     '''
     Get the sha256 message digest of the file
     and store it.
     
     Returns
     -------
     str
         sha256 message digest as hexstring
     None
         If path is None
         
     Raises
     ------
     OSError
         If the file could not be opened
     '''
     if self._get_hash() is None:
         if self.path is None:
             # cannot calculate message digest from file
             return None
         else:
             with open(self.path, "rb") as apkf:
                 self.hash = sha256(apkf.read())
                 log.debug("Calculated hash for %s by reading file %s", self, self.path)
     return self._get_hash()
Example #5
    def bucket_create(self, key, val, metadata_dict={}):
        '''
        Create an object in the bucket, but only if not yet present (save traffic).
        
        Parameters
        ----------
        key : str
        val : file-like object 
        metadata_dict : dict
        
        Returns
        -------
        Key
        '''
        s3_key = Key(self.apk_bucket)

        s3_key.key = key
        # important: set metadata before actual upload
        s3_key.metadata = metadata_dict
        s3_key.content_type = 'application/vnd.android.package-archive'
        # upload
        log.debug("uploading %s", s3_key.key)
        s3_key.set_contents_from_file(val, replace=False)

        return s3_key
Example #6
    def __init__(self, config_filename, import_db = None):
        '''
        Parameters
        ----------
        config_filename : str, optional (default is `settings.CONFIG_PATH`)
            The path to the config to load.
        import_db : str, optional (default is read from config file)
            Path to the import db.
        '''
        # type: Settings
        if config_filename is None:
            config_filename = settings.CONFIG_PATH

        # create settings variable
        self.__settings = Settings(config_filename, default_path = settings.DEFAULTS_PATH)

        log.debug("config file settings: %s\n\tCLI options may overwrite them!", self.__settings)

        # load and set androguard path from configs
        Util.set_androguard_path(self.settings)

        # type: str
        import_db = self._get_import_db(import_db = import_db)
        #self.args.import_database
        log.info("Using import database: %s", import_db)

        # load a few other settings
        self.__storage = self._create_storage(import_db)
Example #7
    def __init__(self, *args, **kwargs):
        '''
        A task will be initialized for every process, but not for every task!
        '''
        Task.__init__(self, *args, **kwargs)
        self.__result_database_storage = None
        self.__apk_storage = None
        self.__script_hashes = None
        self.__androscripts = None

        # register signal to prefetch apks
        task_prerun.connect(self.prefetch_apk)

        log.debug("%s init", self)
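Connecting a handler to Celery's `task_prerun` signal can also be sketched outside of a `Task` subclass; `prefetch_and_log` below is a hypothetical handler, not part of the project.

    from celery.signals import task_prerun

    def prefetch_and_log(task_id=None, task=None, *args, **kwargs):
        # hypothetical handler; a real one would prefetch the apk for `task`
        print("about to run %s (%s)" % (task, task_id))

    task_prerun.connect(prefetch_and_log)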
Example #8
    def __init__(self, *args, **kwargs):
        '''
        A task will be initialized for every process, but not for every task!
        '''
        Task.__init__(self, *args, **kwargs)
        self.__result_database_storage = None
        self.__apk_storage = None
        self.__script_hashes = None
        self.__androscripts = None

        # register signal to prefetch apks
        task_prerun.connect(self.prefetch_apk)

        log.debug("%s init", self)
Example #9
    def store_result_for_apk(self, apk, script):
        ''' See doc of :py:meth:`.ResultWritingInterface.store_result_for_apk`.

        Returns
        -------
        tuple<str, bool>
            First component is the id of the entry
            and the second a boolean indicating whether the result has been stored in gridfs.
        None
            If an error occurred.
        '''
        try:
            # escape keys for mongodb insert
            res_obj_dict = escape_keys(script.result_dict(gen_id = False))
            _id = script.gen_unique_id()

            # if the data is too big or a custom result object is used -> store with gridfs
            if script.uses_custom_result_object() or script.is_big_res():
                log.debug("storing results for %s, %s in %s (id: %s)", apk.short_description(), script, self.grid_fs, _id)
                result = self.get_custom_res_obj_representation(script)

                gridfs = self.grid_fs

                # gridfs doesn't have an update method -> delete and insert
                if gridfs.exists(**{RESOBJ_ID : _id}):
                    # delete by _id
                    gridfs.delete(_id)

                # store file together with metadata from `ResultObject`
                gridfs.put(result, metadata = res_obj_dict, filename = script.get_file_name(), _id = _id)

                # return id
                return _id, True

            # normal json data
            else:
                log.debug("storing results for %s, %s in %s db(id: %s)", apk.short_description(), script, self.res_coll, _id)
                # set id so we don't have multiple results for same script and apk
                res_obj_dict[RESOBJ_ID] = _id
                # update or insert if not existing
                self.res_coll.update({RESOBJ_ID : _id}, res_obj_dict, upsert = True)
                # return id
                return _id, False
        except (PyMongoError, BSONError) as e:
            raise DatabaseStoreException(self, "script: %s" % script, caused_by = e), None, sys.exc_info()[2]
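A condensed sketch of the gridfs "upsert" used above (delete an existing file with the same `_id`, then `put` the new one); database name, prefix, id and metadata are placeholders.

    import gridfs
    from pymongo import MongoClient

    fs = gridfs.GridFS(MongoClient()["res"], "fs")     # assumed database name and prefix

    _id = "com.example.app-1.0-MyScript"               # hypothetical deterministic id
    if fs.exists(_id):
        fs.delete(_id)                                 # gridfs has no update method
    fs.put(b'{"result": 42}', _id=_id,
           filename="MyScript.json",
           metadata={"package": "com.example.app"})    # arbitrary metadata is stored alongside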
Example #10
    def copy_apk(self, apk, file_like_obj, **kwargs):
            '''
            Copy the `apk` to the file system (path specified through `store_root_dir`).

            See also: :py:meth:`.ApkCopyInterface.copy_apk`.

            Parameters
            ----------
            apk: Apk
                Holds meta information needed to create the subdirectory names.
            file_like_obj
                A file-like object which holds the .apk data

            Raises
            ------
            IOError
            FileSysCreateStorageStructureException

            Returns
            -------
            str
                The path where the apk file has been copied
            '''
            apk_file_path = self.get_apk_import_file_name(apk)
            log.debug("copying %s to %s", apk.short_description(), apk_file_path)

            # create path for apk if not existing
            apk_import_path = None
            try:
                apk_import_path = self.get_apk_import_path(apk)
                if not exists(apk_import_path):
                    makedirs(apk_import_path)
            except OSError as e:
                raise FileSysCreateStorageStructureException(apk_import_path, self, e), None, sys.exc_info()[2]

            # copy apk
            with open(apk_file_path, "wb") as apk_copy:
                file_like_obj.seek(0)
                apk_copy.write(file_like_obj.read())

            return apk_file_path
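The copy step can be sketched with the standard library alone; `copy_file_obj` below is a hypothetical helper that mirrors the directory-creation and seek/write logic above.

    import os
    import shutil

    def copy_file_obj(file_like_obj, dest_path):
        ''' Hypothetical helper: stream `file_like_obj` to `dest_path`, creating directories. '''
        dest_dir = os.path.dirname(dest_path)
        if dest_dir and not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        file_like_obj.seek(0)
        with open(dest_path, "wb") as out:
            shutil.copyfileobj(file_like_obj, out)
        return dest_path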
Example #11
        def callback(task_id, result_dict):
            '''
            Parameters
            ----------
            task_id : str
                UUID of task.
            result_dict : dict
                Dictionary holding the meta information about the task as well as the result.
                See `CeleryConstants.CELERY_RESULT_BACKEND_*` for some available keys.
            '''
            log.debug("Task %s finished", task_id)

            result = result_dict[
                CeleryConstants.CELERY_RESULT_BACKEND_KEY_RESULT]
            traceback = result_dict[
                CeleryConstants.CELERY_RESULT_BACKEND_KEY_TRACEBACK]
            state = result_dict[
                CeleryConstants.CELERY_RESULT_BACKEND_KEY_STATUS]

            task_failed = state in states.EXCEPTION_STATES

            # show exceptions
            if task_failed:

                # handle error
                if handle_error is not None:
                    handle_error(task_id, result, state, traceback)

                # we need locking here because operation is not atomic
                with self.lock:
                    self.analyze_stats_view.failed_tasks += 1

            else:
                if handle_success is not None:
                    handle_success(task_id, result)

                # we need locking here because operation is not atomic
                with self.lock:
                    self.analyze_stats_view.successful_tasks += 1
Example #12
    def store_result_for_apk(self, apk, script):
        '''
        Store the results in the file system.

        If a custom result object is used in `script` and it's not a `ResultObject`,
        str(custom res object) will be used for writing to disk.

        Parameters
        ----------
        apk: Apk
        script: AndroScript

        Raises
        ------
        FileSysStoreException

        Returns
        -------
        str
            Path to result file.
        '''
        try:
            res_filename = self.get_apk_res_filename(apk, script)
            with open(res_filename, "w") as f:
                log.debug("storing results for %s, %s to %s", apk.short_description(), script, res_filename)
                if not script.uses_custom_result_object():
                    f.write(script.res.write_to_json())
                else:
                    res = self.get_custom_res_obj_representation(script)
                    # serialize to json if the custom res obj is a `ResultObject`
                    if ScriptUtil.is_result_object(res):
                        res = res.write_to_json()
                    f.write(res)
            return res_filename
        except IOError as e:
            raise FileSysStoreException(res_filename, str(apk), self, e)
Example #13
        def callback(task_id, result_dict):
            '''
            Parameters
            ----------
            task_id : str
                UUID of task.
            result_dict : dict
                Dictionary holding the meta information about the task as well as the result.
                See `CeleryConstants.CELERY_RESULT_BACKEND_*` for some available keys.
            '''
            log.debug("Task %s finished", task_id)

            result = result_dict[CeleryConstants.CELERY_RESULT_BACKEND_KEY_RESULT]
            traceback = result_dict[CeleryConstants.CELERY_RESULT_BACKEND_KEY_TRACEBACK]
            state = result_dict[CeleryConstants.CELERY_RESULT_BACKEND_KEY_STATUS]

            task_failed = state in states.EXCEPTION_STATES

            # show exceptions
            if task_failed:

                # handle error
                if handle_error is not None:
                    handle_error(task_id, result, state, traceback)

                # we need locking here because operation is not atomic
                with self.lock:
                    self.analyze_stats_view.failed_tasks += 1

            else:
                if handle_success is not None:
                    handle_success(task_id, result)

                # we need locking here because operation is not atomic
                with self.lock:
                    self.analyze_stats_view.successful_tasks += 1
Example #14
 def before_task_publish_action(self, *args, **kwargs):
     ''' Collect task ids before they get published '''
     task_id = kwargs["body"]["id"]
     log.debug("will publish task %s", task_id)
     self.task_collection.task_ids.append(task_id)
Example #15
    def _analyze(self):
        ''' See doc of :py:meth:`.BaseAnalyzer.analyze`. '''

        # try to get registered workers
        # if the network fails at this point -> stop analysis
        try:
            clilog.info(CeleryUtil.get_workers_and_check_network())
        except NetworkError as e:
            log.critical(e)
            return 0

        # storage objects
        storage = self.storage

        clilog.info("Number of apks to analyze: %d", self._cnt_apks)

        try:
            # get analyze task
            analyze_task = tasks[CeleryConstants.get_analyze_task_name()]

            # create storage
            storage.create_or_open_sub_storages()

            # send tasks
            start = time()

            # apk generator over .apk or apk hashes
            apk_gen = AnalyzeUtil.apk_id_or_raw_data_gen(
                self.apks, force_raw_data=self.serialize_apks)

            clilog.info("Task publishing progress:")

            # send and serialize .apks
            # if analyzing via path, serialize them!
            if self.serialize_apks:
                log.info("sending .apks to message broker")
                self.group_result = group_result = GroupResult(results=[])

                for args in self.send_apk_args_generator(apk_gen):
                    task = analyze_task.delay(*args)
                    group_result.add(task)

            # send only apk id and let fetch via mongodb
            else:
                log.info("sending ids of apks")

                task_group = group(
                    (analyze_task.s(*args)
                     for args in self.send_id_args_generator(apk_gen)))

                # publish tasks
                self.group_result = task_group()

            log.info("sending took %ss", (time() - start))
            sys.stderr.write("\nAnalysis progress:\n")

            # start showing analysis progress
            self.analyze_stats_view.start()

            # wait for results
            log.debug("joining on ResultGroup ... ")

            # setup callback
            callback_func = self.get_callback_func(self.success_handler,
                                                   self.error_handler)
            CeleryUtil.join_native(self.group_result,
                                   propagate=False,
                                   callback=callback_func)

            clilog.info("\nanalysis done ... ")
            log.info("distributed analysis took %ss", (time() - start))

            return self.stop_analysis_view()
        except DatabaseOpenError as e:
            log.critical(e)
            return 0

        except (KeyboardInterrupt, Exception) as e:
            if not isinstance(e, KeyboardInterrupt):
                log.exception(e)
            log.warn(
                "Interrupting distributed analysis ... Please wait a moment!")
            log.warn("revoking tasks on all workers ...")

            if celerysettings.CELERY_TASK_REVOCATION_ENABLED:
                # revoke tasks
                if self.group_result is None:
                    # revoke via task ids
                    log.debug("revoking while publishing tasks ...")

                    self.task_collection.revoke_all(terminate=True,
                                                    signal='SIGKILL')
                else:
                    # revoke via GroupResult if yet available/created
                    # first available after all tasks have been sent
                    self.group_result.revoke(terminate=True, signal='SIGKILL')
                log.warn("revoked tasks and killed workers ...")

            #return number of analyzed apks
            return self.stop_analysis_view()
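A minimal sketch of the publish-and-join pattern with Celery's `group`; the app name, broker/backend URLs and the `analyze` task are stand-ins, and joining requires a result backend that supports native join.

    from celery import Celery, group

    app = Celery("demo", broker="amqp://localhost", backend="redis://localhost")  # assumed URLs

    @app.task
    def analyze(apk_id):
        return apk_id                                   # stand-in for the real analysis

    job = group(analyze.s(apk_id) for apk_id in ["id1", "id2", "id3"])
    result = job.apply_async()                          # publishes all tasks at once
    # blocks until every result has arrived (needs a supporting result backend)
    result.join_native(propagate=False)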
Example #16
    def _analyze(self):
        ''' See doc of :py:meth:`.BaseAnalyzer.analyze`. '''
        try:
            work_queue = self.work_queue

            # create worker pool
            log.debug("starting %s workers ...", self.concurrency)
            for _ in range(self.concurrency):
                p = Worker(self.script_list, self.script_hashes, self.min_script_needs,
                                                 work_queue, self.storage,
                                                 self.cnt_analyzed_apks, self.analyzed_apks, self.storage_results)
                self.workers.append(p)
                p.daemon = True

            # start workers
            for p in self.workers:
                p.start()

            # queue has size limit -> start workers first then enqueue items
            log.info("Loading apk paths into work queue ...")
            for apk_stuff in AnalyzeUtil.apk_gen(self.apks_or_paths):
                # task is apk with all scripts
                work_queue.put(apk_stuff)

            for _ in range(self.concurrency):
                # signal end-of-work
                work_queue.put(STOP_SENTINEL)

            # progress view for cli
            av = AnalysisStatsView(self.cnt_analyzed_apks, self._cnt_apks, self.analyzed_apks)
            av.daemon = True
            av.start()
            
            # block until workers finished
            work_queue.join()
            av.terminate()
            log.debug("joined on work queue ...")

            return self.cnt_analyzed_apks.value

        # try hot shutdown first
        except KeyboardInterrupt:
            log.warn("Hot shutdown ... ")
            try:
                log.warn("clearing work queue ... ")
                Util.clear_queue(work_queue)
                log.warn("cleared work queue ... ")
                
                for _ in range(self.concurrency):
                    # signal end-of-work
                    work_queue.put(STOP_SENTINEL)
                    
                for worker in self.workers:
                    worker.join()
                log.warn("waited for all workers ... ")

                return self.cnt_analyzed_apks.value

            # if the user really wants a cold shutdown -> kill the processes
            except KeyboardInterrupt:
                log.warn("Cold shutdown ... ")
                log.warn("Hard shutdown wanted! Killing all workers!")

                # kill processes via SIGINT -> send CTRL-C
                for w in self.workers:
                    try:
                        os.kill(w.pid, signal.SIGINT)
                    except:
                        pass

                return self.cnt_analyzed_apks.value
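The worker-pool pattern above (bounded `JoinableQueue`, one sentinel per worker, `join()` on the queue) in a generic, self-contained sketch; the sentinel value and worker count are assumptions.

    import multiprocessing

    STOP_SENTINEL = None                                # assumed sentinel value

    def worker(queue):
        while True:
            item = queue.get()
            try:
                if item is STOP_SENTINEL:
                    break
                # ... analyze the item here ...
            finally:
                queue.task_done()                       # required so queue.join() can return

    if __name__ == "__main__":
        queue = multiprocessing.JoinableQueue(maxsize=100)
        workers = [multiprocessing.Process(target=worker, args=(queue,)) for _ in range(4)]
        for p in workers:
            p.daemon = True
            p.start()                                   # start workers before filling the bounded queue
        for item in range(10):
            queue.put(item)
        for _ in workers:
            queue.put(STOP_SENTINEL)                    # one end-of-work marker per worker
        queue.join()                                    # block until every item has been processed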
Example #17
    def __recreate_collections(self, gridfs = False, res_collection = False):
        '''
        Drop and recreate collections.

        Parameters
        ----------
        gridfs : bool, optional (default is False)
            Recreate gridfs.
        res_collection : bool, optional (default is False)
            Recreate results collection.
        '''
        try:
            if gridfs:
                log.debug("dropping collection %s", GRIDFS_COLLS_PREFIX)

                log.debug("dropping collection %s", FILES_COLL_NAME)
                self.db.drop_collection(FILES_COLL_NAME)

                log.debug("dropping collection %s", CHUNKS_COLL_NAME)
                self.db.drop_collection(CHUNKS_COLL_NAME)

                log.debug("recreating collection %s", GRIDFS_COLLS_PREFIX)
                self._open_gridfs()

                self._create_idx_for_colls()
        except PyMongoError as e:
            log.critical(e)

        try:
            if res_collection:
                log.debug("dropping collection %s", RESULT_DOCUMENTS_COLLECTION_NAME)
                self.db.drop_collection(RESULT_DOCUMENTS_COLLECTION_NAME)
                self._open_res_coll()
                log.debug("recreating collection %s", RESULT_DOCUMENTS_COLLECTION_NAME)
        except PyMongoError as e:
            log.critical(e)
Example #18
    def __init__(self, db_name = None, dest_addr = None, dest_port = None,
                # auth
                username = None, passwd = None,
                # ssl
                use_ssl = False, ssl_ca_certs = None,
                
                ):
        '''
        Create (if not existing) and open the database and collections.

        Parameters
        ----------
        db_name : str, optional (default is "res")
            The name of the database to use.
            Will be created if not already existing.
        dest_addr : str, optional (default is '127.0.0.1')
            Address of mongodb database server.
        dest_port : int, optional (default is 27017)
            Port of mongodb database server.
        username : str, optional (default is None)
            If None, no authentication is used.
        passwd : str, optional (default is None)
            If None, no authentication is used.
        use_ssl : bool, optional (default is False)
            Use ssl for the connection.
        ssl_ca_certs : str, optional (default is None)
            The CA certificate.
        
        Raises
        ------
        DatabaseOpenError
        '''

        # db name not allowed
        if db_name == APK_DB_NAME:
            raise DatabaseOpenError(db_name, msg = 'Database name "%s" reserved for apk storage!' % db_name), None, sys.exc_info()[2]

        # set default values
        if db_name is None:
            db_name = 'res'
        if dest_addr is None:
            dest_addr = '127.0.0.1'
        if dest_port is None:
            dest_port = 27017

        try:
            self.__db_name = db_name
            self.__dest_addr = dest_addr
            self.__dest_port = dest_port
            self.__use_ssl = use_ssl

            # only pass ssl parameters if ssl enabled
            ssl_params = dict(ssl = use_ssl, ssl_cert_reqs = ssl.CERT_NONE) if use_ssl else {}

            # set to None because if the connection cannot be initiated, the conn variable would not be in scope
            self.conn = None
            self.__conn = conn = pymongo.MongoClient(host = dest_addr, port = dest_port, **ssl_params)

            # authentication is per database!
            # do auth before probable db creation etc.
            if None not in (username, passwd):
                # authenticate if credentials given
                log.debug("authenticating with mongodb ...")
                conn["admin"].authenticate(username, passwd)
            else:
                log.debug("not authenticating with mongodb ... no credentials supplied!")

            self.__db = conn[self.db_name]

            # apk db
            self.__apk_db = conn[APK_DB_NAME]
            
            self.__apk_coll = gridfs.GridFS(self.__apk_db, GRIDFS_COLLS_PREFIX)

            # create/open collections
            self.__res_coll = self._open_res_coll()
            self.__files_coll = self.__db[GRIDFS_COLLS_PREFIX][GRIDFS_FILES]
            # grid fs for binary files, supports files > 16 mb
            self.__grid_fs = self._open_gridfs()

            # create indexes
            self._create_idx_for_colls()

            log.info("Opened database: %s", self)
            log.debug("CA certificate: %s", ssl_ca_certs)

        except PyMongoError as e:
            raise DatabaseOpenError(str(self), caused_by = e), None, sys.exc_info()[2]
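A short sketch of the connection handling with optional ssl and per-database authentication, following the older pymongo API used above (`authenticate` was removed in pymongo 4.0); host, credentials and database name are placeholders.

    import ssl
    import pymongo

    ssl_params = dict(ssl=True, ssl_cert_reqs=ssl.CERT_NONE)   # only passed if ssl is wanted
    client = pymongo.MongoClient(host="127.0.0.1", port=27017, **ssl_params)

    # authentication is per database; "admin" is assumed here as in the example above
    client["admin"].authenticate("user", "secret")             # pymongo < 4.0 API
    db = client["res"]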
Example #19
    def get_results(self,
                    include_fields = None, exclude_fields = None,
                    where = None, distinct_key = None,
                    n = None, sort = True, latest = False,
                    non_document = False, non_document_raw = False,
                    remove_id_field = True,
                    **kwargs):
        ''' See doc of :py:meth:`.ResultStorageInterface.get_results` '''

        if include_fields is not None and exclude_fields is not None:
            raise ValueError("include_fields and exclude_fields are mutually exclusive!")

        if include_fields is None:
            include_fields = []
        if exclude_fields is None:
            exclude_fields = []
        if where is None:
            where = {}

        # latest means enable sorting and only return one result
        if latest:
            sort = True
            n = 1

        # create projection dict
        fields = [(p, 0) for p in exclude_fields] + [(p, 1) for p in include_fields]

        if remove_id_field:
            # we don't want the id field
            fields += [(RESOBJ_ID, 0)]

        select = dict(fields)

        # no projection criteria given, disable!
        # because empty dict means only id
        if not select:
            select = None

        where.update(self.create_where_clause(kwargs, from_gridfs = non_document))

        try:
            res_cursor = None
            # get appropriate collection
            coll = self.__get_collection(gridfs_files_coll = non_document and not non_document_raw,
                                         gridfs_obj = non_document and non_document_raw)

            # pymongo 3.0 removed the as_class option in the collection.find method
            # this is the fix
            find_kwargs = {}
            if int(pymongo.version[0]) < 3:
                find_kwargs['as_class'] = OrderedDict
                
            # grid fs
            if non_document:
                if non_document_raw:
                    log.debug("mongodb query: find(%s) on gridfs", where)
                    res_cursor = coll.find(where)
                else:
                    # using the gridfs files collection directly enables us to project on attributes
                    log.debug("mongodb query: find(%s, %s) ", where, select)
                    res_cursor = coll.find(where, select, **find_kwargs)

            # normal collection
            else:
                res_cursor = coll.find(where, select, **find_kwargs)
                log.debug("mongodb query: find(%s, %s) ", where, select)


            # enable sorting if wanted
            if sort:
                # construct sorting criteria structure, structure is different if using gridfs
                sort_crit = [(
                  MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs=non_document)
                  , -1)]
                res_cursor = res_cursor.sort(sort_crit)

            # limit results if wanted
            if n is not None:
                res_cursor = res_cursor.limit(n)

            # generator that abstracts if normal collection or is gridfs
            if non_document:
                if non_document_raw:
                    return res_cursor

            if distinct_key is not None:
                res_cursor = res_cursor.distinct(distinct_key)

            return res_cursor

        except PyMongoError as e:
            raise DatabaseLoadException(self, "find(%s, %s)" % (where, select), caused_by = e), None, sys.exc_info()[2]
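The query building (projection, sorting, limiting) can be sketched directly against pymongo; the collection and field names below are hypothetical.

    import pymongo

    coll = pymongo.MongoClient()["res"]["results"]      # assumed database/collection names

    where = {"package": "com.example.app"}              # hypothetical filter
    select = {"_id": 0}                                  # projection: drop the id field

    cursor = coll.find(where, select)
    cursor = cursor.sort([("analysis_date", -1)])        # latest first (hypothetical field)
    cursor = cursor.limit(1)                             # "latest" means sort + limit 1
    for doc in cursor:
        print(doc)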
Example #20
 def before_task_publish_action(self, *args, **kwargs):
     ''' Collect task ids before they get published '''
     task_id = kwargs["body"]["id"]
     log.debug("will publish task %s", task_id)
     self.task_collection.task_ids.append(task_id)
Example #21
 def __del__(self):
     ''' Close db connection '''
     if self.conn is not None:
         log.debug("Closing db connection ... ")
         self.conn.close()
Example #22
    def fetch_results_from_mongodb(self, rds, results, wait_for_db = True,
                                   # progress
                                   nice_progess = False, synced_entries = None, total_sync_entries = None):
        '''
        Fetch some results from the result database and write them to disk.

        If data cannot be loaded from db, try until it can be.

        Parameters
        ----------
        rds : ResultDatabaseStorage
            The database to query for the results.
        results : list< tuple<id, gridfs (bool)> >
            Define which results shall be fetched.
        wait_for_db : bool, optional (default is True)
            Wait until data could be fetched from db.
        nice_progess : bool, optional (default is False)
            If enabled, show a nice progress bar on the cli.
        synced_entries : multiprocessing.Value<int>, optional (default is None)
            If supplied, store the number of already synced entries.
        total_sync_entries : multiprocessing.Value<int>, optional (default is None)
            If supplied, store the total number of entries to sync.

        Raises
        ------
        DatabaseLoadException
            If `wait_for_db` is False and an error occurred.
        '''
        # retry in ... seconds
        DATABASE_RETRY_TIME = 5

        # if true assume both counts are shared memory (Value)
        use_shared_memory = synced_entries is not None and total_sync_entries is not None

        if results is not None:
            results_stored = False
            while not results_stored:
                try:
                    # get ids
                    non_gridfs_ids, gridfs_ids = MongoUtil.split_result_ids(results)

                    # counts
                    cnt_non_gridfs_ids = len(non_gridfs_ids)
                    cnt_gridfs_ids = len(gridfs_ids)

                    if use_shared_memory:
                        total_sync_entries.value = cnt_gridfs_ids + cnt_non_gridfs_ids

                    # gridfs raw data as well as metadata
                    gridfs_entries_raw = []
                    if gridfs_ids:
                        gridfs_entries_raw = rds.get_results_for_ids(gridfs_ids, non_document = True, non_document_raw = True)

                    # regular documents (non gridfs)
                    non_gridfs_entries = []
                    if non_gridfs_ids:
                        non_gridfs_entries = rds.get_results_for_ids(non_gridfs_ids, non_document = False, non_document_raw = True)

                    if not nice_progess:
                        log.debug("fetching %d non-documents (gridfs) ... ", cnt_gridfs_ids)

                    for i, gridfs_entry_raw in enumerate(gridfs_entries_raw, 1):

                        # get our stored metadata (for script and apk)
                        gridfs_entry_meta = gridfs_entry_raw.metadata

                        if not nice_progess:
                            log.debug("getting results for %s", gridfs_entry_meta[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                        else:
                            Util.print_dyn_progress(Util.format_progress(i, cnt_gridfs_ids))

                        # use apk to extract data from dict
                        fastapk = FastApk.load_from_result_dict(gridfs_entry_meta)
                        # get filename
                        file_name = gridfs_entry_raw.filename

                        # write results to disk
                        try:
                            self.store_custom_data(fastapk.package_name, fastapk.version_name, fastapk.hash, file_name, gridfs_entry_raw.read())
                        except FileSysStoreException as e:
                            log.exception(e)

                        # update shared memory progress indicator
                        if use_shared_memory:
                            with synced_entries.get_lock():
                                synced_entries.value += 1

                    if not nice_progess:
                        log.debug("fetching %d documents (non-gridfs) ... ", cnt_non_gridfs_ids)

                    for i, non_gridfs_entry in enumerate(non_gridfs_entries, 1):
                        if not nice_progess:
                            clilog.debug("getting results for %s" % non_gridfs_entry[RESOBJ_APK_META][RESOBJ_APK_META_PACKAGE_NAME])
                        else:
                            Util.print_dyn_progress(Util.format_progress(i, cnt_non_gridfs_ids))

                        # write results to disk
                        self.store_result_dict(non_gridfs_entry)

                        # update shared memory progress indicator
                        if use_shared_memory:
                            with synced_entries.get_lock():
                                synced_entries.value += 1

                    # results stored -> stop the retry loop
                    results_stored = True or not wait_for_db

                except (DatabaseLoadException, PyMongoError) as e:
                    if not wait_for_db:
                        raise
                    log.warn(e)
                    Util.log_will_retry(DATABASE_RETRY_TIME, exc = e)
                    sleep(DATABASE_RETRY_TIME)
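The shared-memory progress counters passed into this method can be sketched with `multiprocessing.Value`; the total below is a made-up number.

    from multiprocessing import Value

    synced_entries = Value('i', 0)
    total_sync_entries = Value('i', 0)

    total_sync_entries.value = 42                        # hypothetical total
    with synced_entries.get_lock():                      # increments are not atomic
        synced_entries.value += 1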
Example #23
def import_scripts(script_list, via_package = False, _reload = False, clazz_name = None):
    '''
    Import the scripts (via file path or package name - configurable via `via_package`).

    Parameters
    ----------
    script_list: list<str>
        list of script names (absolute path) or package names.
    via_package : bool, optional (default is False)
        If true, assume package names are given instead of file paths.
    _reload : bool, optional (default is False)
        Reload scripts and delete them from internal cache.
        Only possible if `via_package`.
    clazz_name : optional (default is None)
        The name of the class to import. If None, use the name of the module.

    Returns
    -------
    list<type<AndroScript>>
        list of uninstantiated AndroScript classes

    Raises
    ------
    AnalyzeError
        If a NoAndroScriptSubclass, IOError or ModuleNotSameClassNameException has been raised.
    ImportError
    '''
    # late import -> prevent recursive import
    from androlyze.model.script.AndroScript import AndroScript
    from androlyze.analyze.exception import AnalyzeError
    androscripts = []

    # reload scripts if wanted
    if via_package and _reload:
        for script_package in script_list:
            log.debug("deleting %s from system modules", script_package)
            try:
                del sys.modules[script_package]
                log.debug("deleted")
            except KeyError:
                pass

    for script in script_list:
        class_name = clazz_name
        
        if not class_name:
            if via_package:
                class_name = script.split(".")[-1]
            else:
                class_name = basename(script.split(".py")[0])

        # class name must be equivalent to the module name!
        try:
            module_package = script
            # get package name from path and cut off file extension
            if not via_package:
                module_package = Util.path_2_package_name(script)
            module = importlib.import_module(module_package)
            clazz = getattr(module, class_name)
            # check if class is derived from AndroScript
            if isinstance(clazz, AndroScript.__class__):
                androscripts.append(clazz)
            else:
                raise NoAndroScriptSubclass(clazz), None, sys.exc_info()[2]
        except AttributeError as e:
            raise ModuleNotSameClassNameException(script, class_name), None, sys.exc_info()[2]
        except IOError as e:
            e.filename = script
            raise
        except (NoAndroScriptSubclass, ModuleNotSameClassNameException, IOError) as e:
            raise AnalyzeError(e), None, sys.exc_info()[2]

    return androscripts
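A compact sketch of the dynamic import at the core of this function; `load_class` is a hypothetical helper and assumes a plain module name rather than a file path.

    import importlib

    def load_class(module_name, class_name=None):
        ''' Hypothetical helper: import `module_name` and return the class named like it. '''
        module = importlib.import_module(module_name)
        name = class_name or module_name.split(".")[-1]
        return getattr(module, name)                     # raises AttributeError if the names differ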
Example #24
    def _analyze(self):
        ''' See doc of :py:meth:`.BaseAnalyzer.analyze`. '''
        try:
            work_queue = self.work_queue

            # create worker pool
            log.debug("starting %s workers ...", self.concurrency)
            for _ in range(self.concurrency):
                p = Worker(self.script_list, self.script_hashes,
                           self.min_script_needs, work_queue, self.storage,
                           self.cnt_analyzed_apks, self.analyzed_apks,
                           self.storage_results)
                self.workers.append(p)
                p.daemon = True

            # start workers
            for p in self.workers:
                p.start()

            # queue has size limit -> start workers first then enqueue items
            log.info("Loading apk paths into work queue ...")
            for apk_stuff in AnalyzeUtil.apk_gen(self.apks_or_paths):
                # task is apk with all scripts
                work_queue.put(apk_stuff)

            for _ in range(self.concurrency):
                # signal end-of-work
                work_queue.put(STOP_SENTINEL)

            # progress view for cli
            av = AnalysisStatsView(self.cnt_analyzed_apks, self._cnt_apks,
                                   self.analyzed_apks)
            av.daemon = True
            av.start()

            # block until workers finished
            work_queue.join()
            av.terminate()
            log.debug("joined on work queue ...")

            return self.cnt_analyzed_apks.value

        # try hot shutdown first
        except KeyboardInterrupt:
            log.warn("Hot shutdown ... ")
            try:
                log.warn("clearing work queue ... ")
                Util.clear_queue(work_queue)
                log.warn("cleared work queue ... ")

                for _ in range(self.concurrency):
                    # signal end-of-work
                    work_queue.put(STOP_SENTINEL)

                for worker in self.workers:
                    worker.join()
                log.warn("waited for all workers ... ")

                return self.cnt_analyzed_apks.value

            # if the user really wants a cold shutdown -> kill the processes
            except KeyboardInterrupt:
                log.warn("Cold shutdown ... ")
                log.warn("Hard shutdown wanted! Killing all workers!")

                # kill processes via SIGINT -> send CTRL-C
                for w in self.workers:
                    try:
                        os.kill(w.pid, signal.SIGINT)
                    except:
                        pass

                return self.cnt_analyzed_apks.value
Example #25
    def run_action(self, cmd):
        ''' Run an action specified by `cmd` (see COMMAND_ prefixed variables) '''

        parser = self.parser
        args = self.args

        # check which command has been used
        if cmd is None:

            # no command specified through program name -> get it from argparser
            cmd = args.command
            
        if cmd in COMMANDS_ALL:
            hashes, package_names, tags = CLIUtil.get_filter_options_from_cli(args)
            yes = args.yes

            if cmd == COMMAND_QUERY:
                self.action_query(hashes, package_names, tags, yes)

            # dblyze -> do the analysis results evaluation            
            elif cmd == COMMAND_EVAL:
                dblyze_scripts = ScriptUtil.import_scripts(args.scripts, clazz_name = "Eval")
                for dblyze_script in dblyze_scripts:
                    dblyze_script().evaluate(self.storage)
                
            # sync from result db to file sys
            elif cmd == COMMAND_SYNC:
                total_entries = androlyze.action_sync_fs(self.storage, lambda _ : False)

                CLIUtil.cli_check_n_exec(androlyze.action_sync_fs,
                                         prompt_prefix = "Will download %d entries from result database!" % total_entries,
                                         circumvent_check = args.yes,
                                         args = (self.storage, lambda _ : True)
                                         )
            else:
                # print welcome message
                clilog.info("Welcome to %s!\n" % PROJECT_NAME)

                # import command
                if cmd == COMMAND_IMPORT:
                    apks_or_paths, _ = self.get_apks_or_paths_from_cli()
                    tag = args.tag
                    copy2disk, copy2db, update, concurrency = args.copy_disk, args.copy_db, args.update, args.concurrency
                    if not update:
                        log.warn('''--update not supplied.
No update of already present apks in database will be done!''')
                    androlyze.action_import_apks(self.storage, apks_or_paths, copy2disk, copy2db, update, tag, concurrency = concurrency)
                # analyze command
                elif cmd == COMMAND_ANALYZE:
                    # androguard path has to be set before
                    from androlyze import action_analyze

                    # sort apks ?
                    get_apks_kwargs = {}
                    no_sort_by_code_size = args.no_sort_code_size
                    if not no_sort_by_code_size:
                        # sort apks by app code size for better scheduling
                        get_apks_kwargs = dict(order_by = TABLE_APK_IMPORT_KEY_SIZE_APP_CODE, ascending = False)
                    apks_or_paths, _ = self.get_apks_or_paths_from_cli(**get_apks_kwargs)

                    # debug infos
                    if not no_sort_by_code_size and not args.apks:
                        apks_or_paths, _it = itertools.tee(apks_or_paths)
                        clilog.info('Using Code Size Scheduling for faster analysis!')
                        log.debug('\n'.join(('%s: %s' % (x.package_name, x.size_app_code) for x in _it)))

                    scripts = args.scripts
                    parallel_mode, concurrency, send_id = self.__load_parallel_settings()

                    # get analysis mode
                    analyze_mode = None
                    if parallel_mode == PARALLELIZATION_MODE_DISTRIBUTED:
                        analyze_mode = ANALYZE_MODE_DISTRIBUTED
                    elif parallel_mode == PARALLELIZATION_MODE_NON_PARALLEL:
                        analyze_mode = ANALYZE_MODE_NON_PARALLEL
                    else:
                        analyze_mode = ANALYZE_MODE_PARALLEL
                    action_analyze(self.storage, scripts, apks_or_paths,
                                   mode = analyze_mode, concurrency = concurrency,
                                   serialize_apks = not send_id)
                # delete command
                elif cmd == COMMAND_DELETE:
                    self.action_delete(parser, hashes, package_names, tags, yes)

                clilog.info("done")
Example #26
def action_import_apks(storage, apk_paths,
                       copy_apk = False, copy_to_mongodb = False,
                       update = False, tag = None,
                       # shared memory
                       cnt_imported_apks = None, total_apk_count = None, import_finished = None,
                       # concurrent settings
                       concurrency = None
                       ):

    ''' Import the apks from the `apk_paths` and create the file system structure
    where the results will be kept, specified by `storage`.

    Parameters
    ----------
    storage : RedundantStorage
        The store to use.
    apk_paths : iterable<str>
        The apk files and/or directories.
    copy_apk : bool
        Import the apk file to the `import_dir` (copy it).
    copy_to_mongodb : bool, optional (default is False)
        Also import into MongoDB. Useful for the distributed analysis.
    update : bool
        Update apks that have already been imported.
    tag : str, optional (default is None)
        Some tag
    cnt_imported_apks : multiprocessing.Value<int>, optional (default is None)
        If given, use for progress updating.
    total_apk_count : multiprocessing.Value<int>, optional (default is None)
        If given, use for total count of apks.
    import_finished : multiprocessing.Value<byte>, optional (default is None)
        If given, use to signal that import has been completed.
    concurrency : int, optional (default is number of cpus)
        Number of processes to use for the import.
    '''
    from androlyze.loader.ApkImporter import ApkImporter

    # get single paths to apks so we get the correct total count of apks
    clilog.info("looking for apks in given paths ... ")
    apk_paths = ApkImporter.get_apks_from_list_or_dir(apk_paths)

    if total_apk_count is not None:
        # may be time consuming for recursive lookup
        apk_paths, total_apk_count.value = Util.count_iterable_n_clone(apk_paths)

    # create count if not given
    if cnt_imported_apks is None:
        cnt_imported_apks = Value('i', 0, lock = RLock())

    # set concurrency
    if concurrency is None:
        concurrency = cpu_count()
    log.warn("Using %d processes", concurrency)

    clilog.info("Storage dir is %s" % storage.fs_storage.store_root_dir)
    if copy_apk:
        clilog.info("Copying APKs to %s ..." % storage.fs_storage.store_root_dir)

    def import_apks(apk_paths):
        apk_importer = ApkImporter(apk_paths, storage)
        for apk in apk_importer.import_apks(copy_apk = copy_apk, copy_to_mongodb = copy_to_mongodb,
                                                update = update, tag = tag):

            clilog.info("imported %s", apk.short_description())

            # use shared memory counter if given
            if cnt_imported_apks is not None:
                with cnt_imported_apks.get_lock():
                    cnt_imported_apks.value += 1

    pool = []


    # don't convert generator to list if only 1 process wanted
    apk_paths = [apk_paths] if concurrency == 1 else Util.split_n_uniform_distri(list(apk_paths), concurrency)

    # start parallel import
    # multiprocessing's pool causes pickle errors
    for i in range(concurrency):
        p = Process(target = import_apks, args = (apk_paths[i], ))
        log.debug("starting process %s", p)
        pool.append(p)
        p.start()

    for it in pool:
        it.join()
        log.debug("joined on process %s", it)

    apks_imported = cnt_imported_apks.value != 0
    # show some message that no APK has been imported
    if not apks_imported:
        log.warn("No .apk file has been imported! This means no .apk file has been found or they already have been imported.")
    else:
        clilog.info("done")

    # because not all apks may be importable, we cannot use the count to signal that the import is done
    if import_finished is not None:
        import_finished.value = 1

    clilog.info("Imported %d apks", cnt_imported_apks.value)
Example #27
    def _analyze(self):
        ''' See doc of :py:meth:`.BaseAnalyzer.analyze`. '''

        # try to get registered workers
        # if the network fails at this point -> stop analysis
        try:
            clilog.info(CeleryUtil.get_workers_and_check_network())
        except NetworkError as e:
            log.critical(e)
            return 0

        # storage objects
        storage = self.storage

        clilog.info("Number of apks to analyze: %d", self._cnt_apks)

        try:
            # get analyze task
            analyze_task = tasks[CeleryConstants.get_analyze_task_name()]

            # create storage
            storage.create_or_open_sub_storages()

            # send tasks
            start = time()

            # apk generator over .apk or apk hashes
            apk_gen = AnalyzeUtil.apk_id_or_raw_data_gen(self.apks, force_raw_data = self.serialize_apks)

            clilog.info("Task publishing progress:")

            # send and serialize .apks
            # if analyzing via path, serialize them!
            if self.serialize_apks:
                log.info("sending .apks to message broker")
                self.group_result = group_result = GroupResult(results = [])

                for args in self.send_apk_args_generator(apk_gen):
                    task = analyze_task.delay(*args)
                    group_result.add(task)

            # send only apk id and let fetch via mongodb
            else:
                log.info("sending ids of apks")

                task_group = group((analyze_task.s(*args) for args in self.send_id_args_generator(apk_gen)))

                # publish tasks
                self.group_result = task_group()

            log.info("sending took %ss", (time() - start))
            sys.stderr.write("\nAnalysis progress:\n")

            # start showing analysis progress
            self.analyze_stats_view.start()

            # wait for results
            log.debug("joining on ResultGroup ... ")

            # setup callback
            callback_func = self.get_callback_func(self.success_handler, self.error_handler)
            CeleryUtil.join_native(self.group_result, propagate = False, callback = callback_func)

            clilog.info("\nanalysis done ... ")
            log.info("distributed analysis took %ss", (time() - start))

            return self.stop_analysis_view()
        except DatabaseOpenError as e:
            log.critical(e)
            return 0

        except (KeyboardInterrupt, Exception) as e:
            if not isinstance(e, KeyboardInterrupt):
                log.exception(e)
            log.warn("Interrupting distributed analysis ... Please wait a moment!")
            log.warn("revoking tasks on all workers ...")

            if celerysettings.CELERY_TASK_REVOCATION_ENABLED:
                # revoke tasks
                if self.group_result is None:
                    # revoke via task ids
                    log.debug("revoking while publishing tasks ...")

                    self.task_collection.revoke_all(terminate = True, signal = 'SIGKILL')
                else:
                    # revoke via GroupResult if yet available/created
                    # first available after all tasks have been sent
                    self.group_result.revoke(terminate = True, signal = 'SIGKILL')
                log.warn("revoked tasks and killed workers ...")

            #return number of analyzed apks
            return self.stop_analysis_view()