def copy_apk(self, apk, file_like_obj, **kwargs):
        ''' See doc of :py:meth:`.ApkCopyInterface.copy_apk`.

        Inserts the apk from the `file_like_obj` into mongodb's gridfs,
        but only if not already in db.

        Returns
        -------
        The id of the apk (in db)
        '''
        file_like_obj.seek(0)
        try:
            gridfs = self.__apk_coll

            # escape keys accoring to mongodb rules
            apk_meta = escape_keys(apk.meta_dict())

            _id = apk.hash
            # gridfs doesn't have an update method -> delete and insert
            if not gridfs.exists(**{RESOBJ_ID : _id}):

                # store file together with metadata
                filename = os.path.basename(apk_meta[RESOBJ_APK_META][RESOBJ_APK_META_PATH])
                gridfs.put(file_like_obj, metadata = apk_meta[RESOBJ_APK_META], filename = filename, _id = _id, chunkSize = MAX_BSON_SIZE)
                log.info("put %s into %s", apk.short_description(), self)
        except (PyMongoError, BSONError) as e:
            raise DatabaseStoreException(self, "apk: %s" % apk.short_description(), caused_by = e), None, sys.exc_info()[2]

        # return id
        return _id
Example #2
0
    def __init__(self, import_db_name):
        '''
        Open the database and create the table structure if not already existing.

        Parameters
        ----------
        import_db_name : str
            Name of the database to use.

        Raises
        ------
        DatabaseOpenError
            If the database could not be opened or set up.
        '''
        log.info("Opening database %s", import_db_name)
        self.__db_name = import_db_name
        try:
            self.__conn = None
            self.__conn = sqlite3.connect(
                import_db_name,
                timeout=self.TIMEOUT,
                # use the declared type to determine the approriate converter/adapter
                # needed for date storage
                detect_types=sqlite3.PARSE_DECLTYPES)
            self.conn.row_factory = self.__key_val_description
            # create the tables if not existing
            self.__create()
            # upgrade db to latest layout
            self.__upgrade_db()
        except sqlite3.Error as e:
            raise DatabaseOpenError(import_db_name,
                                    caused_by=e), None, sys.exc_info()[2]
    def __init__(self, import_db_name):
        '''
        Open the database and create the table structure if not already existing.

        Parameters
        ----------
        import_db_name : str
            Name of the database to use.

        Raises
        ------
        DatabaseOpenError
            If the database could not be opened or set up.
        '''
        log.info("Opening database %s", import_db_name)
        self.__db_name = import_db_name
        try:
            self.__conn = None
            self.__conn = sqlite3.connect(import_db_name,
                timeout = self.TIMEOUT,
                # use the declared type to determine the approriate converter/adapter
                # needed for date storage
                detect_types = sqlite3.PARSE_DECLTYPES
                )
            self.conn.row_factory = self.__key_val_description
            # create the tables if not existing
            self.__create()
            # upgrade db to latest layout
            self.__upgrade_db()
        except sqlite3.Error as e:
            raise DatabaseOpenError(import_db_name, caused_by = e), None, sys.exc_info()[2]
Example #4
0
    def __init__(self, config_filename = None, import_db = None):
        '''
        Load the settings, set the androguard path and create the storage.

        Parameters
        ----------
        config_filename : str, optional (default is `settings.CONFIG_PATH`)
            The path to the config to load.
            `None` selects `settings.CONFIG_PATH`.
        import_db : str, optional (default is None)
            Path to the import db.
            The value is resolved through `_get_import_db`.
        '''
        # the parameter is documented as optional -> None falls back to the default config
        if config_filename is None:
            config_filename = settings.CONFIG_PATH

        # type: Settings
        self.__settings = Settings(config_filename, default_path = settings.DEFAULTS_PATH)

        log.debug("config file settings: %s\n\tCLI options may overwrite them!", self.__settings)

        # load and set androguard path from configs
        Util.set_androguard_path(self.settings)

        # type: str
        import_db = self._get_import_db(import_db = import_db)
        log.info("Using import database: %s", import_db)

        # the storage is built on top of the resolved import database
        self.__storage = self._create_storage(import_db)
Example #5
0
 def __init__(self, aws_id, aws_key, aws_bucket_name, s3_hostname = None):
     '''
     Create the S3 connection and open the bucket holding the APKs.

     Parameters
     ----------
     aws_id : str
         ID of the Amazon AWS account.
     aws_key : str
         Key of the Amazon AWS account.
     aws_bucket_name : str
         Bucket name where the APKs are stored.
         Must not contain a '.'.
     s3_hostname : str, optional (default is None)
         The URL for the S3 storage.
         E.g. "s3-eu-west-1.amazonaws.com"

     Raises
     ------
     RuntimeError
         If `aws_bucket_name` contains a '.'.
     S3StorageOpenError
         If getting the bucket raised `BotoClientError` or `S3ResponseError`.
     '''
     self._s3_conn = S3Connection(aws_id, aws_key, host=s3_hostname)
     self._apk_bucket_name = aws_bucket_name

     # bucket names with dots are rejected
     name_has_dot = "." in aws_bucket_name
     if name_has_dot:
         error_msg = "Do not use '.' inside the bucket name: '%s'" % aws_bucket_name
         raise RuntimeError(error_msg)

     try:
         bucket = self.s3_conn.get_bucket(self.apk_bucket_name)
         self._apk_bucket = bucket

         log.info("opening %s", self)
     except (BotoClientError, S3ResponseError) as e:
         raise S3StorageOpenError(self.apk_bucket_name, caused_by = e)
Example #6
0
    def __init__(self,
                 storage,
                 script_list,
                 script_hashes,
                 min_script_needs,
                 apks_or_paths,
                 concurrency=None):
        '''
        Set up the analyzer and the queues used by the worker processes.

        See :py:meth:`.BaseAnalyzer.__init__` for details on the first attributes.

        Other Parameters
        ----------------
        concurrency : int, optional (default is number of cpu cores)
            Number of workers to spawn.
        '''
        base_args = (storage, script_list, script_hashes, min_script_needs,
                     apks_or_paths)
        super(ParallelAnalyzer, self).__init__(*base_args)

        # no explicit value -> one worker per cpu core
        self.__concurrency = cpu_count() if concurrency is None else concurrency

        log.info("concurrency: %s", self.concurrency)
        log.info("Using processes")

        # queue holding the work, the list of workers is filled later
        self.__work_queue = Queue()
        self.__work_queue.cancel_join_thread()
        self.__workers = []

        # second queue for the analyzed apks
        self.__analyzed_apks = Queue()
Example #7
0
    def __init__(self, aws_id, aws_key, aws_bucket_name, s3_hostname=None):
        '''
        Connect to S3 and open the bucket in which the APKs are stored.

        Parameters
        ----------
        aws_id : str
            ID of the Amazon AWS account.
        aws_key : str
            Key of the Amazon AWS account.
        aws_bucket_name : str
            Bucket name where the APKs are stored.
            A name containing '.' is rejected.
        s3_hostname : str, optional (default is None)
            The URL for the S3 storage.
            E.g. "s3-eu-west-1.amazonaws.com"

        Raises
        ------
        RuntimeError
            If the bucket name contains a '.'.
        S3StorageOpenError
            If `BotoClientError` or `S3ResponseError` was raised while getting the bucket.
        '''
        self._s3_conn = S3Connection(aws_id, aws_key, host=s3_hostname)
        self._apk_bucket_name = aws_bucket_name

        # validate the bucket name before the bucket is requested
        if "." in aws_bucket_name:
            msg_template = "Do not use '.' inside the bucket name: '%s'"
            raise RuntimeError(msg_template % aws_bucket_name)

        try:
            opened_bucket = self.s3_conn.get_bucket(self.apk_bucket_name)
            self._apk_bucket = opened_bucket

            log.info("opening %s", self)
        except (BotoClientError, S3ResponseError) as e:
            raise S3StorageOpenError(self.apk_bucket_name, caused_by=e)
    def __init__(self,
                 storage, script_list, script_hashes, min_script_needs, apks_or_paths,
                 concurrency = None):
        '''
        Initialize the base analyzer and create the queues for the workers.

        See :py:meth:`.BaseAnalyzer.__init__` for details on the first attributes.

        Other Parameters
        ----------------
        concurrency : int, optional (default is number of cpu cores)
            Number of workers to spawn.
        '''
        parent = super(ParallelAnalyzer, self)
        parent.__init__(storage, script_list, script_hashes, min_script_needs, apks_or_paths)

        # default: as many workers as cpu cores
        if concurrency is not None:
            self.__concurrency = concurrency
        else:
            self.__concurrency = cpu_count()

        log.info("concurrency: %s", self.concurrency)
        log.info("Using processes")

        # work queue and (still empty) list of workers
        self.__work_queue = Queue()
        self.__work_queue.cancel_join_thread()
        self.__workers = []

        # queue for the analyzed apks
        self.__analyzed_apks = Queue()
Example #9
0
 def __del__(self):
     ''' Close the database connection, if there is one.

     A `sqlite3.Error` is only logged as warning.
     '''
     try:
         log.info("Closing database %s", self.__db_name)
         # nothing to close without a connection
         if self.conn is None:
             return
         self.conn.close()
     except sqlite3.Error as e:
         log.warn(e)
 def __del__(self):
     ''' Close database.

     Does nothing if no connection is set.
     Errors from sqlite are logged (warning) and not raised.
     '''
     try:
         log.info("Closing database %s", self.__db_name)
         has_connection = self.conn is not None
         if has_connection:
             self.conn.close()
     except sqlite3.Error as close_error:
         log.warn(close_error)
Example #11
0
    def __setup_db(self):
        ''' Create the result database storage once.

        If `result_database_storage` is already set, nothing is done.
        Otherwise it is created from the config via
        `ResultDatabaseStorage.factory_from_config`.

        Raises
        ------
        StorageException
            Error while opening.
        '''
        # already set up -> keep the existing storage
        if self.result_database_storage is not None:
            return

        log.info("setup_db ...")
        storage = ResultDatabaseStorage.factory_from_config(settings)
        self.result_database_storage = storage
Example #12
0
    def __setup_scripts_reuse(self, androscripts, script_hashes):
        '''
        Set up the scripts and reuse the already instantiated ones if possible.

        The given hashes are compared (as tuple) with the stored `script_hashes`.
        If they are equal, the existing scripts are only reset.
        Otherwise the scripts are imported again and instantiated anew.

        Parameters
        ----------
        androscripts : list<str>
            List of package names.
        script_hashes : list<str>
            If given, set the hash for the `AndroScript`s

        Raises
        ------
        AnalyzeError
            If an NoAndroScriptSubclass, IOError or ModuleNotSameClassNameException has been raised.
        ImportError
        '''
        # a tuple is needed for the comparison
        script_hashes = tuple(script_hashes)

        # The comparison is order sensitive: the same scripts in a different
        # order are not reused. Reuse is rather intended for the same analysis,
        # where the order is kept.
        if script_hashes != self.script_hashes:
            log.info("reloading scripts cause hashes changed ... ")

            # (re)import the script modules
            script_types = ScriptUtil.import_scripts(androscripts,
                                                     via_package=True,
                                                     _reload=True)

            # fresh instances for the imported classes
            self.androscripts = ScriptUtil.instantiate_scripts(
                script_types, script_hashes=script_hashes)

            # remember the hashes for the next comparison
            self.script_hashes = script_hashes
            return

        # hashes equal -> the scripts only have to be reset
        log.info("reusing scripts ... ")
        for script in self.androscripts:
            script.reset()
Example #13
0
    def __setup_db(self):
        ''' Open the result database if no storage has been set yet.

        The storage is built with `ResultDatabaseStorage.factory_from_config`
        and kept in `result_database_storage`, so later calls return immediately.

        Raises
        ------
        StorageException
            Error while opening.
        '''
        needs_setup = self.result_database_storage is None
        if not needs_setup:
            return

        log.info("setup_db ...")

        self.result_database_storage = ResultDatabaseStorage.factory_from_config(settings)
Example #14
0
    def __setup_scripts_reuse(self, androscripts, script_hashes):
        '''
        Reuse the current scripts if their hashes did not change, otherwise reload them.

        Equal hashes -> every script in `self.androscripts` is reset.
        Different hashes -> the scripts are imported and instantiated again
        and the new hashes are stored for the next call.

        Parameters
        ----------
        androscripts : list<str>
            List of package names.
        script_hashes : list<str>
            If given, set the hash for the `AndroScript`s

        Raises
        ------
        AnalyzeError
            If an NoAndroScriptSubclass, IOError or ModuleNotSameClassNameException has been raised.
        ImportError
        '''

        # compare as tuple
        script_hashes = tuple(script_hashes)

        # Simple comparison: same scripts in another order count as changed.
        # That is fine, because reuse is meant for one analysis (order is kept).
        if script_hashes != self.script_hashes:
            log.info("reloading scripts cause hashes changed ... ")

            # import the script modules (again)
            reloaded_types = ScriptUtil.import_scripts(androscripts, via_package = True, _reload = True)

            # create the script instances
            self.androscripts = ScriptUtil.instantiate_scripts(reloaded_types, script_hashes = script_hashes)

            # hashes for the next comparison
            self.script_hashes = script_hashes
        else:
            log.info("reusing scripts ... ")
            for androscript in self.androscripts:
                androscript.reset()
Example #15
0
    def get_apk(self, _hash, **kwargs):
        '''
        Get the `EAndroApk` from `_hash`.

        Parameters
        ----------
        _hash : str
            Hash of the .apk (sha256)

        Raises
        ------
        DatabaseLoadException
        NoFile
            If the file is not present.

        Returns
        -------
        EAndroApk
            Apk constructed from raw data and meta infos.
        '''
        try:
            gridfs = self.__apk_coll
            log.info("getting apk: %s from mongodb ...", _hash)
            gridfs_obj = gridfs.get(_hash)
            # get raw .apk
            apk_zipfile = gridfs_obj.read()

            # get apk meta infos
            apk_meta = gridfs_obj.metadata
            package_name, version_name, path, _hash, import_date, tag = apk_meta[RESOBJ_APK_META_PACKAGE_NAME], apk_meta[RESOBJ_APK_META_VERSION_NAME], apk_meta[RESOBJ_APK_META_PATH], apk_meta[RESOBJ_APK_META_HASH], apk_meta[RESOBJ_APK_META_IMPORT_DATE], apk_meta[RESOBJ_APK_META_TAG]

            # use to hold apk meta infos
            fast_apk = FastApk(package_name, version_name, path, _hash, import_date, tag)
            eandro_apk = AnalyzeUtil.open_apk(apk_zipfile, fast_apk, raw = True)

            log.info("got apk")
            return eandro_apk
        except NoFile:
            raise
        except PyMongoError as e:
            raise DatabaseLoadException(self, content = "Apk (hash=%s)" % _hash, caused_by = e), None, sys.exc_info()[2]
Example #16
0
            def create_analyzer():
                ''' Create the analyzer matching `mode`.

                The analyzer class is imported only when its mode is selected.
                Returns None for any other mode.
                '''
                # positional arguments every analyzer gets
                base_args = (storage, androscript_list, script_hashes, min_script_needs, apks_or_paths)
                log.info("Mode: %s", mode)

                # normal analyzer
                if mode == ANALYZE_MODE_NON_PARALLEL:
                    from androlyze.analyze.Analyzer import Analyzer
                    return Analyzer(*base_args)

                # parallel analyzer
                if mode == ANALYZE_MODE_PARALLEL:
                    from androlyze.analyze.parallel.ParallelAnalyzer import ParallelAnalyzer
                    return ParallelAnalyzer(*base_args, concurrency = concurrency)

                # distributed analyzer
                if mode == ANALYZE_MODE_DISTRIBUTED:
                    from androlyze.analyze.distributed.DistributedAnalyzer import DistributedAnalyzer
                    return DistributedAnalyzer(*base_args, concurrency = concurrency, serialize_apks = serialize_apks)

                # unknown mode
                return None
Example #17
0
    def prefetch_apk(self, task_id, task, *args, **kwargs):
        ''' Prefetch the apk of a task into `apk_prefetch_pool`.

        The task arguments are read from `kwargs["args"]`.
        Only if they carry a hash, the apk is loaded from the apk storage
        and stored in the pool under that hash.

        `NoFile`, `DatabaseOpenError` and `DatabaseLoadException` are logged and not raised.
        NOTE(review): no retry is triggered here after a failed prefetch;
        presumably the task fetches the apk itself later -- confirm.
        '''
        try:
            # the db is opened if that did not happen yet
            self.__setup_db()

            task_args = kwargs["args"]
            _, _, _, apk_zipfile_or_hash, is_hash, fast_apk = task_args
            # raw apk data has been sent -> nothing to prefetch
            if not is_hash:
                return

            # load the apk from the apk storage
            eandro_apk = self.__get_apk_from_storage(apk_zipfile_or_hash, apk = fast_apk)
            if eandro_apk is None:
                return

            # keep it in the prefetch pool
            apk_prefetch_pool[apk_zipfile_or_hash] = eandro_apk
            log.info("prefetched: %s, size apk cache: %d", eandro_apk.short_description(), len(apk_prefetch_pool))
        except (NoFile, DatabaseOpenError, DatabaseLoadException) as e:
            log.exception(e)
Example #18
0
def set_androguard_path(settings):
    ''' Append the androguard path read from `settings` to `sys.path`.

    Nothing is done if `androguard` can already be imported.

    Parameters
    ----------
    settings : Settings
    '''

    # importable -> the path is already set
    try:
        import androguard
    except ImportError:
        pass
    else:
        return

    from androlyze.settings import SECTION_ANDROGUARD, KEY_ANDROGUARD_PATH

    androguard_path = settings[(SECTION_ANDROGUARD, KEY_ANDROGUARD_PATH)]
    # the location has to be known before any androguard stuff gets imported
    sys.path.append(androguard_path)
    log.info('appending "%s" to sys.path', androguard_path)
Example #19
0
    def prefetch_apk(self, task_id, task, *args, **kwargs):
        ''' Load the apk for a task in advance and keep it in `apk_prefetch_pool`.

        The arguments of the task are taken from `kwargs["args"]`.
        An apk is only fetched from the apk storage if the task was sent with a hash.

        `NoFile`, `DatabaseOpenError` and `DatabaseLoadException` are logged only.
        NOTE(review): a failed prefetch does not trigger a retry in this method;
        presumably the task loads the apk on its own afterwards -- confirm.
        '''
        try:
            # open db if not already opened
            self.__setup_db()

            _, _, _, apk_zipfile_or_hash, is_hash, fast_apk = kwargs["args"]
            prefetched_apk = None
            if is_hash:
                # the hash identifies the apk in the apk storage
                prefetched_apk = self.__get_apk_from_storage(
                    apk_zipfile_or_hash, apk=fast_apk)

            if prefetched_apk is not None:
                # put it into the pool, the hash is the key
                apk_prefetch_pool[apk_zipfile_or_hash] = prefetched_apk
                log.info("prefetched: %s, size apk cache: %d",
                         prefetched_apk.short_description(),
                         len(apk_prefetch_pool))
        except (NoFile, DatabaseOpenError, DatabaseLoadException) as e:
            log.exception(e)
Example #20
0
    def run(self, androscripts, min_script_needs, script_hashes, apk_zipfile_or_hash, is_hash = True, fast_apk = None):
        '''
        Do the analysis on the apk with the given scripts.

        The apk is opened from the raw data or, if a hash is given,
        taken from the prefetch pool or loaded from the apk storage.
        The entry of the apk is removed from the prefetch pool in any case.

        Parameters
        ----------
        androscripts : list<str>
            List of package names.
        min_script_needs : tuple<bool>
            See :py:meth:`ScriptUtil.get_maximal_script_options`.
        script_hashes : list<str>
            If given, set the hash for the `AndroScript`s
        apk_zipfile_or_hash : str
            The raw content of the .apk file (zipfile) or the hash of it (sha256, id in db).
        is_hash : bool, optional (default is True)
            Determines if `apk_zipfile_or_hash` is a hash (id).
        fast_apk : FastApk, optional (default is None)
            Holds the meta infos for the apk.

        Raises
        ------
        SoftTimeLimitExceeded
            Logged as warning and raised again.
        ScriptHashValidationError
            Raised again unchanged.

        Returns
        -------
        tuple<tuple<str, bool>>
            First component is the id of the entry
            and the second a boolean indication if the result has been stored in gridfs.
        ()
            If the apk could not be opened, the analysis returned no result
            or no storage results are available.
        '''
        try:
            # remember the arguments of this call (method retry_arguments)
            self.__retry_arguments = androscripts, min_script_needs, script_hashes, apk_zipfile_or_hash, is_hash, fast_apk
            eandro_apk = None
            do_script_hash_validation = settings.script_hash_validation_enabled()

            # open database/apk storage if not already done
            # reschedule job if connection/open error
            self.__open_db()
            self.__open_apk_storage()

            # setup scripts
            if do_script_hash_validation:
                # validate sent hashes with local script hashes
                self.__setup_scripts_hash_validation(androscripts, script_hashes)
            else:
                # reuse if possible
                self.__setup_scripts_reuse(androscripts, script_hashes)

            # open apk
            if not is_hash:
                log.info("opening apk via raw data ... ")
                eandro_apk = AnalyzeUtil.open_apk(apk_or_path = apk_zipfile_or_hash, apk = fast_apk, raw = True)
            else:
                # get apk from prefetched apk pool
                eandro_apk = apk_prefetch_pool.get(apk_zipfile_or_hash, None)
                # not in the pool (could not prefetch) -> load it from the storage
                if eandro_apk is None:
                    eandro_apk = self.__get_apk_from_storage_retry(apk_zipfile_or_hash, apk = fast_apk)

            # if None, could not be opened and error has been logged
            if eandro_apk is not None:
                # scripts are only reset by the analysis if hash validation is disabled
                result = AnalyzeUtil.analyze_apk(eandro_apk, self.androscripts, min_script_needs, propagate_error = False, reset_scripts = not do_script_hash_validation)

                if result is not None:
                    fastapk, script_results = result

                    log.info("analyzed %s", fastapk.short_description())
                    storage_results = self.__store_results(fastapk, script_results)
                    # can be None if an error occurred
                    if storage_results:
                        return tuple(storage_results)

            # nothing analyzed or nothing stored
            return ()
        except SoftTimeLimitExceeded:
            log.warn("Task %s exceeded it's soft time limit!", self)
            raise
        except ScriptHashValidationError:
            raise
        finally:
            # delete from pool -> we don't need it anymore in the pool
            if is_hash and apk_zipfile_or_hash in apk_prefetch_pool:
                del apk_prefetch_pool[apk_zipfile_or_hash]
Example #21
0
    def _analyze(self):
        ''' See doc of :py:meth:`.BaseAnalyzer.analyze`.

        Publish one analyze task per apk, wait for the results
        and revoke the tasks if the analysis gets interrupted.

        Returns
        -------
        int
            0 if a `NetworkError` or `DatabaseOpenError` occurred.
            Otherwise the return value of `stop_analysis_view`
            (the number of analyzed apks).
        '''

        # try to get registered workers
        # if network fails at this point -> stop analysis
        try:
            clilog.info(CeleryUtil.get_workers_and_check_network())
        except NetworkError as e:
            log.critical(e)
            return 0

        # storage objects
        storage = self.storage

        clilog.info("Number of apks to analyze: %d", self._cnt_apks)

        try:
            # get analyze task
            analyze_task = tasks[CeleryConstants.get_analyze_task_name()]

            # create storage
            storage.create_or_open_sub_storages()

            # send tasks
            start = time()

            # apk generator over .apk or apk hashes
            apk_gen = AnalyzeUtil.apk_id_or_raw_data_gen(
                self.apks, force_raw_data=self.serialize_apks)

            clilog.info("Task publishing progress:")

            # send and serialize .apks
            # if analysis via path serialize them!
            if self.serialize_apks:
                log.info("sending .apks to message broker")
                # the group result is set before publishing and filled task by task
                self.group_result = group_result = GroupResult(results=[])

                for args in self.send_apk_args_generator(apk_gen):
                    task = analyze_task.delay(*args)
                    group_result.add(task)

            # send only apk id and let fetch via mongodb
            else:
                log.info("sending ids of apks")

                task_group = group(
                    (analyze_task.s(*args)
                     for args in self.send_id_args_generator(apk_gen)))

                # publish tasks
                self.group_result = task_group()

            log.info("sending took %ss", (time() - start))
            sys.stderr.write("\nAnalysis progress:\n")

            # start showing analysis progress
            self.analyze_stats_view.start()

            # wait for results
            log.debug("joining on ResultGroup ... ")

            # setup callback
            callback_func = self.get_callback_func(self.success_handler,
                                                   self.error_handler)
            CeleryUtil.join_native(self.group_result,
                                   propagate=False,
                                   callback=callback_func)

            clilog.info("\nanalysis done ... ")
            log.info("distributed analysis took %ss", (time() - start))

            return self.stop_analysis_view()
        except DatabaseOpenError as e:
            log.critical(e)
            return 0

        # user interrupt or any other error -> revoke the tasks
        except (KeyboardInterrupt, Exception) as e:
            # only real errors are logged with traceback
            if not isinstance(e, KeyboardInterrupt):
                log.exception(e)
            log.warn(
                "Interrupting distributed analysis ... Please wait a moment!")
            log.warn("revoking tasks on all workers ...")

            if celerysettings.CELERY_TASK_REVOCATION_ENABLED:
                # revoke tasks
                if self.group_result is None:
                    # revoke via task ids
                    log.debug("revoking while publishing tasks ...")

                    self.task_collection.revoke_all(terminate=True,
                                                    signal='SIGKILL')
                else:
                    # revoke via GroupResult if yet available/created
                    # first available after all tasks have been sent
                    self.group_result.revoke(terminate=True, signal='SIGKILL')
                log.warn("revoked tasks and killed workers ...")

            # return number of analyzed apks
            return self.stop_analysis_view()
Example #22
0
    def __init__(
        self,
        # import db stuff
        import_db_name,
        # file system stuff
        store_root_dir=None,
        # result db stuff
        result_db_name=None,
        result_db_addr=None,
        result_db_port=None,
        # auth
        result_db_username=None,
        result_db_passwd=None,
        # result db ssl stuff
        result_db_use_ssl=False,
        ssl_ca_cert=None,
        # set an apk storage
        distributed_apk_storage_factory=None,
    ):
        """
        Remember all settings needed to create the storages on demand.

        No storage is created here, the storage attributes start as None.

        Parameters
        ----------
        import_db_name : str
            Name of the database to use.
        store_root_dir: str, optional (default is None)
            Holds the path under which results will be stored.
            If no path is given, nothing will be stored in the file system at all.
        result_db_name : str, optional (default is None)
            The name of the database to use.
            Will be created if not already existing.
        result_db_addr : str, optional (default is None)
            Address of mongodb database server.
        result_db_port : int, optional (default is None)
            Port of mongodb database server.
        result_db_username : str, optional (default is None)
            No authentication at all.
        result_db_passwd : str, optional (default is None)
            No authentication at all.
        result_db_use_ssl : bool, optional (default is False)
            Use ssl for the connection.
        ssl_ca_cert : str, optional (default is None)
            The CA certificate.
        distributed_apk_storage_factory : function, optional (default is None)
            A function returning an object implementing the `ApkCopyInterface`.
            Use the function to create the storage only on demand.

        NOTE(review): for name, address and port the value None is presumably
        replaced by a default ("res", '127.0.0.1', 27017) when the result
        database gets created -- confirm.
        """
        # import database and file system
        self.__import_db_name = import_db_name
        self.__store_root_dir = store_root_dir

        # result database, including the ssl options
        self.__result_db_name = result_db_name
        self.__result_db_addr = result_db_addr
        self.__result_db_port = result_db_port
        self.__result_db_use_ssl = result_db_use_ssl
        self.__result_db_ca_cert = ssl_ca_cert

        # The credentials are kept for the lazy creation of the database.
        # Do not forget to delete them after the db creation!
        self.__username = result_db_username
        self.__passwd = result_db_passwd

        # factory for the distributed apk storage
        self.__apk_storage_factory = distributed_apk_storage_factory

        # nothing is created yet, this happens on demand via the getters
        self.__apk_distributed_storage = None
        self.__import_db_storage = None
        self.__fs_storage = None
        self.__result_db_storage = None
        self.__apk_storage = None

        fs_disabled = self.fs_storage_disabled()
        if fs_disabled:
            log.info("File system result writing disabled!")
Example #23
0
    def _analyze(self):
        ''' See doc of :py:meth:`.BaseAnalyzer.analyze`.

        Start the workers, fill the work queue with the apks
        and block until the queue has been worked off.

        The first `KeyboardInterrupt` clears the work queue and waits
        for the workers (hot shutdown). A second one while waiting
        sends SIGINT to all workers (cold shutdown).

        Returns
        -------
        The value of `cnt_analyzed_apks`.
        '''
        try:
            work_queue = self.work_queue

            # create worker pool
            log.debug("starting %s workers ...", self.concurrency)
            for _ in range(self.concurrency):
                p = Worker(self.script_list, self.script_hashes,
                           self.min_script_needs, work_queue, self.storage,
                           self.cnt_analyzed_apks, self.analyzed_apks,
                           self.storage_results)
                self.workers.append(p)
                p.daemon = True

            # start workers
            for p in self.workers:
                p.start()

            # queue has size limit -> start workers first then enqueue items
            log.info("Loading apk paths into work queue ...")
            for apk_stuff in AnalyzeUtil.apk_gen(self.apks_or_paths):
                # task is apk with all scripts
                work_queue.put(apk_stuff)

            # one sentinel per worker
            for _ in range(self.concurrency):
                # signal end-of-work
                work_queue.put(STOP_SENTINEL)

            # progress view for cli
            av = AnalysisStatsView(self.cnt_analyzed_apks, self._cnt_apks,
                                   self.analyzed_apks)
            av.daemon = True
            av.start()

            # block until workers finished
            work_queue.join()
            av.terminate()
            log.debug("joined on work queue ...")

            return self.cnt_analyzed_apks.value

        # try hot shutdown first
        except KeyboardInterrupt:
            log.warn("Hot shutdown ... ")
            try:
                # drop the work which has not been started yet
                log.warn("clearing work queue ... ")
                Util.clear_queue(work_queue)
                log.warn("cleared work queue ... ")

                for _ in range(self.concurrency):
                    # signal end-of-work
                    work_queue.put(STOP_SENTINEL)

                for worker in self.workers:
                    worker.join()
                log.warn("waited for all workers ... ")

                return self.cnt_analyzed_apks.value

            # if user really wants make a cold shutdown -> kill processes
            except KeyboardInterrupt:
                log.warn("Cold shutdown ... ")
                log.warn("Hard shutdown wanted! Killing all workers!")

                # kill processes via SIGINT -> send CTRL-C
                for w in self.workers:
                    try:
                        os.kill(w.pid, signal.SIGINT)
                    # best effort: any error while signalling a worker is ignored
                    except:
                        pass

                return self.cnt_analyzed_apks.value
Example #24
0
def create_analyzer(storage, script_list, apks_or_paths = None,
                   mode = ANALYZE_MODE_PARALLEL, concurrency = None,
                   serialize_apks = True
                   ):
    '''
    Create the analyzer only.

    Parameters
    ----------
    storage : RedundantStorage
        The store to use.
    script_list : list<str>
        List of paths to scripts (complete filename with extension).
    apks_or_paths: list<str> or list<Apk>, optional (default is None)
        List of `Apk` or paths to the apks which shall be analyzed with the given scripts
        If you analyze from paths the `import_date` is not set!
    mode : str, optional (default is `ANALYZE_MODE_PARALLEL`)
        Analysis mode. One of `ANALYZE_MODE_NON_PARALLEL`, `ANALYZE_MODE_PARALLEL`
        or `ANALYZE_MODE_DISTRIBUTED`.
    concurrency : int, optional (default is number of cpu cores)
        Number of workers to spawn.
        Only handed to the parallel and the distributed analyzer.
    serialize_apks : bool, optional (default is True)
        If true, serialize .apk .
        Otherwise id (hash) of the apk will be send and fetched by the worker from the result db.
        Be sure to import the apks to the result db first!
        Only handed to the distributed analyzer.

    Returns
    -------
    The analyzer matching `mode`.
    None
        If no scripts could be instantiated, `apks_or_paths` is empty,
        `mode` is unknown or an error occurred (the error gets logged).

    Notes
    -----
    An `IOError` while loading the scripts terminates the process via `sys.exit(1)`.
    '''
    from androlyze.model.script import ScriptUtil
    from androlyze.analyze.exception import AndroScriptError

    try:
        # list<type<AndroScript>>
        androscript_list = ScriptUtil.import_scripts(script_list)
        instantiated_scripts = sorted(ScriptUtil.instantiate_scripts(androscript_list, script_paths = script_list))

        if not instantiated_scripts:
            log.warn("No scripts supplied!")
            return None

        # get hashes for `AndroScript`s so that we can set the hash directly next time we instantiate the script
        script_hashes = [s.hash for s in instantiated_scripts]
        min_script_needs = ScriptUtil.get_minimum_script_options(instantiated_scripts)

        # log infos about scripts
        clilog.info('Loaded scripts:\n%s', '\n'.join(str(s) for s in instantiated_scripts))
        log.info(ScriptUtil.androscript_options_descr(instantiated_scripts))

        # nothing to analyze -> no analyzer
        if not apks_or_paths:
            return None

        # positional arguments shared by all analyzers (see BaseAnalyzer)
        args = storage, androscript_list, script_hashes, min_script_needs, apks_or_paths
        log.info("Mode: %s", mode)

        # the analyzer modules are imported lazily, only the selected one gets loaded

        # normal analyzer
        if mode == ANALYZE_MODE_NON_PARALLEL:
            from androlyze.analyze.Analyzer import Analyzer
            return Analyzer(*args)

        # use parallel analyzer
        if mode == ANALYZE_MODE_PARALLEL:
            from androlyze.analyze.parallel.ParallelAnalyzer import ParallelAnalyzer
            return ParallelAnalyzer(*args, concurrency = concurrency)

        # use distributed one
        if mode == ANALYZE_MODE_DISTRIBUTED:
            from androlyze.analyze.distributed.DistributedAnalyzer import DistributedAnalyzer
            return DistributedAnalyzer(*args, concurrency = concurrency, serialize_apks = serialize_apks)

        # don't hand back None silently for a mistyped mode
        log.warn("Unknown analysis mode: %s", mode)
        return None

    except ApkImportError as e:
        log.warn(e)
    except IOError as e:
        log.warn(AndroScriptError(e.filename, caused_by = e))
        sys.exit(1)
    except Exception as e:
        # also covers ImportError (e.g. a script or analyzer module could not be imported)
        log.exception(e)

    return None
    def _analyze(self):
        ''' See doc of :py:method:`.BaseAnalyzer.analyze`.

        Creates `self.concurrency` daemon workers, feeds them through
        `self.work_queue` and blocks until the queue has been processed.

        A first CTRL-C triggers a "hot" shutdown: the queue gets cleared,
        stop sentinels are enqueued and the workers are joined.
        A second CTRL-C during the hot shutdown triggers a "cold" shutdown:
        SIGINT is sent to every worker.

        Returns
        -------
        The value of the shared counter `cnt_analyzed_apks`.
        '''
        # bound outside of the `try` because the interrupt handlers need it too
        work_queue = self.work_queue

        try:
            # create worker pool
            log.debug("starting %s workers ...", self.concurrency)
            for _ in range(self.concurrency):
                p = Worker(self.script_list, self.script_hashes, self.min_script_needs,
                                                 work_queue, self.storage,
                                                 self.cnt_analyzed_apks, self.analyzed_apks, self.storage_results)
                self.workers.append(p)
                p.daemon = True

            # start workers
            for p in self.workers:
                p.start()

            # queue has size limit -> start workers first then enqueue items
            log.info("Loading apk paths into work queue ...")
            for apk_stuff in AnalyzeUtil.apk_gen(self.apks_or_paths):
                # task is apk with all scripts
                work_queue.put(apk_stuff)

            for _ in range(self.concurrency):
                # signal end-of-work (one sentinel per worker)
                work_queue.put(STOP_SENTINEL)

            # progress view for cli
            av = AnalysisStatsView(self.cnt_analyzed_apks, self._cnt_apks, self.analyzed_apks)
            av.daemon = True
            av.start()

            # block until workers finished
            work_queue.join()
            av.terminate()
            log.debug("joined on work queue ...")

            return self.cnt_analyzed_apks.value

        # try hot shutdown first
        except KeyboardInterrupt:
            log.warn("Hot shutdown ... ")
            try:
                log.warn("clearing work queue ... ")
                Util.clear_queue(work_queue)
                log.warn("cleared work queue ... ")

                for _ in range(self.concurrency):
                    # signal end-of-work
                    work_queue.put(STOP_SENTINEL)

                for worker in self.workers:
                    worker.join()
                log.warn("waited for all workers ... ")

                return self.cnt_analyzed_apks.value

            # if user really wants make a cold shutdown -> kill processes
            except KeyboardInterrupt:
                log.warn("Cold shutdown ... ")
                log.warn("Hard shutdown wanted! Killing all workers!")

                # kill processes via SIGINT -> send CTRL-C
                for w in self.workers:
                    try:
                        os.kill(w.pid, signal.SIGINT)
                    except Exception as e:
                        # best effort: the worker may already be gone or never have been started.
                        # KeyboardInterrupt/SystemExit are deliberately not swallowed here.
                        log.debug("could not send SIGINT to worker %s: %s", w.pid, e)

                return self.cnt_analyzed_apks.value
Example #26
0
    def run(self,
            androscripts,
            min_script_needs,
            script_hashes,
            apk_zipfile_or_hash,
            is_hash=True,
            fast_apk=None):
        '''
        Analyze a single apk with the given scripts and store the results.

        Parameters
        ----------
        androscripts : list<str>
            List of package names.
        min_script_needs : tuple<bool>
            Script requirements, handed through to `AnalyzeUtil.analyze_apk`.
        script_hashes : list<str>
            If given, set the hash for the `AndroScript`s
        apk_zipfile_or_hash : str
            The raw content of the .apk file (zipfile) or the hash of it (sha256, id in db).
        is_hash : bool, optional (default is True)
            Determines if `apk_zipfile_or_hash` is a hash (id).
        fast_apk : FastApk, optional (default is None)
            Holds the meta infos for the apk.

        Returns
        -------
        tuple<tuple<str, bool>>
            First component is the id of the entry
            and the second a boolean indication if the result has been stored in gridfs.
        ()
            If the apk could not be opened, the analysis yielded nothing
            or no storage results are available.

        Raises
        ------
        SoftTimeLimitExceeded
            Re-raised after a warning has been logged.
        ScriptHashValidationError
            Passed on to the caller.
        '''
        try:
            # keep the call arguments around (stored in `__retry_arguments`)
            self.__retry_arguments = (androscripts, min_script_needs, script_hashes,
                                      apk_zipfile_or_hash, is_hash, fast_apk)
            validate_hashes = settings.script_hash_validation_enabled()

            # open database/apk storage if not already done
            # (original note: the job gets rescheduled on a connection/open error)
            self.__open_db()
            self.__open_apk_storage()

            # setup scripts: either validate the sent hashes against the local
            # script hashes or reuse the scripts if possible
            if validate_hashes:
                self.__setup_scripts_hash_validation(androscripts, script_hashes)
            else:
                self.__setup_scripts_reuse(androscripts, script_hashes)

            # open apk
            if is_hash:
                # prefer the prefetched apk, fall back to the storage
                apk = apk_prefetch_pool.get(apk_zipfile_or_hash, None)
                if apk is None:
                    apk = self.__get_apk_from_storage_retry(apk_zipfile_or_hash, apk=fast_apk)
            else:
                log.info("opening apk via raw data ... ")
                apk = AnalyzeUtil.open_apk(apk_or_path=apk_zipfile_or_hash,
                                           apk=fast_apk,
                                           raw=True)

            # could not be opened -> error has already been logged
            if apk is None:
                return ()

            analysis = AnalyzeUtil.analyze_apk(apk,
                                               self.androscripts,
                                               min_script_needs,
                                               propagate_error=False,
                                               reset_scripts=not validate_hashes)
            if analysis is None:
                return ()

            fastapk, script_results = analysis
            log.info("analyzed %s", fastapk.short_description())

            # can be None if an error occurred
            stored = self.__store_results(fastapk, script_results)
            return tuple(stored) if stored else ()

        except SoftTimeLimitExceeded:
            log.warn("Task %s exceeded it's soft time limit!", self)
            raise
        except ScriptHashValidationError:
            # listed explicitly: validation errors go straight to the caller
            raise
        finally:
            # the apk is not needed in the prefetch pool anymore -> drop it
            if is_hash and apk_zipfile_or_hash in apk_prefetch_pool:
                del apk_prefetch_pool[apk_zipfile_or_hash]
Example #27
0
    def __init__(self, db_name = None, dest_addr = None, dest_port = None,
                # auth
                username = None, passwd = None,
                # ssl
                use_ssl = False, ssl_ca_certs = None,
                
                ):
        '''
        Create (if not existing) and open the database and collections.

        Parameters
        ----------
        db_name : str, optional (default is "res")
            The name of the database to use.
            Will be created if not already existing.
        dest_addr : str, optional (default is '127.0.0.1')
            Address of mongodb database server.
        dest_port : int, optional (default is 27017)
            Port of mongodb database server.
        username : str, optional (default is None)
            No authentication at all.
        passwd : str, optional (default is None)
            No authentication at all.
        use_ssl : bool, optional (default is False)
            Use ssl for the connection.
        ssl_ca_certs : str, optional (default is None)
            The CA certificate.
        
        Raises
        ------
        DatabaseOpenError
        '''

        # db name not allowed
        if db_name == APK_DB_NAME:
            raise DatabaseOpenError(db_name, msg = 'Database name "%s" reserved for apk storage!' % db_name), None, sys.exc_info()[2]

        # set default values
        if db_name is None:
            db_name = 'res'
        if dest_addr is None:
            dest_addr = '127.0.0.1'
        if dest_port is None:
            dest_port = 27017

        try:
            self.__db_name = db_name
            self.__dest_addr = dest_addr
            self.__dest_port = dest_port
            self.__use_ssl = use_ssl

            # only pass ssl parameters if ssl enabled
            ssl_params = dict(ssl = use_ssl, ssl_cert_reqs = ssl.CERT_NONE) if use_ssl else {}

            # set None cause if connection cannot be initiated, conn var will not in scope
            self.conn = None
            self.__conn = conn = pymongo.MongoClient(host = dest_addr, port = dest_port, **ssl_params)

            # authentication is per database!
            # do auth before probable db creation etc.
            if None not in (username, passwd):
                # authenticate if credentials given
                log.debug("authenticating with mongodb ...")
                conn["admin"].authenticate(username, passwd)
            else:
                log.debug("not authenticating with mongodb ... no credentials supplied!")

            self.__db = conn[self.db_name]

            # apk db
            self.__apk_db = conn[APK_DB_NAME]
            
            self.__apk_coll = gridfs.GridFS(self.__apk_db, GRIDFS_COLLS_PREFIX)

            # create/open collections
            self.__res_coll = self._open_res_coll()
            self.__files_coll = self.__db[GRIDFS_COLLS_PREFIX][GRIDFS_FILES]
            # grid fs for binary files, supports files > 16 mb
            self.__grid_fs = self._open_gridfs()

            # create indexes
            self._create_idx_for_colls()

            log.info("Opened database: %s", self)
            log.debug("CA certificate: %s", ssl_ca_certs)

        except PyMongoError as e:
            raise DatabaseOpenError(str(self), caused_by = e), None, sys.exc_info()[2]
Example #28
0
    def __init__(self,
                 # import db stuff
                import_db_name,
                # file system stuff
                store_root_dir = None,
                # result db stuff
                result_db_name = None, result_db_addr = None, result_db_port = None,
                # auth
                result_db_username = None, result_db_passwd = None,
                # result db ssl stuff
                result_db_use_ssl = False, ssl_ca_cert = None,

                # set an apk storage
                distributed_apk_storage_factory = None
                ):
        '''
        Only remembers the configuration.
        The storages themselves are created on demand.

        Parameters
        ----------
        import_db_name : str
            Name of the database to use.
        store_root_dir: str, optional (default is None)
            Holds the path under which results will be stored.
            If no path is given, nothing will be stored in the file system at all.
        result_db_name : str, optional (default is "res")
            The name of the database to use.
            Will be created if not already existing.
        result_db_addr : str, optional (default is '127.0.0.1')
            Address of mongodb database server.
        result_db_port : int, optional (default is 27017)
            Port of mongodb database server.
        result_db_username : str, optional (default is None)
            Username for the result database.
            If None (default): no authentication at all.
        result_db_passwd : str, optional (default is None)
            Password for the result database.
            If None (default): no authentication at all.
        result_db_use_ssl : bool, optional (default is False)
            Use ssl for the connection.
        ssl_ca_cert : str, optional (default is None)
            The CA certificate.
        distributed_apk_storage_factory : function, optional (default is None)
            A function returning an object implementing the `ApkCopyInterface`.
            Use the function to create the storage only on demand.
        '''
        # nothing gets opened here: the sub storages start out as None
        # and are created on demand via the getters
        self.__apk_distributed_storage = None
        self.__import_db_storage = None
        self.__fs_storage = None
        self.__result_db_storage = None
        self.__apk_storage = None

        # everything needed to create the storages later on
        self.__import_db_name = import_db_name
        self.__store_root_dir = store_root_dir
        self.__apk_storage_factory = distributed_apk_storage_factory

        # result db: location and ssl settings
        self.__result_db_name = result_db_name
        self.__result_db_addr = result_db_addr
        self.__result_db_port = result_db_port
        self.__result_db_use_ssl = result_db_use_ssl
        self.__result_db_ca_cert = ssl_ca_cert

        # result db: credentials, kept for the lazy creation of the database.
        # Don't forget to delete them after the db has been created!
        self.__username = result_db_username
        self.__passwd = result_db_passwd

        if self.fs_storage_disabled():
            log.info('File system result writing disabled!')
    def _analyze(self):
        ''' See doc of :py:method:`.BaseAnalyzer.analyze`.

        Publishes one analyze task per apk, either with the serialized .apk
        or only with its id, then joins on the results.

        On an interrupt or any other error the published tasks get revoked,
        provided that `celerysettings.CELERY_TASK_REVOCATION_ENABLED` is set.

        Returns
        -------
        0
            If the network check failed or the storage could not be opened.
        Otherwise the return value of `stop_analysis_view()`.
        '''
        # try to get registered workers
        # if the network fails at this point -> stop analysis
        try:
            clilog.info(CeleryUtil.get_workers_and_check_network())
        except NetworkError as net_err:
            log.critical(net_err)
            return 0

        # storage objects
        storage = self.storage

        clilog.info("Number of apks to analyze: %d", self._cnt_apks)

        try:
            # look up the analyze task and make sure the storages exist
            analyze_task = tasks[CeleryConstants.get_analyze_task_name()]
            storage.create_or_open_sub_storages()

            # time measurement starts with the task publishing
            started_at = time()

            # apk generator over .apk or apk hashes
            apk_gen = AnalyzeUtil.apk_id_or_raw_data_gen(self.apks, force_raw_data = self.serialize_apks)

            clilog.info("Task publishing progress:")

            if not self.serialize_apks:
                # send only apk id and let the worker fetch it via mongodb
                log.info("sending ids of apks")

                task_group = group((analyze_task.s(*args) for args in self.send_id_args_generator(apk_gen)))

                # publish tasks
                self.group_result = task_group()
            else:
                # send and serialize .apks
                # if analysis via path serialize them!
                log.info("sending .apks to message broker")
                self.group_result = group_result = GroupResult(results = [])

                for args in self.send_apk_args_generator(apk_gen):
                    group_result.add(analyze_task.delay(*args))

            log.info("sending took %ss", (time() - started_at))
            sys.stderr.write("\nAnalysis progress:\n")

            # start showing analysis progress
            self.analyze_stats_view.start()

            # wait for results
            log.debug("joining on ResultGroup ... ")

            # the callback dispatches to the success/error handler
            on_result = self.get_callback_func(self.success_handler, self.error_handler)
            CeleryUtil.join_native(self.group_result, propagate = False, callback = on_result)

            clilog.info("\nanalysis done ... ")
            log.info("distributed analysis took %ss", (time() - started_at))

            return self.stop_analysis_view()

        except DatabaseOpenError as db_err:
            log.critical(db_err)
            return 0

        except (KeyboardInterrupt, Exception) as err:
            # a user interrupt is expected, everything else gets logged with traceback
            if not isinstance(err, KeyboardInterrupt):
                log.exception(err)
            log.warn("Interrupting distributed analysis ... Please wait a moment!")
            log.warn("revoking tasks on all workers ...")

            if celerysettings.CELERY_TASK_REVOCATION_ENABLED:
                revoke_options = dict(terminate = True, signal = 'SIGKILL')

                if self.group_result is not None:
                    # revoke via GroupResult if yet available/created
                    # first available after all tasks have been send
                    self.group_result.revoke(**revoke_options)
                else:
                    # still publishing -> revoke via task ids
                    log.debug("revoking while publishing tasks ...")
                    self.task_collection.revoke_all(**revoke_options)

                log.warn("revoked tasks and killed workers ...")

            # return number of analyzed apks
            return self.stop_analysis_view()