Example #1
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        eco = arguments['ecosystem']
        pkg = arguments['name']
        ver = arguments['version']

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            if not Ecosystem.by_name(
                    StoragePool.get_connected_storage('BayesianPostgres').
                    session, eco).is_backed_by(EcosystemBackend.maven):
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = self.run_scancode(cache_path)
        return result_data
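Every task above guards its inputs with _strict_assert. A minimal sketch of such a helper, assuming a falsy argument should abort the task with a non-retryable error (FatalTaskError is borrowed from Example #3; the real helper may differ):

class FatalTaskError(Exception):
    """Fatal task error - the task will not be retried."""


class BaseTask:
    def _strict_assert(self, expr):
        # Fail fast instead of crashing later with a harder-to-debug KeyError.
        if not expr:
            raise FatalTaskError('strict assert failed in {}'.format(
                self.__class__.__name__))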
Example #2
    def execute(self, arguments):
        """Execute mercator and convert its output to JSON object."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by(
                EcosystemBackend.maven):
            # cache_path now points directly to the pom
            cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml()
        else:
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()
        return self.run_mercator(arguments, cache_path)
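run_mercator itself is not shown in this example. Assuming the mercator binary prints a JSON document describing the manifests it found, a sketch built on the same TimedCommand helper used in Example #3 might look like this (the exact CLI invocation and result shape are assumptions):

    def run_mercator(self, arguments, cache_path):
        # Sketch only: assumes `mercator <path>` emits JSON on stdout.
        items = TimedCommand.get_command_output(
            ['mercator', cache_path], graceful=False, is_json=True)
        return {'status': 'success', 'summary': [], 'details': items}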
Example #3
    def execute(self, arguments):
        """Run oscryptocatcher tool for matching crypto algorithms."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(
            arguments).get_extracted_source_tarball()

        results = {'status': 'unknown', 'summary': {}, 'details': []}

        try:
            oscc = TimedCommand.get_command_output(
                ['oscryptocatcher', '--subdir-in-result', cache_path],
                graceful=False,
                is_json=True)

            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception:
            raise FatalTaskError('oscryptocatcher failed')

        return results
Example #4
    def _python_scan(self, arguments):
        """Run OWASP dependency-check experimental analyzer for Python artifacts.

        https://jeremylong.github.io/DependencyCheck/analyzers/python.html
        """
        extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        # dependency-check needs to be pointed at a specific file;
        # we can't just scan the whole directory
        egg_info = pkg_info = metadata = None
        for root, _, files in os.walk(extracted_tarball):
            if root.endswith('.egg-info') or root.endswith('.dist-info'):
                egg_info = root
            if 'PKG-INFO' in files:
                pkg_info = os.path.join(root, 'PKG-INFO')
            if 'METADATA' in files:
                metadata = os.path.join(root, 'METADATA')
        scan_path = egg_info or pkg_info or metadata
        if pkg_info and not egg_info:
            # Work-around for dependency-check ignoring PKG-INFO outside .dist-info/
            # https://github.com/jeremylong/DependencyCheck/issues/896
            egg_info_dir = os.path.join(extracted_tarball, arguments['name'] + '.egg-info')
            try:
                os.mkdir(egg_info_dir)
                copy(pkg_info, egg_info_dir)
                scan_path = egg_info_dir
            except os.error:
                self.log.warning('Failed to copy %s to %s', pkg_info, egg_info_dir)

        if not scan_path:
            raise FatalTaskError('File types not supported by OWASP dependency-check')

        return self._run_owasp_dep_check(scan_path, experimental=True)
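The work-around is easier to see on a concrete layout. A self-contained demo with a hypothetical sdist named demo that ships only a top-level PKG-INFO:

import os
from shutil import copy
from tempfile import mkdtemp

# Hypothetical sdist layout: a top-level PKG-INFO, no .egg-info/ directory.
extracted = mkdtemp()
with open(os.path.join(extracted, 'PKG-INFO'), 'w') as f:
    f.write('Metadata-Version: 1.0\nName: demo\nVersion: 1.0\n')

# The work-around: synthesize demo.egg-info/, copy PKG-INFO into it and
# point dependency-check at the directory instead of the bare file.
egg_info_dir = os.path.join(extracted, 'demo.egg-info')
os.mkdir(egg_info_dir)
copy(os.path.join(extracted, 'PKG-INFO'), egg_info_dir)
print(sorted(os.listdir(extracted)))  # ['PKG-INFO', 'demo.egg-info']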
Example #5
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

        results = []
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            self.log.debug("path = %s", path)

            bw = TimedCommand(['binwalk', '-B', path])
            status, output, error = bw.run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            parsed_binwalk = self.parse_binwalk(output)
            results.append({
                "path": os.path.relpath(path, cache_path),
                "output": parsed_binwalk,
            })
        return {'summary': [], 'status': 'success', 'details': results}
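parse_binwalk is not shown. binwalk prints a 'DECIMAL  HEXADECIMAL  DESCRIPTION' table, so a plausible sketch, assuming output arrives as a list of lines and only the description column is worth keeping:

    def parse_binwalk(self, output):
        # Skip the header, separator and blank lines; a data row starts
        # with a decimal offset.
        descriptions = []
        for line in output:
            columns = line.split(None, 2)
            if len(columns) == 3 and columns[0].isdigit():
                descriptions.append(columns[2])
        return descriptions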
Example #6
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        results = []
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        def worker(path):
            mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
            self.log.debug("%s mime = %s", path, mime)
            typ = TimedCommand.get_command_output(['file', path, '-b'])
            self.log.debug("%s filetype = %s", path, typ)

            linguist = None
            if 'charset=binary' not in mime:
                linguist = self._parse_linguist(
                    TimedCommand.get_command_output(['linguist', path])
                )
                self.log.debug("%s linguist output = %s", path, linguist)

            results.append({
                "type": typ,
                "output": linguist,
                "path": os.path.relpath(path, cache_path),
            })

        with ThreadPool(target=worker) as tp:
            for path in get_all_files_from(cache_path, path_filter=skip_git_files):
                tp.add_task(path)

        return {'summary': [], 'status': 'success', 'details': results}
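Examples #5 and #6 both lean on get_all_files_from with a skip_git_files filter; neither helper is shown. A minimal sketch of the pair (the filter's keep/skip polarity is an assumption):

import os


def skip_git_files(path):
    # Assumption: returning True means "keep this path".
    return '.git' not in path.split(os.sep)


def get_all_files_from(top, path_filter=None):
    # Walk `top` and yield every file path accepted by the filter.
    for root, _, files in os.walk(top):
        for name in files:
            path = os.path.join(root, name)
            if path_filter is None or path_filter(path):
                yield path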
Example #7
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        source_path = ObjectCache.get_from_dict(arguments).get_sources()
        header, language_stats = self._get_generic_result(source_path)

        for language in language_stats.keys():
            for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
                metrics_data = handler(self, source_path)
                if not metrics_data:
                    continue

                if 'metrics' not in language_stats[language]:
                    language_stats[language]['metrics'] = {}

                language_stats[language]['metrics'].update(metrics_data)

        # we don't want arbitrary language names as keys, and we don't want
        # to enumerate all languages supported by cloc - so convert the dict
        # to a list of language-specific entries
        result = {'languages': []}
        for language in language_stats.keys():
            record = language_stats.get(language)
            record['language'] = language
            result['languages'].append(record)

        return {'summary': header, 'status': 'success', 'details': result}
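The closing dict-to-list conversion keeps the result schema stable no matter which languages cloc reports. A worked example with hypothetical stats:

language_stats = {
    'Python': {'code': 120, 'metrics': {'avg_cyclomatic_complexity': 2.1}},
    'C': {'code': 40},
}

result = {'languages': []}
for language, record in language_stats.items():
    record['language'] = language
    result['languages'].append(record)

# Each record is now self-describing, so consumers never need to know the
# full set of language names cloc can emit:
# {'languages': [{'code': 120, 'metrics': {...}, 'language': 'Python'},
#                {'code': 40, 'language': 'C'}]}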
Example #8
    def run(self, node_args):
        """To be transparently called by Selinon.

        Selinon calls run(), which takes care of task auditing and some
        additional checks, and then calls execute().
        """
        # SQS guarantees at-least-once delivery, so the same message can be
        # received more than once; give up immediately if this task was
        # already processed
        if self.storage and isinstance(self.storage,
                                       (BayesianPostgres, PackagePostgres)):
            if self.storage.get_worker_id_count(self.task_id) > 0:
                raise TaskAlreadyExistsError("Task with ID '%s'"
                                             " was already processed" %
                                             self.task_id)

        start = datetime.utcnow()
        try:
            result = self.execute(node_args)
        finally:
            # remove all files that were downloaded for this task
            ObjectCache.wipe()
        end = datetime.utcnow()

        if result:
            # Ensure result complies with the defined schema (if any) before saving
            self.validate_result(result)

        if result is None:
            # Keep track of None results and add _audit and _release keys
            result = {}

        if self.add_audit_info:
            # `_audit` key is added to every analysis info submitted
            result['_audit'] = {
                'started_at': json_serial(start),
                'ended_at': json_serial(end),
                'version': 'v1'
            }

            ecosystem_name = node_args.get('ecosystem')
            result['_release'] = '{}:{}:{}'.format(ecosystem_name,
                                                   node_args.get('name'),
                                                   node_args.get('version'))
        return result
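json_serial is what makes the datetimes in the audit record JSON-friendly. A common one-liner for that, assuming the real helper does roughly the same:

from datetime import datetime


def json_serial(obj):
    # Render datetimes as ISO-8601 strings so the audit record survives a
    # round-trip through JSON.
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError('Type %s is not JSON serializable' % type(obj))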
Example #9
    def execute(self, arguments):
        """Execute mercator and convert its output to JSON object."""
        self._strict_assert(arguments.get('ecosystem'))

        if 'url' in arguments:
            # run mercator on a git repo
            return self.run_mercator_on_git_repo(arguments)

        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        # TODO: make this even uglier; looks like we didn't get the abstraction quite right
        #       when we were adding support for Java/Maven.
        if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by(
                EcosystemBackend.maven):
            # cache_path now points directly to the pom
            cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml()
        else:
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()
        return self.run_mercator(arguments, cache_path)
Example #10
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        eco = arguments['ecosystem']
        pkg = arguments['name']
        ver = arguments['version']

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            if arguments['ecosystem'] != 'maven':
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = self.run_scancode(cache_path)
        return result_data
Example #11
    def _maven_scan(self, arguments):
        """Run OWASP dependency-check & Victims CVE DB CLI."""
        jar_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        results = self._run_owasp_dep_check(jar_path, experimental=False)
        if results.get('status') != 'success':
            return results
        # merge with Victims CVE DB results
        victims_cve_db_results = self._run_victims_cve_db_cli(arguments)
        for vulnerability in victims_cve_db_results:
            vulnerability = self._filter_victims_db_entry(vulnerability)
            if not vulnerability:
                continue
            if vulnerability['id'] not in results['summary']:
                results['summary'].append(vulnerability['id'])
                results['details'].append(vulnerability)
        return results
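The merge loop only appends Victims findings whose id is not already in the OWASP summary, so overlapping reports show up once. With hypothetical data:

results = {'status': 'success',
           'summary': ['CVE-2016-0001'],
           'details': [{'id': 'CVE-2016-0001'}]}
victims = [{'id': 'CVE-2016-0001'}, {'id': 'CVE-2017-0002'}]

for vulnerability in victims:
    if vulnerability['id'] not in results['summary']:
        results['summary'].append(vulnerability['id'])
        results['details'].append(vulnerability)

# summary is now ['CVE-2016-0001', 'CVE-2017-0002'] - the duplicate CVE
# was reported only once.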
Example #12
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        result_data = {'status': 'unknown', 'summary': [], 'details': {}}

        if self._is_valid_ecosystem(arguments['ecosystem']):
            hub = self._get_hub()

            # A BlackDuck project has no notion of ecosystem, so we namespace
            # the project names ourselves; package `crumb` in the npm
            # ecosystem becomes `npm-crumb`
            project = self._get_project_name(arguments)
            version = arguments['version']

            # Check if the given project had already been scanned
            data = self._release_data(hub, project, version)

            if not data and self._allow_cli_scan:
                self.log.debug("No data available for project {p} {v}".format(
                    p=project, v=version))
                # No data available, issue a new scan and re-query release data
                source_tarball_path = ObjectCache.get_from_dict(
                    arguments).get_source_tarball()
                command = self._prepare_command(project, version,
                                                source_tarball_path)
                self.log.debug(
                    "Executing command, timeout={timeout}: {cmd}".format(
                        timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command))
                bd = TimedCommand(command)
                status, output, error = \
                    bd.run(timeout=self._BLACKDUCK_CLI_TIMEOUT,
                           update_env={'BD_HUB_PASSWORD': self.configuration.BLACKDUCK_PASSWORD})
                self.log.debug("status = %s, error = %s", status, error)
                self.log.debug("output = %s", output)
                data = self._release_data(hub, project, version)

            self.log.debug("Release data for project {p} {v}: {d}".format(
                p=project, v=version, d=data))
            result_data['details'] = data
            result_data['status'] = 'success' if data else 'error'
        else:
            result_data['status'] = 'error'

        return result_data
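The namespacing comment pins _get_project_name down fairly precisely; a sketch that matches it (the body itself is still an assumption):

    def _get_project_name(self, arguments):
        # 'crumb' in the npm ecosystem becomes the Hub project 'npm-crumb'.
        return '{e}-{p}'.format(e=arguments['ecosystem'],
                                p=arguments['name'])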
Example #13
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        result_data = {'status': 'unknown', 'summary': [], 'details': []}

        source_tarball_path = ObjectCache.get_from_dict(
            arguments).get_source_tarball()
        sa = StaticAnalysis(source_tarball_path)

        try:
            analysis_result = sa.analyze()

            # make output reproducible - scanning the same
            # input multiple times should always produce
            # the same output
            del analysis_result["scan"]["time-created"]
            del analysis_result["scan"]["time-finished"]
            del analysis_result["scan"]["host"]
            del analysis_result["scan"]["store-results-to"]

            stats = {}
            for defect in analysis_result["defects"]:
                stats.setdefault(defect["checker"], {"count": 0})
                stats[defect["checker"]]["count"] += 1
                try:
                    stats[defect["checker"]]["cwe"] = defect["cwe"]
                except KeyError:
                    pass
            result_data['summary'] = stats
            result_data['status'] = 'success'
            result_data['details'] = analysis_result
        except Exception as ex:
            self.log.error("static analysis was not successful: %r", ex)
            result_data['status'] = 'error'

        return result_data
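The per-checker aggregation is easiest to follow with sample defects (hypothetical data):

defects = [
    {'checker': 'DIVINE', 'cwe': 476},
    {'checker': 'DIVINE', 'cwe': 476},
    {'checker': 'SYMBIOTIC'},          # no CWE reported for this one
]

stats = {}
for defect in defects:
    stats.setdefault(defect['checker'], {'count': 0})
    stats[defect['checker']]['count'] += 1
    if 'cwe' in defect:
        stats[defect['checker']]['cwe'] = defect['cwe']

# {'DIVINE': {'count': 2, 'cwe': 476}, 'SYMBIOTIC': {'count': 1}}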
Example #14
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        epv_cache = ObjectCache.get_from_dict(arguments)
        # cache_path = epv_cache.get_extracted_source_tarball()

        results = []
        # We don't compute digests of files in the extracted tarball,
        # only of the tarball itself
        # for f in get_all_files_from(cache_path, path_filter=skip_git_files):
        #    results.append(self.compute_digests(cache_path, f))

        source_tarball_path = epv_cache.get_source_tarball()
        # Compute digests of tarball and mark it as such
        results.append(
            self.compute_digests(None, source_tarball_path, artifact=True))

        return {'summary': [], 'status': 'success', 'details': results}
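compute_digests is not shown. A standalone sketch that hashes the artifact with a few common algorithms (the algorithm list and the result shape are assumptions):

import hashlib


def compute_digests(base_path, path, artifact=False):
    # Hash the file at `path`; `base_path` would be used to relativize
    # paths when hashing extracted files, which Example #14 skips.
    record = {'path': path, 'artifact': artifact}
    with open(path, 'rb') as f:
        content = f.read()
    for algo in ('md5', 'sha1', 'sha256'):
        record[algo] = hashlib.new(algo, content).hexdigest()
    return record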
Example #15
    def run(self, node_args):
        """To be transparently called by Selinon.

        Selinon calls run(), which takes care of task auditing and some
        additional checks, and then calls execute().
        """
        # SQS guarantees at-least-once delivery, so the same message can be
        # received more than once; give up immediately if this task was
        # already processed
        if self.storage and isinstance(self.storage, (BayesianPostgres, PackagePostgres)):
            if self.storage.get_worker_id_count(self.task_id) > 0:
                raise TaskAlreadyExistsError("Task with ID '%s'"
                                             " was already processed" % self.task_id)

        start = datetime.utcnow()
        try:
            result = self.execute(node_args)

        except Exception as exc:
            if self.add_audit_info:
                # `_audit` key is added to every analysis info submitted
                end = datetime.utcnow()
                result = dict()

                self._add_audit_info(
                    task_result=result,
                    task_start=start,
                    task_end=end,
                    node_args=node_args,
                )

                # write the audit info to the storage
                self.storage.store_error(
                    node_args=node_args,
                    flow_name=self.flow_name,
                    task_name=self.task_name,
                    task_id=self.task_id,
                    exc_info=sys.exc_info(),
                    result=result
                )

            raise exc

        finally:
            # remove all files that were downloaded for this task
            ObjectCache.wipe()

        end = datetime.utcnow()

        if result:
            # Ensure result complies with the defined schema (if any) before saving
            self.validate_result(result)

        if result is None:
            # Keep track of None results and add _audit and _release keys
            result = {}

        if self.add_audit_info:
            # `_audit` key is added to every analysis info submitted
            self._add_audit_info(
                task_result=result,
                task_start=start,
                task_end=end,
                node_args=node_args,
            )

        return result
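_add_audit_info factors out the audit record that Example #8 builds inline; reconstructing it from that example gives roughly:

    def _add_audit_info(self, task_result, task_start, task_end, node_args):
        # Mirrors the inline version in Example #8.
        task_result['_audit'] = {
            'started_at': json_serial(task_start),
            'ended_at': json_serial(task_end),
            'version': 'v1',
        }
        task_result['_release'] = '{}:{}:{}'.format(
            node_args.get('ecosystem'),
            node_args.get('name'),
            node_args.get('version'))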
Example #16
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))
        self._strict_assert(isinstance(arguments.get('version'), str))

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])

        # make sure we store package name in its normalized form
        arguments['name'] = normalize_package_name(ecosystem.backend.name,
                                                   arguments['name'])

        if pattern_ignore.findall(arguments['version']):
            self.log.info("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))
            raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))

        # Don't try ingestion for private packages
        if is_pkg_public(arguments['ecosystem'], arguments['name']):
            self.log.info("Ingestion flow for {} {}".format(
                arguments['ecosystem'], arguments['name']))
        else:
            self.log.info("Private package ingestion ignored {} {}".format(
                arguments['ecosystem'], arguments['name']))
            raise NotABugFatalTaskError("Private package alert {} {}".format(
                arguments['ecosystem'], arguments['name']))

        p = Package.get_or_create(db,
                                  ecosystem_id=ecosystem.id,
                                  name=arguments['name'])
        v = Version.get_or_create(db,
                                  package_id=p.id,
                                  identifier=arguments['version'])

        if not arguments.get('force'):
            if db.query(Analysis).filter(
                    Analysis.version_id == v.id).count() > 0:
                arguments['analysis_already_exists'] = True
                self.log.debug(
                    "Arguments returned by initAnalysisFlow without force: {}".
                    format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)
        npm_dir = self.configuration.NPM_DATA_DIR

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                            .format(n=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(e)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)
            if arguments['ecosystem'] == "npm":
                shutil.rmtree(npm_dir, True)

        a = Analysis(version=v,
                     access_count=1,
                     started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug(
            "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments
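normalize_package_name smooths over per-ecosystem naming rules. A sketch covering the one well-documented case, PyPI's PEP 503 normalization (which rules the real helper applies is an assumption):

import re


def normalize_package_name(ecosystem_backend, name):
    # PyPI names are case-insensitive and runs of '-', '_' and '.'
    # compare equal; PEP 503 collapses them to a single dash.
    if ecosystem_backend == 'pypi':
        return re.sub(r'[-_.]+', '-', name).lower()
    # Other ecosystems are left untouched in this sketch.
    return name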
Example #17
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        # make sure we store the package name in the case-normalized form
        # the ecosystem expects
        arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

        p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
        v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we will scale and there will be
            # 2+ workers running this task they can potentially schedule two
            # flows of a same type at the same time
            if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
                # we need to propagate flags that were passed to flow, but not
                # E/P/V - this way we are sure that for example graph import is
                # scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                               .format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path
                )
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                    arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                            format(n=arguments.get('name'),
                                   v=arguments.get('version'),
                                   err=str(e))
                        )

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments