def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        eco = arguments['ecosystem']
        pkg = arguments['name']
        ver = arguments['version']

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            if not Ecosystem.by_name(
                    StoragePool.get_connected_storage('BayesianPostgres').
                    session, eco).is_backed_by(EcosystemBackend.maven):
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = self.run_scancode(cache_path)
        return result_data
Example #2
    def retrieve_bookkeeping_for_ecosystem_package(self, ecosystem, package):
        """Retrieve BookKeeping data for given Package and Ecosystem.

        :param ecosystem: ecosystem for which the data should be retrieved
        :param package: package for which the data should be retrieved
        """
        e = Ecosystem.by_name(self.db, ecosystem)
        p = Package.by_name(self.db, package)

        stat = self.db.query(PackageWorkerResult).\
            join(PackageAnalysis).\
            filter(PackageAnalysis.package == p)
        worker_stats = []
        for package_worker_result in stat.all():
            entry = {"worker_name": package_worker_result.worker,
                     "has_error": package_worker_result.error,
                     "task_result": package_worker_result.task_result,
                     "started_at": package_worker_result.started_at,
                     "ended_at": package_worker_result.ended_at}
            worker_stats.append(entry)

        version_count = self.db.query(Version).join(Package).\
            filter(Package.ecosystem == e).\
            filter(Version.package == p).count()
        p_versions = self.db.query(Version).join(Package).join(Ecosystem).\
            filter(Package.ecosystem == e).\
            filter(Version.package == p)

        return {"ecosystem": e.name,
                "package": p.name,
                "package_version_count": version_count,
                "package_level_workers": worker_stats,
                "analysed_versions": [v.identifier for v in p_versions]}
Example #3
    def retrieve_bookkeeping_for_epv(self, ecosystem, package, version):
        """Retrieve BookKeeping data for the given ecosystem, package, and version.

        :param ecosystem: ecosystem for which the data should be retrieved
        :param package: package for which the data should be retrieved
        :param version: package version for which the data should be retrieved
        """
        e = Ecosystem.by_name(self.db, ecosystem)
        p = Package.by_name(self.db, package)
        v = self.db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p). \
            filter(Version.identifier == version).one()

        stat = self.db.query(WorkerResult).\
            join(Analysis).join(Version).\
            filter(Analysis.version == v)
        worker_stats = []
        for worker_result in stat.all():
            entry = {"worker_name": worker_result.worker,
                     "has_error": worker_result.error,
                     "task_result": worker_result.task_result,
                     "started_at": worker_result.started_at,
                     "ended_at": worker_result.ended_at}
            worker_stats.append(entry)

        return {"ecosystem": e.name,
                "package": p.name,
                "version": v.identifier,
                "workers": worker_stats}
Example #4
def iter_unknown_dependencies(storage_pool, node_args):
    """Collect unknown dependencies."""
    # Be safe here as fatal errors will cause errors in Dispatcher
    try:
        aggregated = storage_pool.get('UnknownDependencyFetcherTask')

        arguments = []
        for element in aggregated["result"]:
            epv = element.split(':')
            ecosystem = epv[0]
            if Ecosystem.by_name(
                    StoragePool.get_connected_storage(
                        'BayesianPostgres').session,
                    ecosystem).is_backed_by(EcosystemBackend.maven):
                name = '{}:{}'.format(epv[1], epv[2])
                version = epv[3]
            else:
                name = epv[1]
                version = epv[2]
            analysis_arguments = _create_analysis_arguments(
                ecosystem, name, version)
            # TODO: Remove force=True once data-importer is smart enough
            # to ingest missing packages from s3.
            analysis_arguments.update({"recursive_limit": 0, "force": True})
            arguments.append(analysis_arguments)

        print('Arguments appended: %s' %
              ', '.join(str(item) for item in arguments))
        logger.info("Arguments for next flows: %s" % str(arguments))
        return arguments
    except Exception as e:
        logger.exception(
            "Failed to collect unknown dependencies due to {}".format(e))
        return []
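For orientation only (this sketch is not part of the original listing), the colon-separated identifiers consumed above split roughly as follows; the sample identifiers are made up:

# Illustrative sketch of the EPV split above; identifiers are hypothetical.
def split_epv(element, is_maven):
    epv = element.split(':')
    if is_maven:
        # maven names carry a groupId:artifactId pair, e.g. 'maven:org.slf4j:slf4j-api:1.7.25'
        return epv[0], '{}:{}'.format(epv[1], epv[2]), epv[3]
    # other ecosystems, e.g. 'npm:lodash:4.17.21'
    return epv[0], epv[1], epv[2]

print(split_epv('maven:org.slf4j:slf4j-api:1.7.25', True))   # ('maven', 'org.slf4j:slf4j-api', '1.7.25')
print(split_epv('npm:lodash:4.17.21', False))                 # ('npm', 'lodash', '4.17.21')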
def retrieve_bookkeeping_for_ecosystem(ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        package_count = _count(
            db,
            db.query(Package).filter(Package.ecosystem == e))
        pv_count = _count(
            db,
            db.query(Version).join(Package).filter(Package.ecosystem == e))
        result = {
            "summary": {
                "ecosystem": e.name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
        }
    except NoResultFound:
        result = {"error": "No such ecosystem: %s" % ecosystem}
    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }

    return result
    def execute(self, arguments):
        """Task to mark vulnerable packages in graph.

        :param arguments: dictionary with task arguments
        :return: None
        """
        self._strict_assert(arguments.get('ecosystem'))

        wanted_cves = set(arguments.get('cve_filter', []))
        victims_cls = VictimsDB if not wanted_cves else FilteredVictimsDB

        rdb = StoragePool.get_connected_storage('BayesianPostgres')
        ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

        with victims_cls.build_from_git(wanted=wanted_cves) as db:

            self.log.info('Storing the VictimsDB zip on S3')
            db.store_on_s3()

            vulnerable_packages = self.get_vulnerable_packages(db, ecosystem)
            self.create_in_graph(vulnerable_packages, ecosystem)

            self.mark_in_graph(vulnerable_packages, ecosystem)

            self.notify_gemini(vulnerable_packages, ecosystem)
Example #7
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        rdb_session = StoragePool.get_connected_storage(
            'BayesianPostgres').session

        name = arguments['name']
        ecosystem = arguments['ecosystem']
        if ecosystem == 'go':
            name = quote(name, safe='')

        project_url = self.configuration.libraries_io_project_url(
            Ecosystem.by_name(rdb_session, ecosystem), name)
        project = get_response(project_url)
        versions = project['versions']
        details = {
            'dependent_repositories': {
                'count': project['dependent_repos_count']
            },
            'dependents': {
                'count': project['dependents_count']
            },
            'releases': {
                'count': len(versions),
                'recent': self.recent_releases(versions)
            }
        }

        return {'status': 'success', 'summary': [], 'details': details}
Example #8
def _create_analysis_arguments(ecosystem, name, version):
    """Create arguments for analysis."""
    return {
        'ecosystem': ecosystem,
        'name': MavenCoordinates.normalize_str(name) if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(
            EcosystemBackend.maven) else name,
        'version': version
    }
Example #9
    def retrieve_bookkeeping_for_ecosystem(self, ecosystem):
        """Retrieve BookKeeping data for given Ecosystem.

        :param ecosystem: ecosystem for which the data should be retrieved
        """
        e = Ecosystem.by_name(self.db, ecosystem)
        package_count = self.db.query(Package).filter(Package.ecosystem == e).count()
        pv_count = self.db.query(Version).join(Package).filter(Package.ecosystem == e).count()
        return {"ecosystem": e.name,
                "package_count": package_count,
                "package_version_count": pv_count}
Example #10
def normalize_package_name(ecosystem, name):
    """Normalize package name based on ecosystem."""
    normalized_name = name
    if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(EcosystemBackend.pypi):
        normalized_name = case_sensitivity_transform(ecosystem, name)
    elif ecosystem == 'go':
        # go package name is the host+path part of a URL, thus it can be URL encoded
        normalized_name = unquote(name)
    return normalized_name
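A minimal, self-contained sketch of the 'go' branch above, assuming the package name arrives percent-encoded:

# Illustration only: go package names are the host+path part of a URL,
# so they may arrive percent-encoded and are decoded before use.
from urllib.parse import quote, unquote

encoded = quote('github.com/gorilla/mux', safe='')
print(encoded)            # github.com%2Fgorilla%2Fmux
print(unquote(encoded))   # github.com/gorilla/mux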
    def _normalize_package_name(self, node_args):
        """Normalize package name in node arguments."""
        if not node_args:
            return

        if 'name' in node_args and 'ecosystem' in node_args:
            ecosystem = Ecosystem.by_name(self.postgres.session,
                                          node_args['ecosystem'])
            node_args['name'] = normalize_package_name(
                ecosystem_backend=ecosystem.backend.name,
                name=node_args['name'])
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('ecosystem'))

        # get rid of version if scheduled from the core analyses
        arguments.pop('version', None)
        arguments.pop('document_id', None)

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        url = self.get_upstream_url(arguments)
        upstream = self.get_upstream_entry(package, url)
        if upstream is None:
            upstream = self.add_or_update_upstream(package, url)
        arguments['url'] = upstream.url

        if not arguments.get('force'):
            # can potentially schedule two flows of a same type at the same
            # time as there is no lock, but let's say it's OK
            if upstream.updated_at is not None \
                    and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
                self.log.info(
                    'Skipping upstream package check as data is considered recent - '
                    'last update %s.', upstream.updated_at)
                # keep track of the start, but do not schedule anything more;
                # discard changes such as updates
                db.rollback()
                return arguments

        # if this fails, it's actually OK, as there could be concurrency
        package_analysis = PackageAnalysis(
            package_id=package.id,
            started_at=datetime.datetime.utcnow(),
            finished_at=None)
        db.add(package_analysis)

        # keep track of updates
        upstream.updated_at = datetime.datetime.utcnow()

        db.commit()
        arguments['document_id'] = package_analysis.id
        return arguments
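A minimal sketch of the recency check used above, assuming _UPDATE_INTERVAL is a datetime.timedelta (its actual value is not shown in this listing):

# Illustration only: skip scheduling when the last update is newer than the interval.
import datetime

_UPDATE_INTERVAL = datetime.timedelta(days=1)  # assumed value for this sketch
updated_at = datetime.datetime.utcnow() - datetime.timedelta(hours=2)

if datetime.datetime.utcnow() - updated_at < _UPDATE_INTERVAL:
    print('data considered recent - skipping upstream package check')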
def retrieve_bookkeeping_for_ecosystem_package(ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session

    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)

        version_count = _count(
            db,
            db.query(Version).join(Package).filter(
                Package.ecosystem == e).filter(Version.package == p))

        stat = db.query(PackageWorkerResult.worker, PackageWorkerResult.error,
                        PackageWorkerResult.task_result).join(PackageAnalysis). \
            filter(PackageAnalysis.package == p). \
            all()

        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)

        p_versions = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p)

        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "package_version_count": version_count,
                "package_level_workers": worker_stats,
                "analysed_versions": [v.identifier for v in p_versions]
            }
        }
    except NoResultFound:
        result = {"error": "No such package: %s/%s" % (ecosystem, package)}
    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }
    return result
    def get_sources(self):
        """Get path to source files.

        :return: path to source files
        """
        if not self._eco_obj:
            self._eco_obj = Ecosystem.by_name(self._postgres.session, self.ecosystem)

        if self._eco_obj.is_backed_by(EcosystemBackend.maven):
            return self.get_extracted_source_jar()
        else:
            return self.get_extracted_source_tarball()
def case_sensitivity_transform(ecosystem, name):
    """Transform package name to lowercase for ecosystem that are not case sensitive.

    :param ecosystem: name of ecosystem in which the package is sits
    :param name: name of ecosystem
    :return: transformed package name base on ecosystem package case sensitivity
    """
    if Ecosystem.by_name(StoragePool.get_connected_storage('BayesianPostgres').session,
                         ecosystem).is_backed_by(EcosystemBackend.pypi):
        return name.lower()

    return name
    def has_sources(self):
        """Test if the given EPV has available sources.

        :return: true if the given EPV has available sources
        """
        if not self._eco_obj:
            self._eco_obj = Ecosystem.by_name(self._postgres.session, self.ecosystem)

        if self._eco_obj.is_backed_by(EcosystemBackend.maven):
            return self._s3.object_exists(self._source_jar_object_key)
        else:
            self._construct_source_tarball_names()
            return self._s3.object_exists(self._source_tarball_object_key)
def retrieve_bookkeeping_for_epv(ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)
        v = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p). \
            filter(Version.identifier == version).one()

        stat = db.query(WorkerResult.worker, WorkerResult.error, WorkerResult.task_result). \
            join(Analysis).join(Version). \
            filter(Analysis.version == v).all()

        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)

        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "version": v.identifier,
                "workers": worker_stats
            }
        }
    except NoResultFound:
        return {
            "error":
            "No such version: %s/%s/%s" % (ecosystem, package, version)
        }
    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }
    return result
Example #18
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        if not arguments.get('force_graph_sync'):
            self._strict_assert(arguments.get('document_id'))

        rdb = StoragePool.get_connected_storage('BayesianPostgres')
        ecosystem_backend = Ecosystem.by_name(
            rdb.session, arguments.get('ecosystem')).backend.name
        package_list = [{
            'ecosystem': ecosystem_backend,
            'name': arguments['name'],
            'version': arguments.get('version'),
            'source_repo': arguments.get('ecosystem')
        }]

        # If we force graph sync, sync all task results, otherwise only
        # finished in this analysis run
        if not arguments.get('force_graph_sync'):
            # Tasks that need sync to graph start lowercase.
            param = {
                'select_ingest': [
                    task_name
                    for task_name in self.storage.get_finished_task_names(
                        arguments['document_id']) if task_name[0].islower()
                ],
                'package_list':
                package_list
            }
            endpoint = self._SELECTIVE_API_URL
        else:
            param = package_list
            endpoint = self._INGEST_API_URL

        self.log.info("Invoke graph importer at url: '%s' for %s", endpoint,
                      param)
        response = requests.post(endpoint, json=param)

        if response.status_code != 200:
            raise RuntimeError("Failed to invoke graph import at '%s' for %s" %
                               (endpoint, param))

        self.log.info("Graph import succeeded with response: %s",
                      response.text)
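Illustrative only: a selective-sync payload of the shape posted above; the task names and package values here are hypothetical:

# Hypothetical payload shape for the selective graph-import call above.
param = {
    'select_ingest': ['metadata', 'digests'],   # lowercase task names finished in this run
    'package_list': [{
        'ecosystem': 'npm',
        'name': 'lodash',
        'version': '4.17.21',
        'source_repo': 'npm'
    }]
}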
Example #19
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        rdb = StoragePool.get_connected_storage('BayesianPostgres')
        ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

        if arguments['ecosystem'] in ('maven', 'pypi', 'npm'):
            return self._victims_scan(arguments, ecosystem)
        elif arguments['ecosystem'] == 'nuget':
            return self._nuget_scan(arguments)
        else:
            raise RequestError('Unsupported ecosystem')
    def execute(self, arguments):
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('ecosystem'))

        # get rid of version if scheduled from the core analyses
        arguments.pop('version', None)

        db = self.storage.session
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        upstream = self.get_upstream_entry(db, package,
                                           self.get_upstream_url(arguments))
        arguments['url'] = upstream.url

        if not arguments.get('force'):
            # can potentially schedule two flows of a same type at the same
            # time as there is no lock, but let's say it's OK
            if upstream.updated_at is not None \
                    and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
                self.log.info(
                    'Skipping upstream package check as data is considered recent - '
                    'last update %s.', upstream.updated_at)
                # keep track of the start, but do not schedule anything more;
                # discard changes such as updates
                db.rollback()
                return arguments

        # if this fails, it's actually OK, as there could be concurrency
        package_analysis = PackageAnalysis(package_id=package.id,
                                           started_at=datetime.datetime.now(),
                                           finished_at=None)
        db.add(package_analysis)

        # keep track of updates
        upstream.updated_at = datetime.datetime.now()

        db.commit()
        arguments['document_id'] = package_analysis.id
        return arguments
Example #21
    def get_analysis_count(ecosystem, package):
        """Get count of previously scheduled analyses for given ecosystem-package.

        :param ecosystem: str, Ecosystem name
        :param package: str, Package name
        :return: analysis count
        """
        if Ecosystem.by_name(PostgresBase.session, ecosystem).is_backed_by(EcosystemBackend.maven):
            package = MavenCoordinates.normalize_str(package)

        try:
            count = PostgresBase.session.query(PackageAnalysis).\
                                         join(Package).join(Ecosystem).\
                                         filter(Ecosystem.name == ecosystem).\
                                         filter(Package.name == package).\
                                         count()
        except SQLAlchemyError:
            PostgresBase.session.rollback()
            raise

        return count
    def post():
        """Handle the POST REST API call."""
        input_json = request.get_json()
        if not input_json:
            raise HTTPError(400, error="Expected JSON request")

        stack_id = input_json.get('stack_id')
        recommendation_type = input_json.get('recommendation_type')
        package_name = input_json.get('package_name')
        feedback_type = input_json.get('feedback_type')
        ecosystem_name = input_json.get('ecosystem')
        conditions = [
            is_valid(stack_id),
            is_valid(recommendation_type),
            is_valid(package_name),
            is_valid(feedback_type),
            is_valid(ecosystem_name)
        ]
        if not all(conditions):
            raise HTTPError(400, error="Expected parameters missing")
        # Insert in a single commit. Gains - a) performance, b) avoid insert inconsistencies
        # for a single request
        try:
            ecosystem_obj = Ecosystem.by_name(rdb.session, name=ecosystem_name)
            req = RecommendationFeedback(
                stack_id=stack_id,
                package_name=package_name,
                recommendation_type=recommendation_type,
                feedback_type=feedback_type,
                ecosystem_id=ecosystem_obj.id)
            rdb.session.add(req)
            rdb.session.commit()
            return {'status': 'success'}
        except SQLAlchemyError as e:
            # TODO: please log the actual error too here
            logger.exception('Failed to create new analysis request')
            raise HTTPError(
                500, "Error inserting log for request {t}".format(
                    t=stack_id)) from e
Example #23
    def get_ecosystem(self, name):
        """Get ecosystem by name."""
        if not self.is_connected():
            self.connect()

        return Ecosystem.by_name(PostgresBase.session, name)
Example #24
    def execute(self, arguments, db, manifests, source=None):
        """Dependency finder logic."""
        # TODO: reduce cyclomatic complexity
        # If we receive a manifest file we need to save it first
        result = []
        for manifest in manifests:
            content_hash = None
            if source == 'osio':
                content_hash = generate_content_hash(manifest['content'])
                current_app.logger.info("{} file digest is {}".format(manifest['filename'],
                                                                      content_hash))

                s3 = AmazonS3(bucket_name='boosters-manifest')
                try:
                    s3.connect()
                    manifest['content'] = s3.retrieve_blob(content_hash).decode('utf-8')
                except ClientError as e:
                    current_app.logger.error("Unexpected error while retrieving S3 data: %s" % e)
                    raise

            with TemporaryDirectory() as temp_path:
                with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                    fd.write(manifest['content'])

                # mercator-go does not work if there is no package.json
                if 'shrinkwrap' in manifest['filename'].lower():
                    with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                        f.write(json.dumps({}))

                # Create instance manually since stack analysis is not handled by dispatcher
                subtask = MercatorTask.create_test_instance(task_name='metadata')
                arguments['ecosystem'] = manifest['ecosystem']
                out = subtask.run_mercator(arguments, temp_path, resolve_poms=False)

            if not out["details"]:
                raise FatalTaskError("No metadata found processing manifest file '{}'"
                                     .format(manifest['filename']))

            if 'dependencies' not in out['details'][0] and out.get('status', None) == 'success':
                raise FatalTaskError("Dependencies could not be resolved from manifest file '{}'"
                                     .format(manifest['filename']))

            out["details"][0]['manifest_file'] = manifest['filename']
            out["details"][0]['ecosystem'] = manifest['ecosystem']
            out["details"][0]['manifest_file_path'] = manifest.get('filepath',
                                                                   'File path not available')

            # If we're handling an external request we need to convert dependency specifications to
            # concrete versions that we can query later on in the `AggregatorTask`
            manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
            if 'external_request_id' in arguments:
                manifest_dependencies = []
                if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                    if "_dependency_tree_lock" in out["details"][0]:  # npm-shrinkwrap.json
                        if 'dependencies' in out['details'][0]["_dependency_tree_lock"]:
                            manifest_dependencies = out["details"][0]["_dependency_tree_lock"].get(
                                "dependencies", [])
                    else:  # pom.xml
                        if 'dependencies' in out['details'][0]:
                            manifest_dependencies = out["details"][0].get("dependencies", [])
                    if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                        def _flatten(deps, collect):
                            for dep in deps:
                                collect.append({'package': dep['name'], 'version': dep['version']})
                                _flatten(dep['dependencies'], collect)
                        resolved_deps = []
                        _flatten(manifest_dependencies, resolved_deps)
                    else:  # pom.xml
                        resolved_deps =\
                            [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                             for x in manifest_dependencies]
                else:  # package.json, requirements.txt
                    resolved_deps = self._handle_external_deps(
                        Ecosystem.by_name(db, arguments['ecosystem']),
                        out["details"][0]["dependencies"])

                out["details"][0]['_resolved'] = resolved_deps
            result.append(out)

        return {'result': result}
Example #25
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        # make sure we store package name based on ecosystem package naming case sensitivity
        arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

        p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
        v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we will scale and there will be
            # 2+ workers running this task they can potentially schedule two
            # flows of a same type at the same time
            if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
                # we need to propagate flags that were passed to flow, but not
                # E/P/V - this way we are sure that for example graph import is
                # scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                               .format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path
                )
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                    arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                            format(n=arguments.get('name'),
                                   v=arguments.get('version'),
                                   err=str(e))
                        )

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))
        self._strict_assert(isinstance(arguments.get('version'), str))

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])

        # make sure we store package name in its normalized form
        arguments['name'] = normalize_package_name(ecosystem.backend.name,
                                                   arguments['name'])

        if len(pattern_ignore.findall(arguments['version'])) > 0:
            self.log.info("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))
            raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))

        # Don't try ingestion for private packages
        if is_pkg_public(arguments['ecosystem'], arguments['name']):
            self.log.info("Ingestion flow for {} {}".format(
                arguments['ecosystem'], arguments['name']))
        else:
            self.log.info("Private package ingestion ignored {} {}".format(
                arguments['ecosystem'], arguments['name']))
            raise NotABugFatalTaskError("Private package alert {} {}".format(
                arguments['ecosystem'], arguments['name']))

        p = Package.get_or_create(db,
                                  ecosystem_id=ecosystem.id,
                                  name=arguments['name'])
        v = Version.get_or_create(db,
                                  package_id=p.id,
                                  identifier=arguments['version'])

        if not arguments.get('force'):
            if db.query(Analysis).filter(
                    Analysis.version_id == v.id).count() > 0:
                arguments['analysis_already_exists'] = True
                self.log.debug(
                    "Arguments returned by initAnalysisFlow without force: {}".
                    format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)
        npm_dir = self.configuration.NPM_DATA_DIR

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                            .format(n=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(e)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)
            if arguments['ecosystem'] == "npm":
                shutil.rmtree(npm_dir, True)

        a = Analysis(version=v,
                     access_count=1,
                     started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug(
            "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        tool_responses = {}
        result_summary = {
            'package_names': [],
            'registered_srpms': [],
            'all_rhn_channels': [],
            'all_rhsm_content_sets': [],
            'all_rhsm_product_names': []
        }
        result_data = {'status': 'error',
                       'summary': result_summary,
                       'details': tool_responses
                       }

        # bail out early; unless we have access to internal services or the
        # package comes from the Maven ecosystem, we can't comment on downstream usage
        is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
        if not self._is_inside_rh() and not is_maven:
            return result_data

        self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
        res = self._fetch_anitya_project(eco, pkg)
        anitya_rpm_names = []
        anitya_mvn_names = []
        if res is None:
            result_data['status'] = 'error'
        elif res.status_code == 200:
            self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
            anitya_response = res.json()
            tool_responses['redhat_anitya'] = anitya_response
            # For now, we assume all downstreams are ones we care about
            for entry in anitya_response['packages']:
                if entry['distro'] == RH_RPM_DISTRO_NAME:
                    anitya_rpm_names.append(entry['package_name'])
                elif entry['distro'] == RH_MVN_DISTRO_NAME:
                    anitya_mvn_names.append(entry['package_name'])
                else:
                    self.log.warning(
                        'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                        format(d=entry['distro'], o=entry['package_name'], p=pkg)
                    )
            self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
            self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
            # TODO: Report 'partial' here and switch to 'success' at the end
            result_data['status'] = 'success'
        else:
            msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
            self.log.error(msg.format(e=eco, p=pkg, r=res.text))
            result_data['status'] = 'error'

        if self._is_inside_rh():
            # we have candidate downstream name mappings, check them against Brew
            seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
            self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

            args = ['brew-utils-cli', '--version', arguments['version']]
            artifact_hash = self._get_artifact_hash(algorithm='sha256')
            if artifact_hash:
                args += ['--digest', artifact_hash]
            args += seed_names

            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
                timeout=self._BREWUTILS_CLI_TIMEOUT,
                cmd=args))
            tc = TimedCommand(args)
            status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
            self.log.debug("status = %s, error = %s", status, error)
            output = ''.join(output)
            self.log.debug("output = %s", output)
            if not output:
                raise TaskError("Error running command %s" % args)
            brew = json.loads(output)

            result_summary['package_names'] = brew['packages']
            result_summary['registered_srpms'] = brew['response']['registered_srpms']
            tool_responses['brew'] = brew['response']['brew']

            # we have SRPM details, fetch details on where the RPMs are shipped
            tool_responses['pulp_cdn'] = pulp_responses = []
            rhn_channels = set()
            rhsm_content_sets = set()
            rhsm_product_names = set()
            for srpm_summary in result_summary['registered_srpms']:
                srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                             v=srpm_summary['version'],
                                                             r=srpm_summary['release'])
                cdn_metadata = self._get_cdn_metadata(srpm_filename)
                if cdn_metadata is None:
                    msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                    self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                    continue
                pulp_responses.append(cdn_metadata)
                srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
                rhn_channels.update(cdn_metadata['rhn_channels'])
                rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
                rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
            result_summary['all_rhn_channels'] = sorted(rhn_channels)
            result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
            result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

        self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

        return result_data
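Illustrative only: the shape of the result_data structure returned above; keys mirror the code, values are made up:

# Hypothetical example of the returned structure.
result_data = {
    'status': 'success',
    'summary': {
        'package_names': ['python-requests'],
        'registered_srpms': [{'package_name': 'python-requests',
                              'version': '2.20.0',
                              'release': '1.el7',
                              'published_in': ['Red Hat Enterprise Linux Server']}],
        'all_rhn_channels': ['rhel-x86_64-server-7'],
        'all_rhsm_content_sets': ['rhel-7-server-rpms'],
        'all_rhsm_product_names': ['Red Hat Enterprise Linux Server']
    },
    'details': {'redhat_anitya': {}, 'brew': [], 'pulp_cdn': []}
}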
    def components_to_scan(self, previous_sync_timestamp,
                           only_already_scanned):
        """Get EPV that were recently updated in OSS Index, so they can contain new vulnerabilities.

        Get components (e:p:v) that were recently (since previous_sync_timestamp) updated
        in OSS Index, which means that they can contain new vulnerabilities.

        :param previous_sync_timestamp: timestamp of previous check
        :param only_already_scanned: include already scanned components only
        :return: generator of e:p:v
        """
        # TODO: reduce cyclomatic complexity
        to_scan = []
        rdb = StoragePool.get_connected_storage('BayesianPostgres')

        for ecosystem in ['nuget']:
            ecosystem_solver = get_ecosystem_solver(
                self.storage.get_ecosystem(ecosystem),
                with_parser=OSSIndexDependencyParser())
            self.log.debug("Retrieving new %s vulnerabilities from OSS Index",
                           ecosystem)
            ossindex_updated_packages = CVEcheckerTask.query_ossindex_vulnerability_fromtill(
                ecosystem=ecosystem, from_time=previous_sync_timestamp)
            for ossindex_updated_package in ossindex_updated_packages:
                if Ecosystem.by_name(rdb.session, ecosystem).is_backed_by(
                        EcosystemBackend.maven):
                    package_name = "{g}:{n}".format(
                        g=ossindex_updated_package['group'],
                        n=ossindex_updated_package['name'])
                else:
                    package_name = ossindex_updated_package['name']
                package_affected_versions = set()
                for vulnerability in ossindex_updated_package.get(
                        'vulnerabilities', []):
                    for version_string in vulnerability.get('versions', []):
                        try:
                            resolved_versions = ecosystem_solver.\
                                solve(["{} {}".format(package_name, version_string)],
                                      all_versions=True)
                        except Exception:
                            self.log.exception(
                                "Failed to resolve %r for %s:%s",
                                version_string, ecosystem, package_name)
                            continue
                        resolved_versions = resolved_versions.get(
                            package_name, [])
                        if only_already_scanned:
                            already_scanned_versions =\
                                [ver for ver in resolved_versions if
                                 self.storage.get_analysis_count(ecosystem, package_name, ver) > 0]
                            package_affected_versions.update(
                                already_scanned_versions)
                        else:
                            package_affected_versions.update(resolved_versions)

                for version in package_affected_versions:
                    to_scan.append({
                        'ecosystem': ecosystem,
                        'name': package_name,
                        'version': version
                    })
        msg = "Components to be {prefix}scanned for vulnerabilities: {components}".\
            format(prefix="re-" if only_already_scanned else "",
                   components=to_scan)
        self.log.info(msg)
        return to_scan