Example #1
    def _processing(self, kronos_bucket=None):
        """Append new data for Kronos training.

        :param kronos_bucket: name of the S3 bucket to which the new data should be appended.
        """
        try:
            if kronos_bucket:
                s3 = StoragePool.get_connected_storage('AmazonS3')
                s3.bucket_name = kronos_bucket
            else:
                s3 = StoragePool.get_connected_storage('S3KronosAppend')
            result = self._execute_query(self._generate_query()).fetchall()
            result_len = len(result)
            self.log.info("Query executed.")
            self.log.info("Number of results = {}".format(result_len))
            if result_len > 0:
                for each_row in result:
                    package_list = []
                    if len(each_row) != 2 or each_row[0] != self.ecosystem:
                        continue
                    for dep in each_row[1]:
                        package_name = dep.get('package')
                        if package_name:
                            package_list.append(package_name)
                            self.unique_packages.add(package_name)
                    self.extra_manifest_list.append(package_list)

                self._append_manifest(s3)
                self._append_package_topic(s3)
                self.log.info("User Input Stacks appended.")
        except Exception as e:
            self.log.exception(
                'Unable to append input stack for ecosystem {ecosystem}: {reason}'
                .format(ecosystem=self.ecosystem, reason=str(e)))
Example #2
    def post():
        input_json = request.get_json()

        if not input_json:
            raise HTTPError(400, error="Expected JSON request")

        if 'manual_tagging' not in input_json:
            if 'ecosystem' not in input_json:
                raise HTTPError(400, error="Expected ecosystem in the request")

            if 'data' not in input_json:
                raise HTTPError(400, error="Expected data in the request")

            s3 = StoragePool.get_connected_storage('S3UserIntent')

            # Store data
            return s3.store_master_tags(input_json)
        else:
            if 'user' not in input_json:
                raise HTTPError(400, error="Expected user name in the request")

            if 'data' not in input_json:
                raise HTTPError(400, error="Expected tags in the request")

            s3 = StoragePool.get_connected_storage('S3ManualTagging')

            # Store data
            return s3.store_user_data(input_json)
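For context, a minimal sketch of the two request payloads the handler above distinguishes; the keys mirror the validation in post(), while the concrete values and the shape of 'data' are illustrative assumptions.

# Illustrative payloads only; keys come from the checks in post() above,
# values and the structure of 'data' are assumptions.
master_tags_payload = {
    "ecosystem": "maven",
    "data": {"org.example:lib": ["web", "http"]},   # hypothetical tag map
}
manual_tagging_payload = {
    "manual_tagging": True,
    "user": "jane",                                  # hypothetical user name
    "data": {"org.example:lib": ["web"]},
}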
Example #3
    def _package_level_keywords(self, keywords_file_name, stopwords_file_name, arguments):
        """Compute package level keywords."""
        # Keep f8a_tagger import local as other components dependent on
        # f8a_worker do not require it installed.
        from f8a_tagger import lookup_readme as keywords_lookup_readme
        from f8a_tagger import lookup_text as keywords_lookup_text

        details = {}
        package_postgres = StoragePool.get_connected_storage('PackagePostgres')

        gh_info = package_postgres.get_task_result_by_analysis_id(arguments['ecosystem'],
                                                                  arguments['name'],
                                                                  'github_details',
                                                                  arguments['document_id'])
        if gh_info:
            self.log.debug("Aggregating explicitly stated keywords (topics) on GitHub")
            details['gh_topics'] = gh_info.get('details', {}).get('topics', [])

        s3_readme = StoragePool.get_connected_storage('S3Readme')
        try:
            readme_json = s3_readme.retrieve_readme_json(arguments['ecosystem'], arguments['name'])
            if readme_json:
                self.log.debug("Computing keywords from README.json")
                details['README'] = keywords_lookup_readme(readme_json,
                                                           keywords_file=keywords_file_name,
                                                           stopwords_file=stopwords_file_name,
                                                           **self._LOOKUP_CONF)
        except Exception as exc:
            self.log.info("Failed to retrieve README: %s", str(exc))

        s3_rd = StoragePool.get_connected_storage('S3RepositoryDescription')
        try:
            description = s3_rd.retrieve_repository_description(arguments['ecosystem'],
                                                                arguments['name'])
            if description:
                self.log.debug("Computing keywords on description from repository")
                details['repository_description'] = keywords_lookup_text(
                    description,
                    keywords_file=keywords_file_name,
                    stopwords_file=stopwords_file_name,
                    **self._LOOKUP_CONF)

        except Exception as exc:
            self.log.info("Failed to retrieve repository description: %s", str(exc))

        if self.task_name == 'package_keywords_tagging':
            # We are tagging on package level, add also tags that are found in package name
            name_parts = re.split(r'[.\-_:]', arguments['name'])
            self.log.debug("Computing keywords from package name %s", name_parts)
            details['package_name'] = keywords_lookup_text(" ".join(name_parts),
                                                           keywords_file=keywords_file_name,
                                                           stopwords_file=stopwords_file_name,
                                                           **self._LOOKUP_CONF)

        return details

    def run(self, arguments):
        """Run task."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('document_id'))

        postgres = StoragePool.get_connected_storage('PackagePostgres')
        results = postgres.get_analysis_by_id(arguments['document_id'])

        return self.do_run(arguments,
                           StoragePool.get_connected_storage('S3PackageData'),
                           postgres, results)
Example #5
    def wrapper(*args, **kwargs):
        """Wrap SQLAlchemy code into try-except."""
        try:
            data = func(*args, **kwargs)
            result = {"summary": data}
        except NoResultFound:
            result = {"error": "No result found."}
        except SQLAlchemyError:
            StoragePool.get_connected_storage('BayesianPostgres').session.rollback()
            result = {"error": "SQLAlchemyError encountered while fetching data. "
                               "Rolling back. Try again."}
        return result
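The wrapper above is the inner function of a decorator (note the free variable func). Below is a minimal sketch of a complete enclosing decorator, assuming the decorator name, the use of functools.wraps, and that StoragePool is importable from selinon.

import functools

from selinon import StoragePool
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm.exc import NoResultFound


def tolerant_to_db_errors(func):  # hypothetical decorator name
    """Turn SQLAlchemy errors raised by func into an error dict."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            data = func(*args, **kwargs)
            result = {"summary": data}
        except NoResultFound:
            result = {"error": "No result found."}
        except SQLAlchemyError:
            StoragePool.get_connected_storage('BayesianPostgres').session.rollback()
            result = {"error": "SQLAlchemyError encountered while fetching data. "
                               "Rolling back. Try again."}
        return result
    return wrapper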
Example #6
    def store_error(self, node_args, flow_name, task_name, task_id, exc_info):
        """Store error to Postgres/RDS so we know about task failures that store data on S3."""
        if flow_name == 'bayesianAnalysisFlow':
            postgres = StoragePool.get_connected_storage('BayesianPostgres')
        elif flow_name == 'bayesianPackageAnalysisFlow':
            postgres = StoragePool.get_connected_storage('PackagePostgres')
        else:
            raise RuntimeError(
                "Unable to store error, error storing not defined for flow '%s'"
                % flow_name)

        return postgres.store_error(node_args, flow_name, task_name, task_id,
                                    exc_info)
Example #7
    def run(self, arguments):
        """Run task."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('document_id'))

        postgres = StoragePool.get_connected_storage('BayesianPostgres')
        results = postgres.get_analysis_by_id(arguments['document_id'])

        return self.do_run(arguments,
                           StoragePool.get_connected_storage('S3Data'),
                           postgres,
                           results)
    def run(self, arguments):
        """Run task."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('repo_name'))
        self._strict_assert(arguments.get('document_id'))

        s3 = StoragePool.get_connected_storage('S3GitHubManifestMetadata')
        postgres = StoragePool.get_connected_storage('BayesianPostgres')

        results = postgres.get_analysis_by_id(arguments['document_id'])
        for worker_result in results.raw_analyses:

            # Skip auxiliary tasks (e.g. InitGitHubManifestMetadata)
            if worker_result.worker[0].isupper():
                continue

            # Retrieve raw manifest file and store it in S3
            if worker_result.worker == 'metadata':
                task_result = worker_result.task_result
                for detail in task_result.get('details', []):
                    if detail.get('path', None):
                        manifest_url = urllib.parse.urljoin(
                            self.GITHUB_CONTENT_URL,
                            arguments['repo_name'] + '/' + detail['path'])
                        manifest_name = detail['path'].split('/', 1)[1]
                        response = requests.get(manifest_url)
                        if response.status_code == 200:
                            s3.store_raw_manifest(arguments['ecosystem'],
                                                  arguments['repo_name'],
                                                  manifest_name,
                                                  response.content)
                        else:
                            self.log.error(
                                'Unable to retrieve manifest file from %s',
                                manifest_url)
                            continue

            result_name = worker_result.worker
            if result_name.startswith('gh_most_starred_'):
                result_name = result_name[len('gh_most_starred_'):]
            version_id = s3.store(arguments, self.flow_name, self.task_name,
                                  self.task_id,
                                  (result_name, worker_result.task_result))
            worker_result.task_result = {'version_id': version_id}
        try:
            postgres.session.commit()
        except SQLAlchemyError:
            postgres.session.rollback()
            raise
    def execute(self, ecosystem):
        """Process raw-tags and update existing package_topic.json file on S3.

        :param ecosystem: Name of ecosystem
        """
        s3 = StoragePool.get_connected_storage('S3CrowdSourceTags')

        package_topic = []
        try:
            package_topic = s3.retrieve_package_topic(ecosystem)
        except ClientError:
            self.log.error("Unable to retrieve package_topic.json for %s",
                           ecosystem)

        results = {}
        for record in package_topic:
            if record.get("ecosystem") == ecosystem and record.get(
                    "package_topic_map"):
                results = record["package_topic_map"]
        if not results:
            self.log.error("Unable to retrieve package_topic_map for %s",
                           ecosystem)

        results = self._update_tags_from_graph(ecosystem=ecosystem,
                                               results=results)

        s3.store_package_topic(ecosystem, results)
        self.log.debug(
            "The file crowd_sourcing_package_topic.json "
            "has been stored for %s", ecosystem)
Example #10
def get_component_percentile_rank(ecosystem_backend,
                                  package,
                                  version,
                                  db_session=None):
    """Get component's percentile rank.

    :param ecosystem_backend: str, Ecosystem backend from `cucoslib.enums.EcosystemBackend`
    :param package: str, Package name
    :param version: str, Package version
    :param db_session: obj, Database session to use for querying
    :return: component's percentile rank, or -1 if the information is not available
    """

    try:
        if not db_session:
            storage = StoragePool.get_connected_storage("BayesianPostgres")
            db_session = storage.session

        rank = db_session.query(ComponentGHUsage.percentile_rank) \
            .filter(ComponentGHUsage.name == package) \
            .filter(ComponentGHUsage.version == version) \
            .filter(ComponentGHUsage.ecosystem_backend == ecosystem_backend) \
            .order_by(desc(ComponentGHUsage.timestamp)) \
            .first()
    except SQLAlchemyError:
        epv = '{e}/{p}/{v}'.format(e=ecosystem_backend, p=package, v=version)
        logger.exception(
            'Unable to retrieve percentile_rank for {epv}'.format(epv=epv))
        return -1

    if rank is None or len(rank) == 0:
        return 0

    return rank[0]
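A minimal usage sketch for get_component_percentile_rank above; passing an explicit session skips the StoragePool lookup. The connection string, package and version below are placeholder assumptions.

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Placeholder connection string; inside workers the session normally comes from
# StoragePool.get_connected_storage("BayesianPostgres").session instead.
engine = create_engine('postgresql://user:password@localhost:5432/bayesian')
session = sessionmaker(bind=engine)()

rank = get_component_percentile_rank('npm', 'serve-static', '1.7.1', db_session=session)
if rank == -1:
    print('percentile rank not available')
else:
    print('percentile rank:', rank)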
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        eco = arguments['ecosystem']
        pkg = arguments['name']
        ver = arguments['version']

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            if not Ecosystem.by_name(
                    StoragePool.get_connected_storage('BayesianPostgres').
                    session, eco).is_backed_by(EcosystemBackend.maven):
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = self.run_scancode(cache_path)
        return result_data
Example #12
def get_analysis_by_id(ecosystem,
                       package,
                       version,
                       analysis_id,
                       db_session=None):
    """Get result of previously scheduled analysis for given EPV triplet by analysis ID

    :param ecosystem: str, Ecosystem name
    :param package: str, Package name
    :param version: str, Package version
    :param analysis_id: str, ID of analysis
    :param db_session: obj, Database session to use for querying
    :return: analysis result
    """
    if not db_session:
        storage = StoragePool.get_connected_storage("BayesianPostgres")
        db_session = storage.session

    if ecosystem == 'maven':
        package = MavenCoordinates.normalize_str(package)

    found = db_session.query(Analysis).\
        join(Version).join(Package).join(Ecosystem).\
        filter(Ecosystem.name == ecosystem).\
        filter(Package.name == package).\
        filter(Version.identifier == version).\
        filter(Analysis.id == analysis_id).\
        one()

    return found
    def _use_maven_index_checker(self):
        """Fetch maven index from S3, run maven-index-checker and schedule analyses."""
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        target_dir = os.path.join(maven_index_checker_dir, 'target')

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(target_dir)

        index_range = '{}-{}'.format(self.count.min, self.count.max)
        command = [
            'java', '-Xmx768m', '-jar', 'maven-index-checker.jar', '-r',
            index_range
        ]
        with cwd(maven_index_checker_dir):
            output = TimedCommand.get_command_output(command,
                                                     is_json=True,
                                                     graceful=False,
                                                     timeout=1200)
            for idx, release in enumerate(output):
                name = '{}:{}'.format(release['groupId'],
                                      release['artifactId'])
                version = release['version']
                self.log.info("Scheduling #%d.", self.count.min + idx)
                self.analyses_selinon_flow(name, version)
        # index checker should clean up these dirs in /temp/ after itself, but better be sure
        for mindexerdir in glob.glob(
                os.path.join(gettempdir(), 'mindexer-ctxcentral-context*')):
            rmtree(mindexerdir)

        self.log.info('Storing pre-built maven index to S3')
        s3.store_index(target_dir)
        central_index_dir = os.path.join(target_dir, 'central-index')
        rmtree(central_index_dir)
Example #14
def get_dependents_count(ecosystem_backend, package, version, db_session=None):
    """Get number of GitHub projects dependent on given (package, version).

    :param ecosystem_backend: str, Ecosystem backend from `f8a_worker.enums.EcosystemBackend`
    :param package: str, Package name
    :param version: str, Package version
    :param db_session: obj, Database session to use for querying
    :return: number of dependent projects, or -1 if the information is not available
    """
    if not db_session:
        storage = StoragePool.get_connected_storage("BayesianPostgres")
        db_session = storage.session

    try:
        count = db_session.query(ComponentGHUsage.count) \
                          .filter(ComponentGHUsage.name == package) \
                          .filter(ComponentGHUsage.version == version) \
                          .filter(ComponentGHUsage.ecosystem_backend == ecosystem_backend) \
                          .order_by(desc(ComponentGHUsage.timestamp)) \
                          .first()
    except SQLAlchemyError:
        db_session.rollback()
        raise

    if count:
        return count[0]
    return -1
def retrieve_bookkeeping_for_ecosystem(ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        package_count = _count(
            db,
            db.query(Package).filter(Package.ecosystem == e))
        pv_count = _count(
            db,
            db.query(Version).join(Package).filter(Package.ecosystem == e))
        result = {
            "summary": {
                "ecosystem": e.name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
        }
    except NoResultFound:
        result = {"error": "No such ecosystem: %s" % ecosystem}
    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }

    return result
Example #16
def get_component_percentile_rank(ecosystem_backend,
                                  package,
                                  version,
                                  db_session=None):
    """Get component's percentile rank.

    :param ecosystem_backend: str, Ecosystem backend from `f8a_worker.enums.EcosystemBackend`
    :param package: str, Package name
    :param version: str, Package version
    :param db_session: obj, Database session to use for querying
    :return: component's percentile rank, or -1 if the information is not available
    """
    if not db_session:
        storage = StoragePool.get_connected_storage("BayesianPostgres")
        db_session = storage.session

    try:
        rank = db_session.query(ComponentGHUsage.percentile_rank) \
            .filter(ComponentGHUsage.name == package) \
            .filter(ComponentGHUsage.version == version) \
            .filter(ComponentGHUsage.ecosystem_backend == ecosystem_backend) \
            .order_by(desc(ComponentGHUsage.timestamp)) \
            .first()
    except SQLAlchemyError:
        db_session.rollback()
        raise

    if rank:
        return rank[0]
    return 0
Example #17
def iter_unknown_dependencies(storage_pool, node_args):
    """Collect unknown dependencies."""
    # Be safe here as fatal errors will cause errors in Dispatcher
    try:
        aggregated = storage_pool.get('UnknownDependencyFetcherTask')

        arguments = []
        for element in aggregated["result"]:
            epv = element.split(':')
            ecosystem = epv[0]
            if Ecosystem.by_name(
                    StoragePool.get_connected_storage(
                        'BayesianPostgres').session,
                    ecosystem).is_backed_by(EcosystemBackend.maven):
                name = '{}:{}'.format(epv[1], epv[2])
                version = epv[3]
            else:
                name = epv[1]
                version = epv[2]
            analysis_arguments = _create_analysis_arguments(
                ecosystem, name, version)
            # TODO: Remove force=True once data-importer is smart enough
            # to ingest missing packages from s3.
            analysis_arguments.update({"recursive_limit": 0, "force": True})
            arguments.append(analysis_arguments)

        print('Arguments appended: %s' %
              ', '.join(str(item) for item in arguments))
        logger.info("Arguments for next flows: %s" % str(arguments))
        return arguments
    except Exception as e:
        logger.exception(
            "Failed to collect unknown dependencies due to {}".format(e))
        return []
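For clarity, a small sketch of the element formats the epv split above assumes; the sample strings are made up, and the maven check is simplified to a plain string comparison instead of the Ecosystem.by_name lookup used above.

# Non-maven elements: "<ecosystem>:<name>:<version>"                  -> 3 fields
# Maven elements:     "<ecosystem>:<groupId>:<artifactId>:<version>"  -> 4 fields
for element in ['npm:serve-static:1.7.1', 'maven:org.example:lib:2.0.0']:
    epv = element.split(':')
    if epv[0] == 'maven':  # simplified stand-in for the EcosystemBackend check
        name, version = '{}:{}'.format(epv[1], epv[2]), epv[3]
    else:
        name, version = epv[1], epv[2]
    print(epv[0], name, version)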
def retrieve_bookkeeping_all():
    """Retrieve BookKeeping data for all Ecosystems."""
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        data = []
        for e in db.query(Ecosystem).all():
            package_count = _count(
                db,
                db.query(Package).filter(Package.ecosystem == e))
            ecosystem_name = db.query(Ecosystem).get(e.id).name
            pv_count = _count(
                db,
                db.query(Version).join(Package).filter(Package.ecosystem == e))
            entry = {
                "name": ecosystem_name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
            data.append(entry)

        result = {"summary": data}

    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }

    return result
Example #19
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        rdb_session = StoragePool.get_connected_storage(
            'BayesianPostgres').session

        name = arguments['name']
        ecosystem = arguments['ecosystem']
        if ecosystem == 'go':
            name = quote(name, safe='')

        project_url = self.configuration.libraries_io_project_url(
            Ecosystem.by_name(rdb_session, ecosystem), name)
        project = get_response(project_url)
        versions = project['versions']
        details = {
            'dependent_repositories': {
                'count': project['dependent_repos_count']
            },
            'dependents': {
                'count': project['dependents_count']
            },
            'releases': {
                'count': len(versions),
                'recent': self.recent_releases(versions)
            }
        }

        return {'status': 'success', 'summary': [], 'details': details}
    def __init__(self, job_id):
        """Construct the instance of the handler class for given job id."""
        self.log = logging.getLogger(__name__)
        self.job_id = job_id
        # initialize always as the assumption is that we will use it
        self._init_celery()
        self.postgres = StoragePool.get_connected_storage('BayesianPostgres')
Example #21
    def _npm_scan(self, arguments):
        """
        Query Snyk vulndb stored on S3
        """
        s3 = StoragePool.get_connected_storage('S3Snyk')

        try:
            self.log.debug('Retrieving Snyk vulndb from S3')
            vulndb = s3.retrieve_vulndb()
        except Exception:
            self.log.error('Failed to obtain Snyk vulndb database')
            return {'summary': ['Failed to obtain Snyk vulndb database'],
                    'status': 'error',
                    'details': []}

        entries = []
        solver = get_ecosystem_solver(self.storage.get_ecosystem('npm'))
        for entry in vulndb.get('npm', {}).get(arguments['name'], []):
            vulnerable_versions = entry['semver']['vulnerable']
            affected_versions = solver.solve(["{} {}".format(arguments['name'],
                                                             vulnerable_versions)],
                                             all_versions=True)
            if arguments['version'] in affected_versions.get(arguments['name'], []):
                entries.append(self._filter_vulndb_fields(entry))

        return {'summary': [e['id'] for e in entries if e],
                'status': 'success',
                'details': entries}
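For reference, a minimal sketch of the vulndb structure the loop above relies on; only the nesting by ecosystem and package name and the 'id' and 'semver'/'vulnerable' keys come from the code, the values and any extra fields are assumptions.

example_vulndb = {
    'npm': {
        'serve-static': [
            {
                'id': 'npm:serve-static:20150113',   # hypothetical identifier
                'semver': {'vulnerable': '<1.7.1'},  # range handed to the solver
                # further fields are stripped by _filter_vulndb_fields()
            },
        ],
    },
}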
Example #22
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('external_request_id'))

        postgres = StoragePool.get_connected_storage('BayesianPostgres')

        try:
            results = postgres.session.query(StackAnalysisRequest)\
                        .filter(StackAnalysisRequest.id == arguments.get('external_request_id'))\
                        .first()
        except SQLAlchemyError:
            postgres.session.rollback()
            raise

        manifests = []
        if results is not None:
            row = results.to_dict()
            request_json = row.get("requestJson", {})
            manifests = request_json.get('manifest', [])

        return {'manifest': manifests}
    def _run_victims_cve_db_cli(self, arguments):
        """Run Victims CVE DB CLI."""
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        output = []

        with TemporaryDirectory() as temp_victims_db_dir:
            if not s3.retrieve_victims_db_if_exists(temp_victims_db_dir):
                self.log.debug('No Victims CVE DB found on S3, cloning from github')
                self.update_victims_cve_db_on_s3()
                s3.retrieve_victims_db_if_exists(temp_victims_db_dir)

            try:
                cli = os.path.join(temp_victims_db_dir, 'victims-cve-db-cli.py')
                command = [cli, 'search',
                           '--ecosystem', 'java',
                           '--name', arguments['name'],
                           '--version', arguments['version']]
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         is_json=True,
                                                         timeout=60)  # 1 minute
            except TaskError as e:
                self.log.exception(e)

        return output
Example #24
    def execute(self, arguments):
        """Task to mark vulnerable packages in graph.

        :param arguments: dictionary with task arguments
        :return: None
        """
        self._strict_assert(arguments.get('ecosystem'))

        wanted_cves = set(arguments.get('cve_filter', []))
        victims_cls = VictimsDB if not wanted_cves else FilteredVictimsDB

        rdb = StoragePool.get_connected_storage('BayesianPostgres')
        ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

        with victims_cls.build_from_git(wanted=wanted_cves) as db:

            self.log.info('Storing the VictimsDB zip on S3')
            db.store_on_s3()

            vulnerable_packages = self.get_vulnerable_packages(db, ecosystem)
            self.create_in_graph(vulnerable_packages, ecosystem)

            self.mark_in_graph(vulnerable_packages, ecosystem)

            self.notify_gemini(vulnerable_packages, ecosystem)
    @staticmethod
    def update_victims_cve_db_on_s3():
        """Update Victims CVE DB on S3."""
        repo_url = 'https://github.com/victims/victims-cve-db.git'
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        with TemporaryDirectory() as temp_dir:
            Git.clone(repo_url, temp_dir, depth="1")
            s3.store_victims_db(temp_dir)
Example #26
    def __init__(self,
                 ecosystem,
                 name,
                 version,
                 cache_dir,
                 is_temporary=False):
        """Initialize all attributes of the EPVCache during object instantiation.

        :param ecosystem: ecosystem for the given EPV
        :param name: name for the given EPV
        :param version: version of the given EPV
        :param cache_dir: path to dir on the filesystem that should be used for caching artifacts
        :param is_temporary: if True, then objects of certain age will be automatically deleted from
                             the underlying storage
        """
        self.ecosystem = ecosystem
        self.name = name
        self.version = version
        self.cache_dir = cache_dir
        self._eco_obj = None
        storage_name = 'S3Artifacts' if not is_temporary else 'S3TempArtifacts'
        self._s3 = StoragePool.get_connected_storage(storage_name)
        self._postgres = StoragePool.get_connected_storage('BayesianPostgres')
        self._base_object_key = "{ecosystem}/{name}/{version}".format(
            ecosystem=ecosystem, name=name, version=version)
        self._extracted_tarball_dir = os.path.join(
            self.cache_dir, self._EXTRACTED_SOURCE_TARBALL_DIR)
        self._extracted_source_jar_dir = os.path.join(
            self.cache_dir, self._EXTRACTED_SOURCE_JAR_DIR)
        self._pom_xml_path = os.path.join(self.cache_dir, self._POM_XML_NAME)
        self._source_jar_path = os.path.join(self.cache_dir,
                                             self._SOURCE_JAR_NAME)
        self._pom_xml_object_key = "{}/{}".format(self._base_object_key,
                                                  self._POM_XML_NAME)
        self._source_jar_object_key = "{}/{}".format(self._base_object_key,
                                                     self._SOURCE_JAR_NAME)

        # Based on actual tarball name which can vary based on ecosystem - see meta.json
        self._source_tarball_path = None
        self._source_tarball_object_key = None

        # Meta-information about artifact
        self._meta = None
        self._meta_json_object_key = "{}/{}".format(self._base_object_key,
                                                    self._META_JSON_NAME)
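A minimal construction sketch, assuming the surrounding class is the EPVCache named in the docstring and that StoragePool already has the S3 and Postgres adapters configured; the coordinates and cache directory are placeholders.

cache = EPVCache(ecosystem='maven',
                 name='org.example:lib',
                 version='1.0.0',
                 cache_dir='/tmp/epv-cache',  # placeholder path
                 is_temporary=False)          # True selects 'S3TempArtifacts'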
    def _get_package_level_keywords(self, ecosystem, name):
        """Retrieve all package level keywords for the given package."""
        package_postgres = StoragePool.get_connected_storage('PackagePostgres')

        self.log.debug(
            "Retrieving results of 'keywords_tagging' on package level")
        task_result = package_postgres.get_latest_task_result(
            ecosystem, name, 'keywords_tagging')
        return task_result.get('details', {}) if task_result else {}
    def execute(self, repositories, ecosystem, bucket_name, object_key):
        """ Aggregate package names from GitHub manifests.

        :param repositories: a list of repositories
        :param ecosystem: ecosystem, will appear in the resulting JSON file
        :param bucket_name: name of the bucket where to put the resulting JSON file
        :param object_key: object key of the resulting JSON file
        """

        s3 = StoragePool.get_connected_storage('S3GitHubManifestMetadata')

        package_list = []
        tagger_list = []
        for repo in repositories:

            try:
                repo_ecosystem = repo['ecosystem']
                repo_name = repo['repo_name']
            except KeyError:
                self.log.error('Invalid configuration, skipping: {config}'.format(
                    config=str(repo)))
                continue

            try:
                obj = '{e}/{repo_name}/dependency_snapshot.json'.format(
                    e=repo_ecosystem, repo_name=repo_name.replace('/', ':'))
                dependency_snapshot = s3.retrieve_dict(obj)

                dependencies = dependency_snapshot.get('details', {}).get('runtime', [])

                packages = list({x.get('name') for x in dependencies})
                if packages:
                    package_list.append(packages)

                packages_version = dict([(x.get("name"), x.get("version")) for x in dependencies])
                if packages_version:
                    extracted_tagger_list = self._create_tagger_list(ecosystem, packages_version)
                    for etl in extracted_tagger_list:
                        tagger_list.append(etl)

            except Exception as e:
                self.log.error('Unable to collect dependencies for {repo_name}: {reason}'.format(
                    repo_name=repo_name, reason=str(e)))
                continue

        results = {
            'ecosystem': ecosystem,
            'package_list': package_list
        }

        self.log.info("Storing aggregated list of packages in S3")

        s3_dest = AmazonS3(bucket_name=bucket_name)
        s3_dest.connect()
        s3_dest.store_dict(results, object_key)
        s3_dest.store_dict(tagger_list, "tagger_list" + object_key)
Example #29
def _create_analysis_arguments(ecosystem, name, version):
    """Create arguments for analysis."""
    return {
        'ecosystem': ecosystem,
        'name': MavenCoordinates.normalize_str(name) if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(
            EcosystemBackend.maven) else name,
        'version': version
    }
    def execute(self, arguments):
        """

        :param arguments: optional argument 'only_already_scanned' to run only on already analysed packages
        :return: EPV dict describing which packages should be analysed
        """
        only_already_scanned = arguments.pop('only_already_scanned',
                                             True) if arguments else True
        ignore_modification_time = arguments.pop('ignore_modification_time',
                                                 False) if arguments else False
        self._strict_assert(not arguments)

        s3 = StoragePool.get_connected_storage('S3OWASPDepCheck')
        with tempdir() as temp_data_dir:
            s3.retrieve_depcheck_db_if_exists(temp_data_dir)
            self._update_dep_check_db(temp_data_dir)
            s3.store_depcheck_db(temp_data_dir)

        cve_db = self._get_snyk_vulndb()
        s3 = StoragePool.get_connected_storage('S3Snyk')
        s3.store_vulndb(cve_db)
        last_sync_datetime = s3.update_sync_date()

        to_update = []
        for package_name, cve_records in cve_db.get('npm', {}).items():
            for record in cve_records:
                modification_time = datetime_parser.parse(
                    record['modificationTime'])

                if ignore_modification_time or modification_time >= last_sync_datetime:
                    affected_versions = self._get_versions_to_scan(
                        package_name, record['semver']['vulnerable'],
                        only_already_scanned)

                    for version in affected_versions:
                        to_update.append({
                            'ecosystem': 'npm',
                            'name': package_name,
                            'version': version
                        })

        return {'modified': to_update}