def _processing(self, kronos_bucket=None):
    """Append new data for Kronos training.

    :param kronos_bucket: The source where data is to be added.
    """
    try:
        if kronos_bucket:
            s3 = StoragePool.get_connected_storage('AmazonS3')
            s3.bucket_name = kronos_bucket
        else:
            s3 = StoragePool.get_connected_storage('S3KronosAppend')

        result = self._execute_query(self._generate_query()).fetchall()
        result_len = len(result)
        self.log.info("Query executed.")
        self.log.info("Number of results = {}".format(result_len))
        if result_len > 0:
            for each_row in result:
                package_list = []
                if len(each_row) != 2 or each_row[0] != self.ecosystem:
                    continue
                for dep in each_row[1]:
                    package_name = dep.get('package')
                    if package_name:
                        package_list.append(package_name)
                        self.unique_packages.add(package_name)
                self.extra_manifest_list.append(package_list)
            self._append_manifest(s3)
            self._append_package_topic(s3)
            self.log.info("User Input Stacks appended.")
    except Exception as e:
        self.log.exception(
            'Unable to append input stack for ecosystem {ecosystem}: {reason}'
            .format(ecosystem=self.ecosystem, reason=str(e)))

def post():
    """Store input data (user intent or manual tagging) on S3."""
    input_json = request.get_json()
    if not input_json:
        raise HTTPError(400, error="Expected JSON request")

    if 'manual_tagging' not in input_json:
        if 'ecosystem' not in input_json:
            raise HTTPError(400, error="Expected ecosystem in the request")
        if 'data' not in input_json:
            raise HTTPError(400, error="Expected data in the request")
        s3 = StoragePool.get_connected_storage('S3UserIntent')
        # Store data
        return s3.store_master_tags(input_json)
    else:
        if 'user' not in input_json:
            raise HTTPError(400, error="Expected user name in the request")
        if 'data' not in input_json:
            raise HTTPError(400, error="Expected tags in the request")
        s3 = StoragePool.get_connected_storage('S3ManualTagging')
        # Store data
        return s3.store_user_data(input_json)

def _package_level_keywords(self, keywords_file_name, stopwords_file_name, arguments):
    """Compute package level keywords."""
    # Keep f8a_tagger import local as other components dependent on
    # f8a_worker do not require it installed.
    from f8a_tagger import lookup_readme as keywords_lookup_readme
    from f8a_tagger import lookup_text as keywords_lookup_text

    details = {}

    package_postgres = StoragePool.get_connected_storage('PackagePostgres')
    gh_info = package_postgres.get_task_result_by_analysis_id(arguments['ecosystem'],
                                                              arguments['name'],
                                                              'github_details',
                                                              arguments['document_id'])
    if gh_info:
        self.log.debug("Aggregating explicitly stated keywords (topics) on GitHub")
        details['gh_topics'] = gh_info.get('details', {}).get('topics', [])

    s3_readme = StoragePool.get_connected_storage('S3Readme')
    try:
        readme_json = s3_readme.retrieve_readme_json(arguments['ecosystem'],
                                                     arguments['name'])
        if readme_json:
            self.log.debug("Computing keywords from README.json")
            details['README'] = keywords_lookup_readme(readme_json,
                                                       keywords_file=keywords_file_name,
                                                       stopwords_file=stopwords_file_name,
                                                       **self._LOOKUP_CONF)
    except Exception as exc:
        self.log.info("Failed to retrieve README: %s", str(exc))

    s3_rd = StoragePool.get_connected_storage('S3RepositoryDescription')
    try:
        description = s3_rd.retrieve_repository_description(arguments['ecosystem'],
                                                            arguments['name'])
        if description:
            self.log.debug("Computing keywords on description from repository")
            details['repository_description'] = keywords_lookup_text(
                description,
                keywords_file=keywords_file_name,
                stopwords_file=stopwords_file_name,
                **self._LOOKUP_CONF)
    except Exception as exc:
        self.log.info("Failed to retrieve repository description: %s", str(exc))

    if self.task_name == 'package_keywords_tagging':
        # We are tagging on package level, add also tags that are found in package name
        name_parts = re.split(r'[.\-_:]', arguments['name'])
        self.log.debug("Computing keywords from package name %s", name_parts)
        details['package_name'] = keywords_lookup_text(" ".join(name_parts),
                                                       keywords_file=keywords_file_name,
                                                       stopwords_file=stopwords_file_name,
                                                       **self._LOOKUP_CONF)

    return details

def run(self, arguments):
    """Run task."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('document_id'))

    postgres = StoragePool.get_connected_storage('PackagePostgres')
    results = postgres.get_analysis_by_id(arguments['document_id'])

    return self.do_run(arguments, StoragePool.get_connected_storage('S3PackageData'),
                       postgres, results)

def wrapper(*args, **kwargs):
    """Wrap sqlalchemy code into try-except."""
    try:
        data = func(*args, **kwargs)
        result = {"summary": data}
    except NoResultFound:
        result = {"error": "No result found."}
    except SQLAlchemyError:
        StoragePool.get_connected_storage('BayesianPostgres').session.rollback()
        result = {"error": "SQLAlchemyError encountered while fetching data. "
                           "Rolling back. Try again."}
    return result

def store_error(self, node_args, flow_name, task_name, task_id, exc_info):
    """Store error to Postgres/RDS so we know about task failures that store data on S3."""
    if flow_name == 'bayesianAnalysisFlow':
        postgres = StoragePool.get_connected_storage('BayesianPostgres')
    elif flow_name == 'bayesianPackageAnalysisFlow':
        postgres = StoragePool.get_connected_storage('PackagePostgres')
    else:
        raise RuntimeError(
            "Unable to store error, error storing not defined for flow '%s'" % flow_name)

    return postgres.store_error(node_args, flow_name, task_name, task_id, exc_info)

def run(self, arguments):
    """Run task."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('document_id'))

    postgres = StoragePool.get_connected_storage('BayesianPostgres')
    results = postgres.get_analysis_by_id(arguments['document_id'])

    return self.do_run(arguments, StoragePool.get_connected_storage('S3Data'),
                       postgres, results)

def run(self, arguments):
    """Run task."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('repo_name'))
    self._strict_assert(arguments.get('document_id'))

    s3 = StoragePool.get_connected_storage('S3GitHubManifestMetadata')
    postgres = StoragePool.get_connected_storage('BayesianPostgres')
    results = postgres.get_analysis_by_id(arguments['document_id'])

    for worker_result in results.raw_analyses:
        # Skip auxiliary tasks (e.g. InitGitHubManifestMetadata)
        if worker_result.worker[0].isupper():
            continue

        # Retrieve raw manifest file and store it in S3
        if worker_result.worker == 'metadata':
            task_result = worker_result.task_result
            for detail in task_result.get('details', []):
                if detail.get('path', None):
                    manifest_url = urllib.parse.urljoin(
                        self.GITHUB_CONTENT_URL,
                        arguments['repo_name'] + '/' + detail['path'])
                    manifest_name = detail['path'].split('/', 1)[1]
                    response = requests.get(manifest_url)
                    if response.status_code == 200:
                        s3.store_raw_manifest(arguments['ecosystem'],
                                              arguments['repo_name'],
                                              manifest_name,
                                              response.content)
                    else:
                        self.log.error('Unable to retrieve manifest file from %s',
                                       manifest_url)
            continue

        result_name = worker_result.worker
        if result_name.startswith('gh_most_starred_'):
            result_name = result_name[len('gh_most_starred_'):]

        version_id = s3.store(arguments, self.flow_name, self.task_name, self.task_id,
                              (result_name, worker_result.task_result))
        worker_result.task_result = {'version_id': version_id}

    try:
        postgres.session.commit()
    except SQLAlchemyError:
        postgres.session.rollback()
        raise

def execute(self, ecosystem):
    """Process raw-tags and update existing package_topic.json file on S3.

    :param ecosystem: Name of ecosystem
    """
    s3 = StoragePool.get_connected_storage('S3CrowdSourceTags')
    package_topic = []
    try:
        package_topic = s3.retrieve_package_topic(ecosystem)
    except ClientError:
        self.log.error("Unable to retrieve package_topic.json for %s", ecosystem)

    results = {}
    for record in package_topic:
        if record.get("ecosystem") == ecosystem and record.get("package_topic_map"):
            results = record["package_topic_map"]
    if not results:
        self.log.error("Unable to retrieve package_topic_map for %s", ecosystem)

    results = self._update_tags_from_graph(ecosystem=ecosystem, results=results)
    s3.store_package_topic(ecosystem, results)
    self.log.debug("The file crowd_sourcing_package_topic.json "
                   "has been stored for %s", ecosystem)

def get_component_percentile_rank(ecosystem_backend, package, version, db_session=None):
    """Get component's percentile rank.

    :param ecosystem_backend: str, Ecosystem backend from `cucoslib.enums.EcosystemBackend`
    :param package: str, Package name
    :param version: str, Package version
    :param db_session: obj, Database session to use for querying
    :return: component's percentile rank, or -1 if the information is not available
    """
    try:
        if not db_session:
            storage = StoragePool.get_connected_storage("BayesianPostgres")
            db_session = storage.session

        rank = db_session.query(ComponentGHUsage.percentile_rank) \
            .filter(ComponentGHUsage.name == package) \
            .filter(ComponentGHUsage.version == version) \
            .filter(ComponentGHUsage.ecosystem_backend == ecosystem_backend) \
            .order_by(desc(ComponentGHUsage.timestamp)) \
            .first()
    except SQLAlchemyError:
        epv = '{e}/{p}/{v}'.format(e=ecosystem_backend, p=package, v=version)
        logger.exception('Unable to retrieve percentile_rank for {epv}'.format(epv=epv))
        return -1

    if rank is None or len(rank) == 0:
        return 0

    return rank[0]

def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    ver = arguments['version']
    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        if not Ecosystem.by_name(
                StoragePool.get_connected_storage('BayesianPostgres').session,
                eco).is_backed_by(EcosystemBackend.maven):
            self.log.error('Could not get sources for package {e}/{p}/{v}'.format(
                e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v}, '
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    result_data = self.run_scancode(cache_path)
    return result_data

def get_analysis_by_id(ecosystem, package, version, analysis_id, db_session=None):
    """Get result of previously scheduled analysis for given EPV triplet by analysis ID.

    :param ecosystem: str, Ecosystem name
    :param package: str, Package name
    :param version: str, Package version
    :param analysis_id: str, ID of analysis
    :param db_session: obj, Database session to use for querying
    :return: analysis result
    """
    if not db_session:
        storage = StoragePool.get_connected_storage("BayesianPostgres")
        db_session = storage.session

    if ecosystem == 'maven':
        package = MavenCoordinates.normalize_str(package)

    found = db_session.query(Analysis).\
        join(Version).join(Package).join(Ecosystem).\
        filter(Ecosystem.name == ecosystem).\
        filter(Package.name == package).\
        filter(Version.identifier == version).\
        filter(Analysis.id == analysis_id).\
        one()

    return found

def _use_maven_index_checker(self):
    """Run maven-index-checker and schedule analyses for the reported releases."""
    maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
    target_dir = os.path.join(maven_index_checker_dir, 'target')

    s3 = StoragePool.get_connected_storage('S3MavenIndex')
    self.log.info('Fetching pre-built maven index from S3, if available.')
    s3.retrieve_index_if_exists(target_dir)

    index_range = '{}-{}'.format(self.count.min, self.count.max)
    command = ['java', '-Xmx768m', '-jar', 'maven-index-checker.jar', '-r', index_range]
    with cwd(maven_index_checker_dir):
        output = TimedCommand.get_command_output(command,
                                                 is_json=True,
                                                 graceful=False,
                                                 timeout=1200)
        for idx, release in enumerate(output):
            name = '{}:{}'.format(release['groupId'], release['artifactId'])
            version = release['version']
            self.log.info("Scheduling #%d.", self.count.min + idx)
            self.analyses_selinon_flow(name, version)

    # index checker should clean up these dirs in /temp/ after itself, but better be sure
    for mindexerdir in glob.glob(os.path.join(gettempdir(),
                                              'mindexer-ctxcentral-context*')):
        rmtree(mindexerdir)

    self.log.info('Storing pre-built maven index to S3')
    s3.store_index(target_dir)
    central_index_dir = os.path.join(target_dir, 'central-index')
    rmtree(central_index_dir)

def get_dependents_count(ecosystem_backend, package, version, db_session=None):
    """Get number of GitHub projects dependent on given (package, version).

    :param ecosystem_backend: str, Ecosystem backend from `f8a_worker.enums.EcosystemBackend`
    :param package: str, Package name
    :param version: str, Package version
    :param db_session: obj, Database session to use for querying
    :return: number of dependent projects, or -1 if the information is not available
    """
    if not db_session:
        storage = StoragePool.get_connected_storage("BayesianPostgres")
        db_session = storage.session

    try:
        count = db_session.query(ComponentGHUsage.count) \
            .filter(ComponentGHUsage.name == package) \
            .filter(ComponentGHUsage.version == version) \
            .filter(ComponentGHUsage.ecosystem_backend == ecosystem_backend) \
            .order_by(desc(ComponentGHUsage.timestamp)) \
            .first()
    except SQLAlchemyError:
        db_session.rollback()
        raise

    if count:
        return count[0]
    return -1

def retrieve_bookkeeping_for_ecosystem(ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        package_count = _count(db, db.query(Package).filter(Package.ecosystem == e))
        pv_count = _count(db,
                          db.query(Version).join(Package).filter(Package.ecosystem == e))
        result = {
            "summary": {
                "ecosystem": e.name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
        }
    except NoResultFound:
        result = {"error": "No such ecosystem: %s" % ecosystem}
    except SQLAlchemyError:
        result = {"error": "Error encountered while fetching data. Please check logs."}
    return result

def get_component_percentile_rank(ecosystem_backend, package, version, db_session=None):
    """Get component's percentile rank.

    :param ecosystem_backend: str, Ecosystem backend from `f8a_worker.enums.EcosystemBackend`
    :param package: str, Package name
    :param version: str, Package version
    :param db_session: obj, Database session to use for querying
    :return: component's percentile rank, or 0 if the information is not available
    """
    if not db_session:
        storage = StoragePool.get_connected_storage("BayesianPostgres")
        db_session = storage.session

    try:
        rank = db_session.query(ComponentGHUsage.percentile_rank) \
            .filter(ComponentGHUsage.name == package) \
            .filter(ComponentGHUsage.version == version) \
            .filter(ComponentGHUsage.ecosystem_backend == ecosystem_backend) \
            .order_by(desc(ComponentGHUsage.timestamp)) \
            .first()
    except SQLAlchemyError:
        db_session.rollback()
        raise

    if rank:
        return rank[0]
    return 0

def iter_unknown_dependencies(storage_pool, node_args):
    """Collect unknown dependencies."""
    # Be safe here as fatal errors will cause errors in Dispatcher
    try:
        aggregated = storage_pool.get('UnknownDependencyFetcherTask')
        arguments = []
        for element in aggregated["result"]:
            epv = element.split(':')
            ecosystem = epv[0]
            if Ecosystem.by_name(
                    StoragePool.get_connected_storage('BayesianPostgres').session,
                    ecosystem).is_backed_by(EcosystemBackend.maven):
                name = '{}:{}'.format(epv[1], epv[2])
                version = epv[3]
            else:
                name = epv[1]
                version = epv[2]
            analysis_arguments = _create_analysis_arguments(ecosystem, name, version)
            # TODO: Remove force=True once data-importer is smart enough
            # to ingest missing packages from s3.
            analysis_arguments.update({"recursive_limit": 0, "force": True})
            arguments.append(analysis_arguments)
            print('Arguments appended: %s' % ', '.join(str(item) for item in arguments))
        logger.info("Arguments for next flows: %s" % str(arguments))
        return arguments
    except Exception as e:
        logger.exception("Failed to collect unknown dependencies due to {}".format(e))
        return []

def retrieve_bookkeeping_all():
    """Retrieve BookKeeping data for all Ecosystems."""
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        data = []
        for e in db.query(Ecosystem).all():
            package_count = _count(db, db.query(Package).filter(Package.ecosystem == e))
            ecosystem_name = db.query(Ecosystem).get(e.id).name
            pv_count = _count(db,
                              db.query(Version).join(Package).filter(Package.ecosystem == e))
            entry = {
                "name": ecosystem_name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
            data.append(entry)
        result = {"summary": data}
    except SQLAlchemyError:
        result = {"error": "Error encountered while fetching data. Please check logs."}
    return result

def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))

    rdb_session = StoragePool.get_connected_storage('BayesianPostgres').session
    name = arguments['name']
    ecosystem = arguments['ecosystem']
    if ecosystem == 'go':
        name = quote(name, safe='')
    project_url = self.configuration.libraries_io_project_url(
        Ecosystem.by_name(rdb_session, ecosystem), name)
    project = get_response(project_url)
    versions = project['versions']
    details = {
        'dependent_repositories': {
            'count': project['dependent_repos_count']
        },
        'dependents': {
            'count': project['dependents_count']
        },
        'releases': {
            'count': len(versions),
            'recent': self.recent_releases(versions)
        }
    }
    return {'status': 'success', 'summary': [], 'details': details}

def __init__(self, job_id):
    """Construct the instance of the handler class for given job id."""
    self.log = logging.getLogger(__name__)
    self.job_id = job_id
    # initialize always as the assumption is that we will use it
    self._init_celery()
    self.postgres = StoragePool.get_connected_storage('BayesianPostgres')

def _npm_scan(self, arguments):
    """Query Snyk vulndb stored on S3."""
    s3 = StoragePool.get_connected_storage('S3Snyk')

    try:
        self.log.debug('Retrieving Snyk vulndb from S3')
        vulndb = s3.retrieve_vulndb()
    except Exception:
        self.log.error('Failed to obtain Snyk vulndb database')
        return {'summary': ['Failed to obtain Snyk vulndb database'],
                'status': 'error',
                'details': []}

    entries = []
    solver = get_ecosystem_solver(self.storage.get_ecosystem('npm'))
    for entry in vulndb.get('npm', {}).get(arguments['name'], []):
        vulnerable_versions = entry['semver']['vulnerable']
        affected_versions = solver.solve(
            ["{} {}".format(arguments['name'], vulnerable_versions)],
            all_versions=True)
        if arguments['version'] in affected_versions.get(arguments['name'], []):
            entries.append(self._filter_vulndb_fields(entry))

    return {'summary': [e['id'] for e in entries if e],
            'status': 'success',
            'details': entries}

def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('external_request_id'))

    postgres = StoragePool.get_connected_storage('BayesianPostgres')
    try:
        results = postgres.session.query(StackAnalysisRequest)\
            .filter(StackAnalysisRequest.id == arguments.get('external_request_id'))\
            .first()
    except SQLAlchemyError:
        postgres.session.rollback()
        raise

    manifests = []
    if results is not None:
        row = results.to_dict()
        request_json = row.get("requestJson", {})
        manifests = request_json.get('manifest', [])

    return {'manifest': manifests}

def _run_victims_cve_db_cli(self, arguments):
    """Run Victims CVE DB CLI."""
    s3 = StoragePool.get_connected_storage('S3VulnDB')
    output = []

    with TemporaryDirectory() as temp_victims_db_dir:
        if not s3.retrieve_victims_db_if_exists(temp_victims_db_dir):
            self.log.debug('No Victims CVE DB found on S3, cloning from github')
            self.update_victims_cve_db_on_s3()
            s3.retrieve_victims_db_if_exists(temp_victims_db_dir)

        try:
            cli = os.path.join(temp_victims_db_dir, 'victims-cve-db-cli.py')
            command = [cli, 'search',
                       '--ecosystem', 'java',
                       '--name', arguments['name'],
                       '--version', arguments['version']]
            output = TimedCommand.get_command_output(command,
                                                     graceful=False,
                                                     is_json=True,
                                                     timeout=60)  # 1 minute
        except TaskError as e:
            self.log.exception(e)

    return output

def execute(self, arguments):
    """Task to mark vulnerable packages in graph.

    :param arguments: dictionary with task arguments
    :return: None
    """
    self._strict_assert(arguments.get('ecosystem'))

    wanted_cves = set(arguments.get('cve_filter', []))
    victims_cls = VictimsDB if not wanted_cves else FilteredVictimsDB

    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

    with victims_cls.build_from_git(wanted=wanted_cves) as db:
        self.log.info('Storing the VictimsDB zip on S3')
        db.store_on_s3()

        vulnerable_packages = self.get_vulnerable_packages(db, ecosystem)
        self.create_in_graph(vulnerable_packages, ecosystem)
        self.mark_in_graph(vulnerable_packages, ecosystem)
        self.notify_gemini(vulnerable_packages, ecosystem)

def update_victims_cve_db_on_s3():
    """Update Victims CVE DB on S3."""
    repo_url = 'https://github.com/victims/victims-cve-db.git'
    s3 = StoragePool.get_connected_storage('S3VulnDB')

    with TemporaryDirectory() as temp_dir:
        Git.clone(repo_url, temp_dir, depth="1")
        s3.store_victims_db(temp_dir)

def __init__(self, ecosystem, name, version, cache_dir, is_temporary=False):
    """Initialize all attributes of the EPVCache during object instantiation.

    :param ecosystem: ecosystem for the given EPV
    :param name: name for the given EPV
    :param version: version of the given EPV
    :param cache_dir: path to dir on the filesystem that should be used for caching artifacts
    :param is_temporary: if True, then objects of certain age will be automatically deleted
        from the underlying storage
    """
    self.ecosystem = ecosystem
    self.name = name
    self.version = version
    self.cache_dir = cache_dir
    self._eco_obj = None
    storage_name = 'S3Artifacts' if not is_temporary else 'S3TempArtifacts'
    self._s3 = StoragePool.get_connected_storage(storage_name)
    self._postgres = StoragePool.get_connected_storage('BayesianPostgres')
    self._base_object_key = "{ecosystem}/{name}/{version}".format(
        ecosystem=ecosystem, name=name, version=version)
    self._extracted_tarball_dir = os.path.join(self.cache_dir,
                                               self._EXTRACTED_SOURCE_TARBALL_DIR)
    self._extracted_source_jar_dir = os.path.join(self.cache_dir,
                                                  self._EXTRACTED_SOURCE_JAR_DIR)
    self._pom_xml_path = os.path.join(self.cache_dir, self._POM_XML_NAME)
    self._source_jar_path = os.path.join(self.cache_dir, self._SOURCE_JAR_NAME)
    self._pom_xml_object_key = "{}/{}".format(self._base_object_key, self._POM_XML_NAME)
    self._source_jar_object_key = "{}/{}".format(self._base_object_key,
                                                 self._SOURCE_JAR_NAME)
    # Based on actual tarball name which can vary based on ecosystem - see meta.json
    self._source_tarball_path = None
    self._source_tarball_object_key = None
    # Meta-information about artifact
    self._meta = None
    self._meta_json_object_key = "{}/{}".format(self._base_object_key, self._META_JSON_NAME)

def _get_package_level_keywords(self, ecosystem, name):
    """Retrieve all package level keywords for the given package."""
    package_postgres = StoragePool.get_connected_storage('PackagePostgres')
    self.log.debug("Retrieving results of 'keywords_tagging' on package level")
    task_result = package_postgres.get_latest_task_result(ecosystem, name,
                                                          'keywords_tagging')
    return task_result.get('details', {}) if task_result else {}

def execute(self, repositories, ecosystem, bucket_name, object_key):
    """Aggregate package names from GitHub manifests.

    :param repositories: a list of repositories
    :param ecosystem: ecosystem, will appear in the resulting JSON file
    :param bucket_name: name of the bucket where to put the resulting JSON file
    :param object_key: object key of the resulting JSON file
    """
    s3 = StoragePool.get_connected_storage('S3GitHubManifestMetadata')

    package_list = []
    tagger_list = []
    for repo in repositories:
        try:
            repo_ecosystem = repo['ecosystem']
            repo_name = repo['repo_name']
        except KeyError:
            # missing keys mean a misconfigured repository entry
            self.log.error('Invalid configuration, skipping: {config}'.format(
                config=str(repo)))
            continue

        try:
            obj = '{e}/{repo_name}/dependency_snapshot.json'.format(
                e=repo_ecosystem, repo_name=repo_name.replace('/', ':'))
            dependency_snapshot = s3.retrieve_dict(obj)
            dependencies = dependency_snapshot.get('details', {}).get('runtime', [])
            packages = list({x.get('name') for x in dependencies})
            if packages:
                package_list.append(packages)
            packages_version = dict([(x.get("name"), x.get("version"))
                                     for x in dependencies])
            if packages_version:
                extracted_tagger_list = self._create_tagger_list(ecosystem,
                                                                 packages_version)
                for etl in extracted_tagger_list:
                    tagger_list.append(etl)
        except Exception as e:
            self.log.error('Unable to collect dependencies for {repo_name}: {reason}'.format(
                repo_name=repo_name, reason=str(e)))
            continue

    results = {
        'ecosystem': ecosystem,
        'package_list': package_list
    }
    self.log.info("Storing aggregated list of packages in S3")
    s3_dest = AmazonS3(bucket_name=bucket_name)
    s3_dest.connect()
    s3_dest.store_dict(results, object_key)
    s3_dest.store_dict(tagger_list, "tagger_list" + object_key)

def _create_analysis_arguments(ecosystem, name, version):
    """Create arguments for analysis."""
    return {
        'ecosystem': ecosystem,
        'name': MavenCoordinates.normalize_str(name) if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(EcosystemBackend.maven) else name,
        'version': version
    }

def execute(self, arguments):
    """Task code.

    :param arguments: optional argument 'only_already_scanned' to run only on
        already analysed packages
    :return: EPV dict describing which packages should be analysed
    """
    only_already_scanned = arguments.pop('only_already_scanned', True) if arguments else True
    ignore_modification_time = arguments.pop('ignore_modification_time', False) \
        if arguments else False

    self._strict_assert(not arguments)

    s3 = StoragePool.get_connected_storage('S3OWASPDepCheck')
    with tempdir() as temp_data_dir:
        s3.retrieve_depcheck_db_if_exists(temp_data_dir)
        self._update_dep_check_db(temp_data_dir)
        s3.store_depcheck_db(temp_data_dir)

    cve_db = self._get_snyk_vulndb()
    s3 = StoragePool.get_connected_storage('S3Snyk')
    s3.store_vulndb(cve_db)
    last_sync_datetime = s3.update_sync_date()

    to_update = []
    for package_name, cve_records in cve_db.get('npm', {}).items():
        for record in cve_records:
            modification_time = datetime_parser.parse(record['modificationTime'])
            if ignore_modification_time or modification_time >= last_sync_datetime:
                affected_versions = self._get_versions_to_scan(
                    package_name,
                    record['semver']['vulnerable'],
                    only_already_scanned)
                for version in affected_versions:
                    to_update.append({'ecosystem': 'npm',
                                      'name': package_name,
                                      'version': version})

    return {'modified': to_update}