def _strict_assert(cls, assert_cond):
    """Assert on condition.

    If condition is False, fatal error is raised so task is not retried.
    """
    if not assert_cond:
        raise FatalTaskError("Strict assert failed in task '%s'" % cls.__name__)
def _python_scan(self, arguments):
    """Run OWASP dependency-check experimental analyzer for Python artifacts.

    https://jeremylong.github.io/DependencyCheck/analyzers/python.html
    """
    extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    # dependency-check needs to be pointed to a specific file, we can't just scan whole directory
    egg_info = pkg_info = metadata = None
    for root, _, files in os.walk(extracted_tarball):
        if root.endswith('.egg-info') or root.endswith('.dist-info'):
            egg_info = root
        if 'PKG-INFO' in files:
            pkg_info = os.path.join(root, 'PKG-INFO')
        if 'METADATA' in files:
            metadata = os.path.join(root, 'METADATA')

    scan_path = egg_info or pkg_info or metadata
    if pkg_info and not egg_info:
        # Work-around for dependency-check ignoring PKG-INFO outside .dist-info/
        # https://github.com/jeremylong/DependencyCheck/issues/896
        egg_info_dir = os.path.join(extracted_tarball, arguments['name'] + '.egg-info')
        try:
            os.mkdir(egg_info_dir)
            copy(pkg_info, egg_info_dir)
            scan_path = egg_info_dir
        except os.error:
            self.log.warning('Failed to copy %s to %s', pkg_info, egg_info_dir)

    if not scan_path:
        raise FatalTaskError('File types not supported by OWASP dependency-check')

    return self._run_owasp_dep_check(scan_path, experimental=True)
def execute(self, arguments):
    """Run oscryptocatcher tool for matching crypto algorithms."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    results = {'status': 'unknown', 'summary': {}, 'details': []}
    try:
        oscc = TimedCommand.get_command_output(
            ['oscryptocatcher', '--subdir-in-result', cache_path],
            graceful=False, is_json=True)
        self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
        results['details'] = oscc['details']
        results['summary'] = oscc['summary']
        results['status'] = 'success'
    except Exception as e:
        raise FatalTaskError('oscryptocatcher failed') from e

    return results
def _scrape_page(url):
    """Web scrape URL."""
    response = requests.get(url)
    if response.status_code != 200:
        raise FatalTaskError("Unable to access package web page at '%s'" % url)

    return BeautifulSoup(response.text, 'lxml')
def _handle_external_deps(ecosystem, deps):
    """Resolve external dependency specifications."""
    if not ecosystem or not deps:
        return []
    solver = get_ecosystem_solver(ecosystem)
    try:
        versions = solver.solve(deps)
    except Exception as exc:
        raise FatalTaskError("Dependencies could not be resolved: '{}'".format(deps)) from exc
    return [{"package": k, "version": v} for k, v in versions.items()]
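# A self-contained sketch of the contract _handle_external_deps relies on; the
# stub solver below stands in for get_ecosystem_solver() and the pinned
# versions are hypothetical, not real resolution results.
class _StubSolver:
    def solve(self, deps):
        # pretend every specification resolves to a single pinned version
        return {dep.split(' ')[0]: '1.0.0' for dep in deps}

_versions = _StubSolver().solve(['requests >=2.20', 'six'])
print([{"package": k, "version": v} for k, v in _versions.items()])
# -> [{'package': 'requests', 'version': '1.0.0'}, {'package': 'six', 'version': '1.0.0'}]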
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)
    arguments.pop('document_id', None)

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    package = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    url = self.get_upstream_url(arguments)
    upstream = self.get_upstream_entry(package, url)
    if upstream is None:
        upstream = self.add_or_update_upstream(package, url)
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # we can potentially schedule two flows of the same type at the same
        # time as there is no lock, but let's say that's OK
        if upstream.updated_at is not None \
                and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info('Skipping upstream package check as data are considered recent - '
                          'last update %s.', upstream.updated_at)
            # keep track of the start, but do not schedule anything more;
            # discard changes such as updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(
        package_id=package.id,
        started_at=datetime.datetime.utcnow(),
        finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.utcnow()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
def collect_npm(self, name):
    """Collect plain text description from npmjs.com for the given package.

    :param name: package name for which the plain text description should be gathered
    :return: plain text description
    """
    url = self._NPM_PACKAGE_URL.format(package=name)
    content = self._scrape_page(url).find(class_='content-column')
    if not content:
        raise FatalTaskError("No content was found at '%s' for NPM package '%s'"
                             % (url, name))
    return content.text
def execute(self, arguments):
    """Dispatch to the description collector registered for the given ecosystem."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))

    collector = self._COLLECTOR_HANDLERS.get(arguments['ecosystem'])
    if not collector:
        raise FatalTaskError("No repository description collector registered for ecosystem '%s'"
                             % arguments['ecosystem'])

    # TODO: we should probably do some additional post-processing later
    return collector(self, arguments['name'])
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))
    self._strict_assert(isinstance(arguments.get('version'), str))

    if arguments['ecosystem'] not in _SUPPORTED_ECOSYSTEMS:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    return arguments
def collect_npm(self, name):
    """Collect plain text description from npmjs.com for the given package.

    :param name: package name for which the plain text description should be gathered
    :return: plain text description
    """
    url = self._NPM_PACKAGE_URL.format(package=name)
    content = self._scrape_page(url).body
    if not content:
        raise FatalTaskError("No content was found at '%s' for NPM package '%s'"
                             % (url, name))

    # rip out all script and style elements
    for script in content(["script", "style"]):
        script.extract()

    return content.text
def run_gofedlib(self, topdir, timeout):
    """Run gofedlib-cli to extract dependencies from golang sources."""
    tc = TimedCommand(['gofedlib-cli', '--dependencies-main', '--dependencies-packages',
                       '--dependencies-test', '--skip-errors', topdir])
    status, data, err = tc.run(timeout=timeout)
    if status:
        raise FatalTaskError('gofedlib-cli failed: {err}'.format(err=err))

    result = json.loads(data[0])
    main_deps_count = len(result.get('deps-main', []))
    packages_count = len(result.get('deps-packages', []))
    self.log.debug('gofedlib found %i dependencies', main_deps_count + packages_count)

    return [{'ecosystem': 'gofedlib', 'result': result}]
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    result_data = {'status': 'unknown', 'summary': [], 'details': []}
    source_tarball_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
    sa = StaticAnalysis(source_tarball_path)

    try:
        analysis_result = sa.analyze()

        # make output reproducible - scanning the same input multiple times
        # should always produce the same output
        del analysis_result["scan"]["time-created"]
        del analysis_result["scan"]["time-finished"]
        del analysis_result["scan"]["host"]
        del analysis_result["scan"]["store-results-to"]

        stats = {}
        for defect in analysis_result["defects"]:
            stats.setdefault(defect["checker"], {"count": 0})
            stats[defect["checker"]]["count"] += 1
            try:
                stats[defect["checker"]]["cwe"] = defect["cwe"]
            except KeyError:
                pass

        result_data['summary'] = stats
        result_data['status'] = 'success'
        result_data['details'] = analysis_result
    except Exception as ex:
        msg = "static analysis was not successful: %r" % ex
        self.log.error(msg)
        raise FatalTaskError(msg) from ex

    return result_data
def collect_pypi(self, name):
    """Collect plain text description from PyPI for the given package.

    :param name: package name for which the plain text description should be gathered
    :return: plain text description
    """
    url = self._PYPI_PACKAGE_URL.format(package=name)
    content = self._scrape_page(url).find(class_='project-description')
    if not content:
        raise FatalTaskError("No content was found at '%s' for PyPI package '%s'"
                             % (url, name))

    # Remove content that is automatically added by PyPI - it sits at the
    # bottom and carries info extracted from setup.py. We already keep that
    # data, so drop the duplication here.
    nodot = content.find(class_='nodot')
    if nodot:
        nodot.decompose()

    return content.text
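# A minimal, runnable sketch of the scraping pattern above (bs4 and lxml
# assumed to be installed); the HTML fragment is hypothetical, loosely shaped
# like a PyPI project page.
from bs4 import BeautifulSoup

_html = ('<div class="project-description">A useful package.'
         '<div class="nodot">auto-generated setup.py info</div></div>')
_content = BeautifulSoup(_html, 'lxml').find(class_='project-description')
_nodot = _content.find(class_='nodot')
if _nodot:
    _nodot.decompose()  # drop the auto-generated block, as done above
print(_content.text)  # -> 'A useful package.'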
def run(self, node_args):
    """To be transparently called by Selinon.

    Selinon transparently calls run(), which takes care of task audit and
    some additional checks and calls execute().
    """
    # SQS guarantees 'deliver at least once', so there could be multiple
    # messages of a type, give up immediately
    if self.storage and isinstance(self.storage, (BayesianPostgres, PackagePostgres)):
        if self.storage.get_worker_id_count(self.task_id) > 0:
            raise FatalTaskError("Task with ID '%s' was already processed" % self.task_id)

    start = datetime.utcnow()
    try:
        result = self.execute(node_args)
    finally:
        # remove all files that were downloaded for this task
        ObjectCache.wipe()
    end = datetime.utcnow()

    if result:
        # Ensure result complies with the defined schema (if any) before saving
        self.validate_result(result)

    if result is None:
        # Keep track of None results and add _audit and _release keys
        result = {}

    if self.add_audit_info:
        # `_audit` key is added to every analysis info submitted
        result['_audit'] = {
            'started_at': json_serial(start),
            'ended_at': json_serial(end),
            'version': 'v1'
        }

        ecosystem_name = node_args.get('ecosystem')
        result['_release'] = '{}:{}:{}'.format(ecosystem_name,
                                               node_args.get('name'),
                                               node_args.get('version'))
    return result
def run_scancode(scan_path):
    """Run scancode tool."""
    result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
    command = [path.join(configuration.SCANCODE_PATH, 'scancode'),
               # Scan for licenses
               '--license',
               # Do not return license matches with scores lower than this score
               '--license-score', configuration.SCANCODE_LICENSE_SCORE,
               # Files without findings are omitted
               '--only-findings',
               # Use n parallel processes
               '--processes', configuration.SCANCODE_PROCESSES,
               # Do not print summary or progress messages
               '--quiet',
               # Strip the root directory segment of all paths
               '--strip-root',
               # Stop scanning a file if scanning takes longer than a timeout in seconds
               '--timeout', configuration.SCANCODE_TIMEOUT,
               scan_path]
    for ignore_pattern in configuration.SCANCODE_IGNORE:
        command += ['--ignore', '{}'.format(ignore_pattern)]

    with username():
        tc = TimedCommand(command)
        status, output, error = tc.run(is_json=True, timeout=1200)
    if status != 0:
        raise FatalTaskError("Error (%s) during running command %s: %r"
                             % (str(status), command, error))

    details = LicenseCheckTask.process_output(output)
    result_data['details'] = details
    result_data['status'] = 'success'
    result_data['summary'] = {'sure_licenses': list(details['licenses'].keys())}
    return result_data
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))

    if arguments['ecosystem'] not in _SUPPORTED_ECOSYSTEMS:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    # Don't ingest private packages
    if not is_pkg_public(arguments['ecosystem'], arguments['name']):
        logger.info("Private package ingestion ignored %s %s",
                    arguments['ecosystem'], arguments['name'])
        raise NotABugFatalTaskError("Private package alert {} {}".format(
            arguments['ecosystem'], arguments['name']))

    return arguments
def validate_result(self, result):
    """Ensure that results comply with the task schema, if defined.

    Tasks define a schema by setting schema_ref appropriately.
    Schemas are retrieved from workers/schemas/generated via pkgutil.
    """
    # Skip validation if no schema is defined
    schema_ref = self.schema_ref
    if schema_ref is None:
        return

    # Load the schema if it is not yet loaded
    schema = self._schema
    if schema is None:
        schema = self._schema = load_worker_schema(schema_ref)

    # Validate the result against the schema
    try:
        jsonschema.validate(result, schema)
    except jsonschema.exceptions.ValidationError as e:
        raise FatalTaskError('Schema validation failed: {e}'.format(e=str(e)))

    # Record the validated schema details
    set_schema_ref(result, schema_ref)
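# A minimal, runnable sketch of the validation step above using the jsonschema
# library directly; the schema here is a hypothetical stand-in for a generated
# worker schema.
import jsonschema

_schema = {'type': 'object',
           'properties': {'status': {'type': 'string'}},
           'required': ['status']}
jsonschema.validate({'status': 'success'}, _schema)  # passes silently
try:
    jsonschema.validate({}, _schema)  # fails: 'status' is required
except jsonschema.exceptions.ValidationError as e:
    print('validation failed:', e.message)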
def run_gofedlib(self, topdir, name, version, timeout):
    """Run gofedlib-cli to extract dependencies from golang sources."""
    tc = TimedCommand(['gofedlib-cli', '--dependencies-main', '--dependencies-packages',
                       '--dependencies-test', '--skip-errors', topdir])
    status, data, err = tc.run(timeout=timeout)
    if status:
        raise FatalTaskError('gofedlib-cli failed: {err}'.format(err=err))

    result = json.loads(data[0])
    main_deps_count = len(result.get('deps-main', []))
    packages_count = len(result.get('deps-packages', []))
    self.log.debug('gofedlib found %i dependencies', main_deps_count + packages_count)

    result['code_repository'] = {'type': 'git',
                                 'url': 'https://{name}'.format(name=name)}
    result['name'] = name
    result['version'] = version
    return [{'ecosystem': 'gofedlib', 'result': result}]
def run_mercator(self, arguments, cache_path, keep_path=False, outermost_only=True,
                 timeout=300, resolve_poms=True):
    """Run the mercator tool (or gofedlib for Go sources)."""
    result_data = {'status': 'unknown', 'summary': [], 'details': []}
    mercator_target = arguments.get('cache_sources_path', cache_path)

    if arguments['ecosystem'] == 'go':
        # no Go support in mercator-go yet, we handle it separately here
        tc = TimedCommand(['gofedlib-cli', '--dependencies-main', '--dependencies-packages',
                           '--dependencies-test', '--skip-errors', mercator_target])
        status, data, err = tc.run(timeout=timeout)
    else:
        tc = TimedCommand(['mercator', mercator_target])
        update_env = {'MERCATOR_JAVA_RESOLVE_POMS': 'true'} if resolve_poms else {}
        status, data, err = tc.run(timeout=timeout, is_json=True, update_env=update_env)
    if status != 0:
        self.log.error(err)
        raise FatalTaskError(err)

    ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
    if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
        # TODO: attempt static setup.py parsing with mercator
        items = [self._merge_python_items(mercator_target, data)]
    elif arguments['ecosystem'] == 'go':
        result = {'result': json.loads(data[0])}  # the data normalizer expects this
        result['ecosystem'] = 'gofedlib'
        # we only support git for now
        result['result']['code_repository'] = {
            'type': 'git',
            'url': 'https://{name}'.format(name=arguments.get('name'))
        }
        result['result']['name'] = arguments.get('name')
        result['result']['version'] = arguments.get('version')
        items = [result]
        main_deps_count = len(result['result'].get('deps-main', []))
        packages_count = len(result['result'].get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies', main_deps_count + packages_count)
    else:
        if outermost_only:
            # process only root level manifests (or the ones closest to the root level)
            items = self._data_normalizer.get_outermost_items(data.get('items') or [])
        else:
            items = data.get('items') or []
        self.log.debug('mercator found %i projects, outermost %i', len(data), len(items))

        if ecosystem_object.is_backed_by(EcosystemBackend.maven):
            # for maven we download both the jar and the pom; we consider the pom to be *the*
            # source of information and don't want to duplicate info by including
            # data from the pom included in the artifact (assuming it's included)
            items = [d for d in items if d['ecosystem'].lower() == 'java-pom']

    result_data['details'] = [self._data_normalizer.handle_data(d, keep_path=keep_path)
                              for d in items]
    result_data['status'] = 'success'
    return result_data
def _run_owasp_dep_check(self, scan_path, experimental=False):
    """Run OWASP Dependency-Check."""
    def _clean_dep_check_tmp():
        for dcdir in glob(os.path.join(gettempdir(), 'dctemp*')):
            rmtree(dcdir)

    s3 = StoragePool.get_connected_storage('S3VulnDB')
    depcheck = configuration.dependency_check_script_path
    with TemporaryDirectory() as temp_data_dir:
        if not s3.retrieve_depcheck_db_if_exists(temp_data_dir):
            self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
            self.update_depcheck_db_on_s3()
            s3.retrieve_depcheck_db_if_exists(temp_data_dir)

        report_path = os.path.join(temp_data_dir, 'report.xml')
        command = [depcheck,
                   '--noupdate',
                   '--format', 'XML',
                   '--project', 'CVEcheckerTask',
                   '--data', temp_data_dir,
                   '--scan', scan_path,
                   '--out', report_path]
        if experimental:
            command.extend(['--enableExperimental'])
        for suppress_xml in glob(os.path.join(os.environ['OWASP_DEP_CHECK_SUPPRESS_PATH'],
                                              '*.xml')):
            command.extend(['--suppress', suppress_xml])

        output = []
        old_java_opts = os.getenv('JAVA_OPTS', '')
        try:
            self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities',
                           scan_path)
            os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
            output = TimedCommand.get_command_output(command, graceful=False,
                                                     timeout=600)  # 10 minutes
            with open(report_path) as r:
                report_dict = anymarkup.parse(r.read())
        except (TaskError, FileNotFoundError) as e:
            _clean_dep_check_tmp()
            for line in output:
                self.log.warning(line)
            self.log.exception(str(e))
            raise FatalTaskError('OWASP Dependency-Check scan failed') from e
        finally:
            os.environ['JAVA_OPTS'] = old_java_opts
        _clean_dep_check_tmp()

    results = []
    dependencies = report_dict.get('analysis', {}).get('dependencies')  # value can be None
    dependencies = dependencies.get('dependency', []) if dependencies else []
    if not isinstance(dependencies, list):
        dependencies = [dependencies]
    for dependency in dependencies:
        vulnerabilities = dependency.get('vulnerabilities')  # value can be None
        vulnerabilities = vulnerabilities.get('vulnerability', []) if vulnerabilities else []
        if not isinstance(vulnerabilities, list):
            vulnerabilities = [vulnerabilities]
        for vulnerability in vulnerabilities:
            av = vulnerability.get('cvssAccessVector')
            av = av[0] if av else '?'
            ac = vulnerability.get('cvssAccessComplexity')
            ac = ac[0] if ac else '?'
            au = vulnerability.get('cvssAuthenticationr')
            au = au[0] if au else '?'
            c = vulnerability.get('cvssConfidentialImpact')
            c = c[0] if c else '?'
            i = vulnerability.get('cvssIntegrityImpact')
            i = i[0] if i else '?'
            a = vulnerability.get('cvssAvailabilityImpact')
            a = a[0] if a else '?'
            vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".\
                format(AV=av, AC=ac, Au=au, C=c, Integrity=i, A=a)
            result = {
                'cvss': {
                    'score': vulnerability.get('cvssScore'),
                    'vector': vector
                }
            }
            references = vulnerability.get('references', {}).get('reference', [])
            if not isinstance(references, list):
                references = [references]
            result['references'] = [r.get('url') for r in references]
            for field in ['severity', 'description']:
                result[field] = vulnerability.get(field)
            result['id'] = vulnerability.get('name')
            results.append(result)

    return {'summary': [r['id'] for r in results],
            'status': 'success',
            'details': results}
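# Standalone illustration of the CVSS v2 vector assembly above, fed with a
# hypothetical vulnerability record shaped like the parsed XML report (the
# odd 'cvssAuthenticationr' spelling mirrors the report key used above).
_vuln = {'cvssAccessVector': 'NETWORK', 'cvssAccessComplexity': 'LOW',
         'cvssAuthenticationr': 'NONE', 'cvssConfidentialImpact': 'PARTIAL',
         'cvssIntegrityImpact': 'PARTIAL', 'cvssAvailabilityImpact': 'PARTIAL'}
_fields = ['cvssAccessVector', 'cvssAccessComplexity', 'cvssAuthenticationr',
           'cvssConfidentialImpact', 'cvssIntegrityImpact', 'cvssAvailabilityImpact']
# each metric contributes only its first letter; '?' stands in for a missing value
av, ac, au, c, i, a = ((_vuln.get(f) or '?')[0] for f in _fields)
print("AV:{}/AC:{}/Au:{}/C:{}/I:{}/A:{}".format(av, ac, au, c, i, a))
# -> AV:N/AC:L/Au:N/C:P/I:P/A:P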
def execute(self, arguments):
    """Run mercator on the submitted manifest files and resolve their dependencies."""
    self._strict_assert(arguments.get('manifest'))
    self._strict_assert(arguments.get('user_profile'))

    user_profile = arguments['user_profile']
    self.store_in_bucket(user_profile)

    # If we receive a manifest file we need to save it first
    result = []
    for manifest in arguments['manifest']:
        temp_path = mkdtemp()

        with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
            fd.write(manifest['content'])

        # mercator-go does not work if there is no package.json
        if 'shrinkwrap' in manifest['filename'].lower():
            with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                f.write(json.dumps({}))

        # TODO: this is a workaround since stack analysis is not handled by the
        # dispatcher, so we create the instance manually for now
        subtask = MercatorTask(None, None, None, None, None)
        # since we're creating MercatorTask dynamically in code, we need to make sure
        # that it has storage; storage is assigned to tasks dynamically based on task_name
        subtask.task_name = self.task_name
        arguments['ecosystem'] = manifest['ecosystem']
        out = subtask.run_mercator(arguments, temp_path)

        if temp_path:
            rmtree(temp_path, ignore_errors=True)

        if not out["details"]:
            raise FatalTaskError("No metadata found processing manifest file '{}'"
                                 .format(manifest['filename']))
        out["details"][0]['manifest_file'] = manifest['filename']
        out["details"][0]['ecosystem'] = manifest['ecosystem']

        # If we're handling an external request we need to convert dependency specifications
        # to concrete versions that we can query later on in the `AggregatorTask`
        manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
        if 'external_request_id' in arguments:
            if manifest_descriptor.has_resolved_deps:
                # npm-shrinkwrap.json, pom.xml, requirements.txt
                if "_dependency_tree_lock" in out["details"][0]:
                    # npm-shrinkwrap.json, requirements.txt
                    manifest_dependencies = \
                        out["details"][0]["_dependency_tree_lock"]["dependencies"]
                else:  # pom.xml
                    manifest_dependencies = out["details"][0]["dependencies"]

                if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                    def _flatten(deps, collect):
                        for dep in deps:
                            collect.append({'package': dep['name'],
                                            'version': dep['version']})
                            _flatten(dep.get('dependencies', []), collect)
                    resolved_deps = []
                    _flatten(manifest_dependencies, resolved_deps)
                else:  # pom.xml, requirements.txt
                    resolved_deps = \
                        [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                         for x in manifest_dependencies]
            else:  # package.json
                resolved_deps = self._handle_external_deps(
                    self.storage.get_ecosystem(arguments['ecosystem']),
                    out["details"][0]["dependencies"])
            out["details"][0]['_resolved'] = resolved_deps
        result.append(out)

    return {'result': result, 'user_profile': user_profile}
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))
    self._strict_assert(isinstance(arguments.get('version'), str))

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    # make sure we store the package name in its normalized form
    arguments['name'] = normalize_package_name(ecosystem.backend.name, arguments['name'])

    if len(pattern_ignore.findall(arguments['version'])) > 0:
        self.log.info("Incorrect version alert {} {}".format(arguments['name'],
                                                             arguments['version']))
        raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))

    # Don't try ingestion for private packages
    if is_pkg_public(arguments['ecosystem'], arguments['name']):
        self.log.info("Ingestion flow for {} {}".format(arguments['ecosystem'],
                                                        arguments['name']))
    else:
        self.log.info("Private package ingestion ignored {} {}".format(
            arguments['ecosystem'], arguments['name']))
        raise NotABugFatalTaskError("Private package alert {} {}".format(
            arguments['ecosystem'], arguments['name']))

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            arguments['analysis_already_exists'] = True
            self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)
    npm_dir = self.configuration.NPM_DATA_DIR

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                        .format(n=arguments.get('name'),
                                v=arguments.get('version'),
                                err=str(e)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up the cache
        shutil.rmtree(cache_path)
        if arguments['ecosystem'] == "npm":
            shutil.rmtree(npm_dir, True)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id
    # export the ecosystem backend so we can use it to easily control the flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('data'))
    self._strict_assert(arguments.get('external_request_id'))

    db = self.storage.session
    try:
        results = db.query(StackAnalysisRequest)\
            .filter(StackAnalysisRequest.id == arguments.get('external_request_id'))\
            .first()
    except SQLAlchemyError:
        db.rollback()
        raise

    manifests = []
    if results is not None:
        row = results.to_dict()
        request_json = row.get("requestJson", {})
        manifests = request_json.get('manifest', [])

    # If we receive a manifest file we need to save it first
    result = []
    for manifest in manifests:
        with TemporaryDirectory() as temp_path:
            with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                fd.write(manifest['content'])

            # mercator-go does not work if there is no package.json
            if 'shrinkwrap' in manifest['filename'].lower():
                with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                    f.write(json.dumps({}))

            # Create the instance manually since stack analysis is not handled
            # by the dispatcher
            subtask = MercatorTask.create_test_instance(task_name=self.task_name)
            arguments['ecosystem'] = manifest['ecosystem']
            out = subtask.run_mercator(arguments, temp_path)

            if not out["details"]:
                raise FatalTaskError("No metadata found processing manifest file '{}'"
                                     .format(manifest['filename']))
            if 'dependencies' not in out['details'][0] and out.get('status', None) == 'success':
                raise FatalTaskError(
                    "Dependencies could not be resolved from manifest file '{}'"
                    .format(manifest['filename']))
            out["details"][0]['manifest_file'] = manifest['filename']
            out["details"][0]['ecosystem'] = manifest['ecosystem']
            out["details"][0]['manifest_file_path'] = manifest.get(
                'filepath', 'File path not available')

            # If we're handling an external request we need to convert dependency
            # specifications to concrete versions that we can query later on in
            # the `AggregatorTask`
            manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
            if 'external_request_id' in arguments:
                manifest_dependencies = []
                if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                    if "_dependency_tree_lock" in out["details"][0]:  # npm-shrinkwrap.json
                        manifest_dependencies = \
                            out["details"][0]["_dependency_tree_lock"].get("dependencies", [])
                    else:  # pom.xml
                        manifest_dependencies = out["details"][0].get("dependencies", [])

                    if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                        def _flatten(deps, collect):
                            for dep in deps:
                                collect.append({'package': dep['name'],
                                                'version': dep['version']})
                                _flatten(dep.get('dependencies', []), collect)
                        resolved_deps = []
                        _flatten(manifest_dependencies, resolved_deps)
                    else:  # pom.xml
                        resolved_deps = \
                            [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                             for x in manifest_dependencies]
                else:  # package.json, requirements.txt
                    resolved_deps = self._handle_external_deps(
                        self.storage.get_ecosystem(arguments['ecosystem']),
                        out["details"][0]["dependencies"])
                out["details"][0]['_resolved'] = resolved_deps
            result.append(out)

    return {'result': result}
def execute(self, arguments, db, manifests, source=None):
    """Dependency finder logic."""
    # TODO: reduce cyclomatic complexity
    # If we receive a manifest file we need to save it first
    result = []
    for manifest in manifests:
        content_hash = None
        if source == 'osio':
            content_hash = generate_content_hash(manifest['content'])
            current_app.logger.info("{} file digest is {}".format(manifest['filename'],
                                                                  content_hash))

            s3 = AmazonS3(bucket_name='boosters-manifest')
            try:
                s3.connect()
                manifest['content'] = s3.retrieve_blob(content_hash).decode('utf-8')
            except ClientError as e:
                current_app.logger.error("Unexpected error while retrieving S3 data: %s" % e)
                raise

        with TemporaryDirectory() as temp_path:
            with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                fd.write(manifest['content'])

            # mercator-go does not work if there is no package.json
            if 'shrinkwrap' in manifest['filename'].lower():
                with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                    f.write(json.dumps({}))

            # Create the instance manually since stack analysis is not handled
            # by the dispatcher
            subtask = MercatorTask.create_test_instance(task_name='metadata')
            arguments['ecosystem'] = manifest['ecosystem']
            out = subtask.run_mercator(arguments, temp_path, resolve_poms=False)

            if not out["details"]:
                raise FatalTaskError("No metadata found processing manifest file '{}'"
                                     .format(manifest['filename']))
            if 'dependencies' not in out['details'][0] and out.get('status', None) == 'success':
                raise FatalTaskError(
                    "Dependencies could not be resolved from manifest file '{}'"
                    .format(manifest['filename']))
            out["details"][0]['manifest_file'] = manifest['filename']
            out["details"][0]['ecosystem'] = manifest['ecosystem']
            out["details"][0]['manifest_file_path'] = manifest.get(
                'filepath', 'File path not available')

            # If we're handling an external request we need to convert dependency
            # specifications to concrete versions that we can query later on in
            # the `AggregatorTask`
            manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
            if 'external_request_id' in arguments:
                manifest_dependencies = []
                if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                    if "_dependency_tree_lock" in out["details"][0]:  # npm-shrinkwrap.json
                        manifest_dependencies = \
                            out["details"][0]["_dependency_tree_lock"].get("dependencies", [])
                    else:  # pom.xml
                        manifest_dependencies = out["details"][0].get("dependencies", [])

                    if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                        def _flatten(deps, collect):
                            for dep in deps:
                                collect.append({'package': dep['name'],
                                                'version': dep['version']})
                                _flatten(dep.get('dependencies', []), collect)
                        resolved_deps = []
                        _flatten(manifest_dependencies, resolved_deps)
                    else:  # pom.xml
                        resolved_deps = \
                            [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                             for x in manifest_dependencies]
                else:  # package.json, requirements.txt
                    resolved_deps = self._handle_external_deps(
                        Ecosystem.by_name(db, arguments['ecosystem']),
                        out["details"][0]["dependencies"])
                out["details"][0]['_resolved'] = resolved_deps
            result.append(out)

    return {'result': result}
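# Standalone, runnable sketch of the _flatten helper used above; the sample
# tree mimics an npm-shrinkwrap dependency structure (hypothetical data).
def _flatten_demo(deps, collect):
    for dep in deps:
        collect.append({'package': dep['name'], 'version': dep['version']})
        _flatten_demo(dep.get('dependencies', []), collect)

_sample = [{'name': 'express', 'version': '4.18.2',
            'dependencies': [{'name': 'accepts', 'version': '1.3.8'}]},
           {'name': 'lodash', 'version': '4.17.21'}]
_resolved = []
_flatten_demo(_sample, _resolved)
print(_resolved)
# -> [{'package': 'express', 'version': '4.18.2'},
#     {'package': 'accepts', 'version': '1.3.8'},
#     {'package': 'lodash', 'version': '4.17.21'}]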
def _strict_assert(self, assert_cond):
    """Assert on condition; raise FatalTaskError so the task is not retried."""
    if not assert_cond:
        raise FatalTaskError("Strict assert failed.")
def run_mercator(self, arguments, cache_path, keep_path=False, outermost_only=True,
                 timeout=300, resolve_poms=True):
    """Run mercator tool."""
    # TODO: reduce cyclomatic complexity
    result_data = {'status': 'unknown', 'summary': [], 'details': []}
    mercator_target = arguments.get('cache_sources_path', cache_path)

    tc = TimedCommand(['mercator', mercator_target])
    update_env = {'MERCATOR_JAVA_RESOLVE_POMS': 'true'} if resolve_poms else {}
    status, data, err = tc.run(timeout=timeout, is_json=True, update_env=update_env)
    if status != 0:
        self.log.error(err)
        raise FatalTaskError(err)

    ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
    if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
        # TODO: attempt static setup.py parsing with mercator
        items = [self._merge_python_items(mercator_target, data)]
        if items == [None]:
            raise NotABugFatalTaskError(
                'Found no usable PKG-INFO/metadata.json/requirements.txt')
    else:
        if outermost_only:
            # process only root level manifests (or the ones closest to the root level)
            items = self._data_normalizer.get_outermost_items(data.get('items') or [])
        else:
            items = data.get('items') or []
        self.log.debug('mercator found %i projects, outermost %i', len(data), len(items))

        if ecosystem_object.is_backed_by(EcosystemBackend.maven):
            # for maven we download both the jar and the pom; we consider the pom to be *the*
            # source of information and don't want to duplicate info by including
            # data from the pom included in the artifact (assuming it's included)
            items = [d for d in items if d['ecosystem'].lower() == 'java-pom']
        elif ecosystem_object.is_backed_by(EcosystemBackend.npm):
            # ignore other metadata files, e.g. requirements.txt
            items = [d for d in items if d['ecosystem'].lower() == 'npm']
        elif arguments['ecosystem'] == 'go':
            items = [d for d in items if d['ecosystem'].lower() == 'go-glide']
            if not items:
                # Mercator found no Go Glide files, run gofedlib
                items = self.run_gofedlib(topdir=mercator_target,
                                          name=arguments.get('name'),
                                          version=arguments.get('version'),
                                          timeout=timeout)

    result_data['details'] = [self._data_normalizer.handle_data(d, keep_path=keep_path)
                              for d in items]
    result_data['status'] = 'success'
    return result_data
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('ecosystem'))

    # make sure we store the package name based on ecosystem package naming case sensitivity
    arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we scale and there are 2+ workers
        # running this task they can potentially schedule two flows of the
        # same type at the same time
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            # we need to propagate flags that were passed to the flow, but not
            # E/P/V - this way we are sure that, for example, the graph import
            # is scheduled (arguments['force_graph_sync'] == True)
            arguments.pop('name')
            arguments.pop('version')
            arguments.pop('ecosystem')
            self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                        .format(n=arguments.get('name'),
                                v=arguments.get('version'),
                                err=str(e)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up the cache
        shutil.rmtree(cache_path)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id
    # export the ecosystem backend so we can use it to easily control the flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments