class LibrariesIoTask(BaseTask):
    """Collects statistics from Libraries.io."""

    _analysis_name = "libraries_io"
    schema_ref = SchemaRef(_analysis_name, '2-0-0')

    @staticmethod
    def recent_releases(versions, count=10):
        """Sort versions by 'published_at' and return 'count' latest."""
        return sorted(versions, key=itemgetter('published_at'))[-count:]

    def execute(self, arguments):
        """Task entrypoint."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        name = arguments['name']
        ecosystem = arguments['ecosystem']
        if ecosystem == 'go':
            name = quote(name, safe='')

        project_url = self.configuration.libraries_io_project_url(ecosystem, name)
        project = get_response(project_url)
        versions = project['versions']
        details = {'dependent_repositories': {'count': project['dependent_repos_count']},
                   'dependents': {'count': project['dependents_count']},
                   'releases': {'count': len(versions),
                                'recent': self.recent_releases(versions)}}
        return {'status': 'success', 'summary': [], 'details': details}
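# A minimal, self-contained sketch of what `recent_releases` does, using made-up
# version records (the real data comes from the Libraries.io API response):
from operator import itemgetter

sample_versions = [
    {'number': '1.0.0', 'published_at': '2017-01-01T00:00:00Z'},
    {'number': '1.2.0', 'published_at': '2017-06-01T00:00:00Z'},
    {'number': '1.1.0', 'published_at': '2017-03-01T00:00:00Z'},
]
# ISO-8601 timestamps sort correctly as plain strings, so a lexicographic
# sort is enough; the slice keeps the `count` most recent entries.
latest = sorted(sample_versions, key=itemgetter('published_at'))[-2:]
assert [v['number'] for v in latest] == ['1.1.0', '1.2.0']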
class OSCryptoCatcherTask(BaseTask):
    """Run the oscryptocatcher tool for matching crypto algorithms."""

    _analysis_name = 'crypto_algorithms'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        results = {'status': 'unknown', 'summary': {}, 'details': []}

        try:
            oscc = TimedCommand.get_command_output(
                ['oscryptocatcher', '--subdir-in-result', cache_path],
                graceful=False, is_json=True)
            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception:
            results['status'] = 'error'

        return results
class ComponentAnalyses(ResourceWithSchema):
    method_decorators = [login_required]

    schema_ref = SchemaRef('analyses_graphdb', '1-2-0')

    @staticmethod
    def get(ecosystem, package, version):
        decoded = decode_token()
        if ecosystem == 'maven':
            package = MavenCoordinates.normalize_str(package)
        package = case_sensitivity_transform(ecosystem, package)
        result = get_analyses_from_graph(ecosystem, package, version)

        if result is not None:
            # Known component for Bayesian
            server_create_component_bookkeeping(ecosystem, package, version, decoded)
            return result

        if os.environ.get("INVOKE_API_WORKERS", "") == "1":
            # Enter the unknown path
            server_create_analysis(ecosystem, package, version, user_profile=decoded,
                                   api_flow=True, force=False, force_graph_sync=True)
            msg = "Package {ecosystem}/{package}/{version} is unavailable. " \
                  "The package will be available shortly, please retry after some time." \
                  .format(ecosystem=ecosystem, package=package, version=version)
            raise HTTPError(202, msg)
        else:
            server_create_analysis(ecosystem, package, version, user_profile=decoded,
                                   api_flow=False, force=False, force_graph_sync=True)
            msg = "No data found for {ecosystem} package {package}/{version}" \
                  .format(ecosystem=ecosystem, package=package, version=version)
            raise HTTPError(404, msg)
class BlackDuckRelease(object):
    """A release: its version string, unique identifier, and the
    `datetime.datetime` when this particular version was released.
    """

    @schema.input(SchemaRef("blackduck-release", "1-0-0"))
    def __init__(self, json_data, project):
        self._version = json_data['version']
        self._id = json_data['versionId']
        self._released_at = datetime.strptime(json_data['releasedOn'],
                                              "%Y-%m-%dT%H:%M:%S.%fZ")
        self._project = project

    @property
    def project(self):
        return self._project

    @property
    def version(self):
        """Release version."""
        return self._version

    @property
    def id(self):
        """Unique identifier."""
        return self._id

    @property
    def released_at(self):
        """Release date and time."""
        return self._released_at
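# A quick check of the `releasedOn` timestamp format used above; the sample
# value is made up, but it matches the "%Y-%m-%dT%H:%M:%S.%fZ" pattern the
# constructor expects:
from datetime import datetime

released = datetime.strptime("2016-05-17T13:45:30.000Z", "%Y-%m-%dT%H:%M:%S.%fZ")
assert (released.year, released.month, released.day) == (2016, 5, 17)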
class LinguistTask(BaseTask):
    """GitHub's tool to figure out what language is used in code."""

    _analysis_name = 'languages'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _parse_linguist(self, output):
        if not output:
            return None

        def extract_value(line):
            """Extract the language name: `language: Python` -> `Python`."""
            return line.split(':', 1)[1].strip()

        lines_matcher = re.compile(r'(\d+) lines \((\d+) sloc\)')
        m = lines_matcher.search(output[0])
        lines, sloc = 0, 0
        if m:
            # group(1) is the total line count, group(2) the SLOC count
            lines, sloc = int(m.group(1)), int(m.group(2))

        tml = zip(['type', 'mime', 'language'],
                  [extract_value(l) for l in output[1:4]])
        data = dict(tml, lines=lines, sloc=sloc)
        return data

    def execute(self, arguments):
        """Start the task."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        results = []
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        def worker(path):
            mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
            self.log.debug("%s mime = %s", path, mime)
            typ = TimedCommand.get_command_output(['file', path, '-b'])
            self.log.debug("%s filetype = %s", path, typ)

            linguist = None
            if 'charset=binary' not in mime:
                linguist = self._parse_linguist(
                    TimedCommand.get_command_output(['linguist', path]))
                self.log.debug("%s linguist output = %s", path, linguist)

            results.append({
                "type": typ,
                "output": linguist,
                "path": os.path.relpath(path, cache_path),
            })

        with ThreadPool(target=worker) as tp:
            for path in get_all_files_from(cache_path, path_filter=skip_git_files):
                tp.add_task(path)

        return {'summary': [], 'status': 'success', 'details': results}
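# A self-contained sketch of the parsing done in `_parse_linguist`, run on
# made-up linguist output (a header line followed by type/mime/language lines):
import re

sample = [
    "120 lines (95 sloc)  3.2 KB",
    "type:     Text",
    "mime:     text/x-python",
    "language: Python",
]
m = re.search(r'(\d+) lines \((\d+) sloc\)', sample[0])
assert (int(m.group(1)), int(m.group(2))) == (120, 95)
assert sample[3].split(':', 1)[1].strip() == 'Python'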
class DigesterTask(BaseTask):
    """Computes various digests of all files found in target cache path."""

    _analysis_name = 'digests'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def compute_ssdeep(self, target):
        """Compute SSdeep piece-wise linear hash of target."""
        # 0 : ssdeep header
        # 1 : hash,filename
        data = TimedCommand.get_command_output(['ssdeep', '-c', '-s', target])
        try:
            return data[1].split(',')[0].strip()
        except IndexError as exc:
            self.log.error("unable to compute ssdeep of %r", target)
            raise RuntimeError("can't compute digest of %r" % target) from exc

    def compute_digests(self, cache_path, f, artifact=False):
        """Compute digests of tarball f."""
        f_digests = {
            'sha256': compute_digest(f, 'sha256'),
            'sha1': compute_digest(f, 'sha1'),
            'md5': compute_digest(f, 'md5'),
            'ssdeep': self.compute_ssdeep(f)
        }

        if artifact:
            f_digests['artifact'] = True
            f_digests['path'] = os.path.basename(f)
        else:
            f_digests['path'] = os.path.relpath(f, cache_path)

        return f_digests

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        epv_cache = ObjectCache.get_from_dict(arguments)
        # cache_path = epv_cache.get_extracted_source_tarball()

        results = []
        # We don't compute digests of files in extracted tarball, only the tarball itself
        # for f in get_all_files_from(cache_path, path_filter=skip_git_files):
        #     results.append(self.compute_digests(cache_path, f))

        source_tarball_path = epv_cache.get_source_tarball()
        # Compute digests of the tarball and mark it as an artifact
        results.append(self.compute_digests(None, source_tarball_path, artifact=True))

        return {'summary': [], 'status': 'success', 'details': results}
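# `compute_digest` is imported from elsewhere in the codebase; a minimal
# stand-in with the same apparent (path, algorithm) -> hex digest contract
# could look like this (an assumption, not the project's actual helper):
import hashlib

def compute_digest_sketch(path, algorithm):
    h = hashlib.new(algorithm)            # e.g. 'sha256', 'sha1', 'md5'
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)               # stream the file to bound memory use
    return h.hexdigest()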
class LibrariesIoTask(BaseTask):
    """Collects statistics from Libraries.io."""

    _analysis_name = "libraries_io"
    schema_ref = SchemaRef(_analysis_name, '2-0-0')

    @staticmethod
    def recent_releases(versions, count=10):
        """Sort versions by 'published_at' and return 'count' latest."""
        version_list = sorted(versions, key=itemgetter('published_at'))[-count:]
        final_ver_list = []
        for ver in version_list:
            tmp = {
                "number": ver['number'],
                "published_at": ver['published_at']
            }
            final_ver_list.append(tmp)
        return final_ver_list

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        rdb_session = StoragePool.get_connected_storage('BayesianPostgres').session

        name = arguments['name']
        ecosystem = arguments['ecosystem']
        if ecosystem == 'go':
            name = quote(name, safe='')

        project_url = self.configuration.libraries_io_project_url(
            Ecosystem.by_name(rdb_session, ecosystem), name)
        project = get_response(project_url)
        versions = project['versions']
        details = {
            'dependent_repositories': {
                'count': project['dependent_repos_count']
            },
            'dependents': {
                'count': project['dependents_count']
            },
            'releases': {
                'count': len(versions),
                'recent': self.recent_releases(versions)
            }
        }
        return {'status': 'success', 'summary': [], 'details': details}
def test_schema_lookup(self, tmpdir):
    library = SchemaLibrary(str(tmpdir))
    requested_schema = SchemaRef("example", "1-0-0")
    with pytest.raises(SchemaLookupError):
        library.load_schema(requested_schema)
    schema_path = tmpdir.join("example-v1-0-0.schema.json")
    dummy_schema = {"dummy-schema": "example"}
    serialized_schema = json.dumps(dummy_schema).encode('utf-8')
    schema_path.write_binary(serialized_schema)
    assert library.read_binary_schema(requested_schema) == serialized_schema
    assert library.load_schema(requested_schema) == dummy_schema
class KeywordsTaggingTask(KeywordsTaggingTaskBase):
    """Compute tags based on gathered natural text - package-version level keywords."""

    _analysis_name = 'keywords_tagging'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _package_version_level_keywords(self, keywords_file_name, stopwords_file_name,
                                        arguments):
        """Compute package-version level keywords from metadata."""
        # Keep f8a_tagger import local as other components dependent on
        # f8a_worker do not require it installed.
        from f8a_tagger import lookup_text as keywords_lookup_text

        details = {}
        if 'metadata' in self.parent.keys():
            details['description'] = {}
            metadata = self.parent_task_result('metadata')
            description = metadata.get('details', [{}])[0].get('description', '')
            if description:
                self.log.debug("Computing keywords from description: '%s'", description)
                details['description'] = keywords_lookup_text(
                    description,
                    keywords_file=keywords_file_name,
                    stopwords_file=stopwords_file_name,
                    **self._LOOKUP_CONF)

            # explicitly gather keywords declared by the publisher
            self.log.debug("Aggregating explicitly stated keywords by publisher")
            details['keywords'] = metadata.get('details', [{}])[0].get('keywords', [])

        return details

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        keywords_file_name, stopwords_file_name = self._get_config_files(
            arguments['ecosystem'])
        details = self._package_version_level_keywords(keywords_file_name,
                                                       stopwords_file_name,
                                                       arguments)

        return {'status': 'success', 'summary': [], 'details': details}
def test_bundled_schema_lookup(self, tmpdir):
    pkgdir = tmpdir.mkdir(tmpdir.basename)
    pkgdir.ensure("__init__.py")
    schemadir = pkgdir.mkdir("schemas")
    module = pkgdir.pyimport()
    library = BundledSchemaLibrary("schemas", module.__name__)
    requested_schema = SchemaRef("example", "1-0-0")
    with pytest.raises(SchemaLookupError):
        library.load_schema(requested_schema)
    schema_path = schemadir.join("example-v1-0-0.schema.json")
    dummy_schema = {"dummy-schema": "example"}
    serialized_schema = json.dumps(dummy_schema).encode('utf-8')
    schema_path.write_binary(serialized_schema)
    assert library.read_binary_schema(requested_schema) == serialized_schema
    assert library.load_schema(requested_schema) == dummy_schema
class BinwalkTask(BaseTask):
    """Find and extract interesting files / data from binary images."""

    _analysis_name = 'binary_data'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    @staticmethod
    def parse_binwalk(output):
        """Parse binwalk tool output and accumulate descriptions."""
        if not output:
            return None
        import re
        matcher = re.compile(r'^\d{,8}\s*0x[A-Fa-f0-9]{,8}\s*(.*)$')
        matched = []
        for line in output:
            match = matcher.match(line)
            if match:
                # keep the description column only
                matched.append(match.group(1))
        return matched

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        results = []
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            self.log.debug("path = %s", path)

            bw = TimedCommand(['binwalk', '-B', path])
            status, output, error = bw.run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            parsed_binwalk = self.parse_binwalk(output)
            results.append({
                "path": os.path.relpath(path, cache_path),
                "output": parsed_binwalk,
            })
        return {'summary': [], 'status': 'success', 'details': results}
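# The binwalk regex above, applied to a made-up line in binwalk's usual
# DECIMAL / HEXADECIMAL / DESCRIPTION column layout:
import re

line = "0         0x0             gzip compressed data, from Unix"
m = re.match(r'^\d{,8}\s*0x[A-Fa-f0-9]{,8}\s*(.*)$', line)
assert m.group(1).startswith('gzip compressed data')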
class StackAnalysesByGraphGET(ResourceWithSchema):
    method_decorators = [login_required]
    schema_ref = SchemaRef('stack_analyses', '2-1-4')

    @staticmethod
    def get(external_request_id):
        if get_request_count(rdb, external_request_id) < 1:
            raise HTTPError(404, "Invalid request ID '{t}'.".format(t=external_request_id))

        stack_result = retrieve_worker_result(rdb, external_request_id, "stack_aggregator")
        reco_result = retrieve_worker_result(rdb, external_request_id, "recommendation")

        if stack_result is None and reco_result is None:
            raise HTTPError(202, "Analysis for request ID '{t}' is in progress".format(
                t=external_request_id))

        if stack_result == -1 and reco_result == -1:
            raise HTTPError(404, "Worker result for request ID '{t}' doesn't exist yet".format(
                t=external_request_id))

        started_at = None
        finished_at = None
        manifest_response = []
        recommendations = {}

        if stack_result is not None and 'task_result' in stack_result:
            if stack_result["task_result"] is not None:
                started_at = stack_result["task_result"]["_audit"]["started_at"]
                finished_at = stack_result["task_result"]["_audit"]["ended_at"]
                manifest_response.append(stack_result["task_result"])

        if reco_result is not None and 'task_result' in reco_result:
            if reco_result["task_result"] is not None:
                recommendations = reco_result['task_result']

        return {
            "started_at": started_at,
            "finished_at": finished_at,
            "request_id": external_request_id,
            "result": manifest_response,
            "recommendation": recommendations
        }
class BlackDuckProject(object):
    """Project contains information about a specific {ecosystem}-{package} pair."""

    @schema.input(SchemaRef("blackduck-project", "1-0-0"))
    def __init__(self, json_data):
        self._source = json_data
        self._name = json_data['name']
        self._id = json_data['id']
        self._canonical_release_id = json_data['canonicalReleaseId']
        self._urls = {k: v for k, v in json_data.items() if k.endswith('Url')}

    @property
    def name(self):
        """Name of the project."""
        return self._name

    @property
    def id(self):
        """Unique identifier of the project."""
        return self._id

    @property
    def urls(self):
        """Additional URLs for this project, keyed by field name."""
        return self._urls

    @property
    def canonical_release_id(self):
        """Latest release for the given project (in terms of version number)."""
        return self._canonical_release_id

    @property
    def source(self):
        """Source JSON from which this object was parsed."""
        return self._source
def test_bundled_dynamic_schema_lookup(self, tmpdir, monkeypatch):
    """Tests for bundled dynamic schema lookup."""
    pkgdir = tmpdir.mkdir(tmpdir.basename)
    pkgdir.ensure("__init__.py")
    schemadir = pkgdir.mkdir("schemas")
    schemadir.ensure("__init__.py")
    library = BundledDynamicSchemaLibrary('.'.join([tmpdir.basename, "schemas"]))
    schema1 = SchemaRef("example", "1-0-0")
    schema2 = SchemaRef("example2", "1-0-0")
    schema3 = SchemaRef("example3", "1-0-0")
    schema4 = SchemaRef("example4", "1-0-0")
    schema5 = SchemaRef("example4", "2-0-0")  # intentionally example4
    schema6 = SchemaRef("example6", "2-0-0")
    with pytest.raises(SchemaImportError):
        library.load_schema_class_and_role(schema1)

    # sch2 doesn't have the ROLE_v1_0_0 variable
    sch2 = "import jsl;\nclass Schema(jsl.Document):\n x = jsl.StringField()\n"
    # sch3 doesn't have the THE_SCHEMA variable
    sch3 = sch2 + "\nROLE_v1_0_0 = 'v1-0-0'\n"
    # sch4 is ok
    sch4 = sch3 + "\nTHE_SCHEMA = Schema\n"
    # no sch5; sch6 is ok and has two roles
    sch6 = sch4 + "\nROLE_v2_0_0 = 'v2-0-0'\n"

    schemadir.join("example2.py").write(sch2)
    schemadir.join("example3.py").write(sch3)
    schemadir.join("example4.py").write(sch4)
    schemadir.join("example6.py").write(sch6)
    monkeypatch.syspath_prepend(pkgdir.dirname)

    with pytest.raises(SchemaModuleAttributeError):
        library.load_schema_class_and_role(schema2)
    with pytest.raises(SchemaModuleAttributeError):
        library.load_schema_class_and_role(schema3)

    klass, role = library.load_schema_class_and_role(schema4)
    assert "x" in dir(klass)
    assert role == "v1-0-0"

    with pytest.raises(SchemaModuleAttributeError):
        # example5 is the same as example4, but doesn't have the required version 2-0-0
        library.load_schema_class_and_role(schema5)

    klass6, role6 = library.load_schema_class_and_role(schema6)
    assert "x" in dir(klass6)
    assert role6 == "v2-0-0"
class BlackDuckTask(BaseTask):
    """Scan the package using Black Duck."""

    _analysis_name = 'blackduck'
    _valid_ecosystems = ["npm", "maven", "pypi"]
    _allow_cli_scan = True
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    _BLACKDUCK_CLI_TIMEOUT = 600

    def _format_hub_url(self):
        """Format Hub connection string from supplied config."""
        return "{scheme}://{host}:{port}/".format(scheme=self.configuration.BLACKDUCK_SCHEME,
                                                  host=self.configuration.BLACKDUCK_HOST,
                                                  port=self.configuration.BLACKDUCK_PORT)

    def _is_valid_ecosystem(self, ecosystem_id):
        """Determine whether the given ecosystem is valid for Black Duck analysis.

        :param ecosystem_id: int, the ID of the ecosystem
        :return: bool
        """
        return ecosystem_id in self._valid_ecosystems

    def _find_blackduck_cli_root(self):
        """Find the base directory where the BlackDuck CLI got extracted.

        :return: str, path to the CLI root
        """
        base = self.configuration.BLACKDUCK_PATH
        dirs = listdir(base)
        if not dirs:
            raise TaskError("Unable to find BlackDuck CLI directory")
        if len(dirs) > 1:
            raise TaskError("More than 1 BlackDuck CLI directory")
        return path.join(base, dirs.pop())

    def _prepare_command(self, project, version, archive):
        """Prepare the necessary CLI parameters.

        :param project: str, name of the project
        :param version: str, version of the release
        :param archive: str, path to the archive with the sources
        :return: List[str], command list ready to be run
        """
        binary = "{base}/{rel}".format(base=self._find_blackduck_cli_root(),
                                       rel="bin/scan.cli.sh")
        return [binary,
                "--host", self.configuration.BLACKDUCK_HOST,
                "--port", str(int(self.configuration.BLACKDUCK_PORT)),
                "--scheme", self.configuration.BLACKDUCK_SCHEME,
                "--username", self.configuration.BLACKDUCK_USERNAME,
                "--project", project,
                "--release", version,
                archive]

    def _get_release(self, hub, project, version):
        """Get release ID for given project version.

        :param hub: BlackDuckHub, hub object to use
        :param project: str, name of the project
        :param version: str, version
        :return: BlackDuckRelease object or None if not found
        """
        # check that the specified project exists
        proj = hub.find_project(project)
        if not proj:
            return None
        # check that we have the proper version
        releases = hub.get_releases(proj)
        return releases.get(version, None)

    def _release_data(self, hub, project, version):
        """Fetch release data for the given project and version.

        :param hub: BlackDuckHub, hub object to use
        :param project: str, name of the project
        :param version: str, version
        :return: dict, BoM information about the release
        """
        release = self._get_release(hub, project, version)
        if release is None:
            return None
        return hub.get_release_bom_json(release)

    def _get_hub(self):
        # connect to the Black Duck Hub
        hub_url = self._format_hub_url()
        self.log.debug("hub url: {url}".format(url=hub_url))
        hub = BlackDuckHub(hub_url)
        hub.connect_session(self.configuration.BLACKDUCK_USERNAME,
                            self.configuration.BLACKDUCK_PASSWORD)
        return hub

    def _get_project_name(self, arguments):
        return "{ecosystem}-{package}".format(ecosystem=arguments['ecosystem'],
                                              package=arguments['name'])

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        result_data = {'status': 'unknown', 'summary': [], 'details': {}}
        if self._is_valid_ecosystem(arguments['ecosystem']):
            hub = self._get_hub()

            # BlackDuck project doesn't have a notion of ecosystem, so we need to
            # namespace the project names ourselves; for package `crumb` in the NPM
            # ecosystem we'll end up with the name `npm-crumb`
            project = self._get_project_name(arguments)
            version = arguments['version']

            # Check if the given project had already been scanned
            data = self._release_data(hub, project, version)

            if not data and self._allow_cli_scan:
                self.log.debug("No data available for project {p} {v}".format(p=project,
                                                                               v=version))
                # No data available, issue a new scan and re-query release data
                source_tarball_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
                command = self._prepare_command(project, version, source_tarball_path)
                self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
                    timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command))
                bd = TimedCommand(command)
                status, output, error = bd.run(
                    timeout=self._BLACKDUCK_CLI_TIMEOUT,
                    update_env={'BD_HUB_PASSWORD': self.configuration.BLACKDUCK_PASSWORD})
                self.log.debug("status = %s, error = %s", status, error)
                self.log.debug("output = %s", output)
                data = self._release_data(hub, project, version)

            self.log.debug("Release data for project {p} {v}: {d}".format(p=project,
                                                                           v=version,
                                                                           d=data))
            result_data['details'] = data
            result_data['status'] = 'success' if data else 'error'
        else:
            result_data['status'] = 'error'

        return result_data
class GithubTask(BaseTask):
    """Collects statistics using the GitHub API."""

    _analysis_name = "github_details"
    schema_ref = SchemaRef(_analysis_name, '2-0-2')

    # used for testing
    _repo_name = None
    _repo_url = None

    _headers = {
        'Accept': 'application/vnd.github.mercy-preview+json, '  # for topics
                  'application/vnd.github.v3+json'  # recommended by GitHub for License API
    }

    @classmethod
    def create_test_instance(cls, repo_name, repo_url):
        """Create instance of task for tests."""
        assert cls
        instance = super().create_test_instance()
        # set for testing as we are not querying DB for mercator results
        instance._repo_name = repo_name
        instance._repo_url = repo_url
        return instance

    def _get_last_years_commits(self, repo_url):
        """Get weekly commit activity for the last year."""
        try:
            activity = get_response(urljoin(repo_url + '/', "stats/commit_activity"),
                                    self._headers)
        except NotABugTaskError as e:
            self.log.debug(e)
            return []
        return [x['total'] for x in activity]

    def _get_repo_stats(self, repo):
        """Collect various repository properties."""
        try:
            if repo.get('contributors_url', ''):
                contributors = get_response(repo.get('contributors_url', ''), self._headers)
            else:
                contributors = {}
        except NotABugTaskError as e:
            self.log.debug(e)
            contributors = {}

        d = {'contributors_count': len(list(contributors)) if contributors is not None
             else 'N/A'}
        for prop in REPO_PROPS:
            d[prop] = repo.get(prop, -1)
        return d

    def _get_repo_name(self, url):
        """Retrieve GitHub repo from a preceding Mercator scan."""
        parsed = parse_gh_repo(url)
        if not parsed:
            self.log.debug('Could not parse Github repo URL %s', url)
        else:
            self._repo_url = 'https://github.com/' + parsed
        return parsed

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        result_data = {'status': 'unknown', 'summary': [], 'details': {}}

        # For testing purposes, a repo may be specified at task creation time
        if self._repo_name is None:
            # Otherwise, get the repo name from earlier Mercator scan results
            self._repo_name = self._get_repo_name(arguments['url'])
            if self._repo_name is None:
                # Not a GitHub hosted project
                return result_data

        try:
            _, header = self.configuration.select_random_github_token()
            self._headers.update(header)
        except F8AConfigurationException as e:
            self.log.error(e)
            raise FatalTaskError from e

        repo_url = urljoin(self.configuration.GITHUB_API + "repos/", self._repo_name)
        try:
            repo = get_response(repo_url, self._headers)
        except NotABugTaskError as e:
            self.log.error(e)
            raise NotABugFatalTaskError from e

        result_data['status'] = 'success'

        issues = {}
        # Get repo statistics
        notoriety = self._get_repo_stats(repo)
        if notoriety:
            issues.update(notoriety)
        issues['topics'] = repo.get('topics', [])
        issues['license'] = repo.get('license') or {}

        # Get commit statistics
        last_year_commits = self._get_last_years_commits(repo['url'])
        commits = {'last_year_commits': {'sum': sum(last_year_commits),
                                         'weekly': last_year_commits}}

        t_stamp = datetime.datetime.utcnow()
        refreshed_on = {'updated_on': t_stamp.strftime("%Y-%m-%d %H:%M:%S")}
        issues.update(refreshed_on)
        issues.update(commits)
        result_data['details'] = issues

        return result_data
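# The commit-statistics call above hits GitHub's stats/commit_activity endpoint,
# which returns 52 weekly records shaped like {"total": n, "week": ts, "days": [...]}.
# Summing the weekly totals on a made-up, shortened response:
activity = [{'total': 3}, {'total': 0}, {'total': 7}]
last_year_commits = [week['total'] for week in activity]
assert sum(last_year_commits) == 10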
class MercatorTask(BaseTask):
    """Collects `Release` specific information from Mercator."""

    _analysis_name = 'metadata'
    _dependency_tree_lock = '_dependency_tree_lock'
    schema_ref = SchemaRef(_analysis_name, '3-3-0')
    _data_normalizer = DataNormalizer()

    def _parse_requires_txt(self, path):
        requires = []
        try:
            with open(path, 'r') as f:
                for line in f.readlines():
                    line = line.strip()
                    if line.startswith('['):
                        # the first named ini-like [section] ends the runtime requirements
                        break
                    elif line:
                        requires.append(line)
        except Exception as e:
            self.log.warning('Failed to process "{p}": {e}'.format(p=path, e=str(e)))

        return requires

    def _merge_python_items(self, topdir, data):
        # TODO: reduce cyclomatic complexity
        metadata_json = None
        pkg_info = None
        requirements_txt = None

        def get_depth(path):
            return path.rstrip('/').count('/')

        def is_deeper(item1, item2):
            """Return True if item1 is deeper in the directory hierarchy than item2."""
            if item1 is None:
                return True
            return get_depth(item1['path']) > get_depth(item2['path'])

        if not data.get('items'):
            return None

        # find the outermost PKG-INFO/metadata.json/requirements.txt - there can be
        # testing ones etc.
        for item in data['items']:
            if item['ecosystem'] == 'Python-Dist' and item['path'].endswith('.json'):
                if is_deeper(metadata_json, item):
                    metadata_json = item
            elif item['ecosystem'] == 'Python-Dist':  # PKG-INFO
                # we prefer PKG-INFO files from .egg-info directories,
                # since these have the very useful `requires.txt` next to them
                if pkg_info is None:
                    pkg_info = item
                else:
                    pkg_info_in_egg = pkg_info['path'].endswith('.egg-info/PKG-INFO')
                    item_in_egg = item['path'].endswith('.egg-info/PKG-INFO')
                    # rather than one insane condition, we use several less complex ones
                    if pkg_info_in_egg and item_in_egg and is_deeper(pkg_info, item):
                        # both are in .egg-info, but the current pkg_info is deeper
                        pkg_info = item
                    elif item_in_egg and not pkg_info_in_egg:
                        # item is in .egg-info and the current pkg_info is not
                        pkg_info = item
                    elif not (item_in_egg or pkg_info_in_egg) and is_deeper(pkg_info, item):
                        # neither is in .egg-info, but the current pkg_info is deeper
                        pkg_info = item
            elif item['ecosystem'] == 'Python-RequirementsTXT':
                if not requirements_txt or is_deeper(requirements_txt, item):
                    requirements_txt = item

        if pkg_info:
            self.log.info('Found PKG-INFO at {p}'.format(p=pkg_info['path']))
        if metadata_json:
            self.log.info('Found metadata.json at {p}'.format(p=metadata_json['path']))
        if requirements_txt:
            self.log.info('Found requirements.txt at {p}'.format(p=requirements_txt['path']))

        ret = None
        # figure out if this was packaged as wheel => metadata.json would
        # have depth of topdir + 2
        if metadata_json and get_depth(metadata_json['path']) == get_depth(topdir) + 2:
            self.log.info('Seems like this is wheel, using metadata.json ...')
            ret = metadata_json
        # figure out if this was packaged as sdist => PKG-INFO would
        # have depth of topdir + 3 (e.g. requests-2.18.1/requests.egg-info/PKG-INFO)
        # or topdir + 4 (e.g. pydocstyle-2.0.0/src/pydocstyle.egg-info/PKG-INFO)
        # or topdir + 5 (dxl-cluster-0.0.2/src/python/dxl_cluster.egg-info/PKG-INFO)
        # (and perhaps there are requires.txt or requirements.txt that we could use)
        # NOTE: for now, we always treat requirements.txt as requires_dist
        elif pkg_info and get_depth(pkg_info['path']) <= get_depth(topdir) + 5:
            self.log.info('Seems like this is sdist or egg, using PKG-INFO ...')
            requires_dist = []
            # in well-made sdists, there is a requires.txt next to PKG-INFO
            # (this is something different from requirements.txt)
            # TODO: maybe mercator could do this in future
            requires = os.path.join(os.path.dirname(pkg_info['path']), 'requires.txt')
            if os.path.exists(requires):
                self.log.info('Found a "requires.txt" file next to PKG-INFO, '
                              'going to use it ...')
                requires_dist = self._parse_requires_txt(requires)
            elif requirements_txt:
                self.log.info('No "requires.txt" file found next to PKG-INFO, '
                              'but requirements.txt found, going to use it')
                # if requires.txt can't be found, try requirements.txt
                requires_dist = requirements_txt['result']['dependencies']
            else:
                self.log.info('Found no usable source of requirements for PKG-INFO :(')
            pkg_info['result']['requires_dist'] = requires_dist
            ret = pkg_info
        elif requirements_txt:
            self.log.info('Only requirements.txt found, going to use it ...')
            requirements_txt['result']['requires_dist'] = \
                requirements_txt['result'].get('dependencies')
            ret = requirements_txt

        return ret

    def execute(self, arguments):
        """Execute mercator and convert its output to a JSON object."""
        self._strict_assert(arguments.get('ecosystem'))

        if 'url' in arguments:
            # run mercator on a git repo
            return self.run_mercator_on_git_repo(arguments)

        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        # TODO: make this less ugly; looks like we didn't get the abstraction quite right
        # when we were adding support for Java/Maven.
        if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by(
                EcosystemBackend.maven):
            # cache_path now points directly to the pom
            cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml()
        else:
            cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        return self.run_mercator(arguments, cache_path)

    def run_mercator_on_git_repo(self, arguments):
        """Clone the specified git url and run mercator on it."""
        self._strict_assert(arguments.get('url'))

        with TemporaryDirectory() as workdir:
            repo_url = arguments.get('url')
            repo = Git.clone(repo_url, path=workdir, depth=str(1))
            metadata = self.run_mercator(arguments, workdir, keep_path=True,
                                         outermost_only=False, timeout=900)
            if metadata.get('status', None) != 'success':
                self.log.error('Mercator failed on %s', repo_url)
                return None

            # add some auxiliary information so we can later find the manifest file
            head = repo.rev_parse(['HEAD'])[0]
            for detail in metadata['details']:
                path = detail['path'][len(workdir):]
                # path should look like this:
                # <git-sha1>/path/to/manifest.file
                detail['path'] = head + path

            return metadata

    def run_mercator(self, arguments, cache_path, keep_path=False, outermost_only=True,
                     timeout=300, resolve_poms=True):
        """Run mercator tool."""
        # TODO: reduce cyclomatic complexity
        result_data = {'status': 'unknown', 'summary': [], 'details': []}
        mercator_target = arguments.get('cache_sources_path', cache_path)
        tc = TimedCommand(['mercator', mercator_target])
        update_env = {'MERCATOR_JAVA_RESOLVE_POMS': 'true'} if resolve_poms else {}
        status, data, err = tc.run(timeout=timeout,
                                   is_json=True,
                                   update_env=update_env)
        if status != 0:
            self.log.error(err)
            raise FatalTaskError(err)

        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            items = [self._merge_python_items(mercator_target, data)]
            if items == [None]:
                raise NotABugFatalTaskError(
                    'Found no usable PKG-INFO/metadata.json/requirements.txt')
        else:
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(data.get('items') or [])
            else:
                items = data.get('items') or []
            self.log.debug('mercator found %i projects, outermost %i',
                           len(data), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM; we consider POM to be *the*
                # source of information and don't want to duplicate info by including
                # data from the pom included in the artifact (assuming it's included)
                items = [d for d in items if d['ecosystem'].lower() == 'java-pom']
            elif ecosystem_object.is_backed_by(EcosystemBackend.npm):
                # ignore other metadata files, e.g. requirements.txt
                items = [d for d in items if d['ecosystem'].lower() == 'npm']
            elif arguments['ecosystem'] == 'go':
                items = [d for d in items if d['ecosystem'].lower() == 'go-glide']
                if not items:
                    # Mercator found no Go Glide files, run gofedlib
                    items = self.run_gofedlib(topdir=mercator_target,
                                              name=arguments.get('name'),
                                              version=arguments.get('version'),
                                              timeout=timeout)

        result_data['details'] = [self._data_normalizer.handle_data(d, keep_path=keep_path)
                                  for d in items]
        result_data['status'] = 'success'
        return result_data

    def run_gofedlib(self, topdir, name, version, timeout):
        """Run gofedlib-cli to extract dependencies from golang sources."""
        tc = TimedCommand(['gofedlib-cli', '--dependencies-main', '--dependencies-packages',
                           '--dependencies-test', '--skip-errors', topdir])
        status, data, err = tc.run(timeout=timeout)

        result = json.loads(data[0])
        main_deps_count = len(result.get('deps-main', []))
        packages_count = len(result.get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies',
                       main_deps_count + packages_count)

        result['code_repository'] = {'type': 'git',
                                     'url': 'https://{name}'.format(name=name)}
        result['name'] = name
        result['version'] = version
        return [{'ecosystem': 'gofedlib', 'result': result}]
class DependencySnapshotTask(BaseTask):
    """Task that analyzes dependencies."""

    _analysis_name = 'dependency_snapshot'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _collect_dependencies(self):
        """Return all dependencies for the current analysis flow (operates on the
        parent mercator result).

        :return: List[str], list of dependencies
        """
        wr = self.parent_task_result('metadata')
        if not isinstance(wr, dict):
            raise TaskError('metadata task result has unexpected type: {}; expected dict'
                            .format(type(wr)))

        # there can be details about multiple manifests in the metadata,
        # therefore we will collect dependency specifications from all of them
        # and exclude obvious duplicates along the way
        dependencies = list({dep for m in wr.get('details', []) if m.get('dependencies')
                             for dep in m.get('dependencies', [])})
        return dependencies

    @staticmethod
    def _resolve_dependency(ecosystem, dep):
        ret = {'ecosystem': ecosystem.name,
               'declaration': dep,
               'resolved_at': json_serial(datetime.datetime.utcnow())}

        # first, if this is a GitHub dependency, return it right away
        # (we don't resolve these yet)
        if ' ' in dep:
            # we have both package name and version (version can be an URL)
            name, spec = dep.split(' ', 1)
            if gh_dep.match(spec):
                ret['name'] = name
                ret['version'] = 'https://github.com/' + spec
            elif urllib.parse.urlparse(spec).scheme != '':
                ret['name'] = name
                ret['version'] = spec
        else:
            if gh_dep.match(dep):
                ret['name'] = 'https://github.com/' + dep
                ret['version'] = None
            elif urllib.parse.urlparse(dep).scheme != '':
                ret['name'] = dep
                ret['version'] = None
        if 'name' in ret:
            return ret

        # second, figure out the latest upstream version matching the spec
        # and return it
        solver = get_ecosystem_solver(ecosystem)
        pkgspec = solver.solve([dep])
        if not pkgspec:
            raise TaskError("invalid dependency: {}".format(dep))

        package, version = pkgspec.popitem()
        if not version:
            raise TaskError("could not resolve {}".format(dep))

        ret['name'] = package
        ret['version'] = version
        return ret

    def execute(self, arguments):
        """Start the task that analyzes dependencies.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))

        result = {'summary': {'errors': [], 'dependency_counts': {}},
                  'status': 'success',
                  'details': {}}

        ecosystem = self.storage.get_ecosystem(arguments.get('ecosystem'))
        try:
            deps = self._collect_dependencies()
        except TaskError as e:
            self.log.error(str(e))
            raise FatalTaskError from e

        resolved_deps = []
        for dep in deps:
            try:
                resolved = self._resolve_dependency(ecosystem, dep)
            except TaskError as e:
                self.log.error(str(e))
                result['summary']['errors'].append(str(e))
                result['status'] = 'error'
                # Is this fatal, i.e. should we 'raise FatalTaskError from e' ?
                break
            self.log.info('resolved dependency %r as %s', dep, resolved)
            resolved_deps.append(resolved)

        # in future, we may want to provide also build/test dependencies, not just runtime
        result['details']['runtime'] = resolved_deps
        result['summary']['dependency_counts']['runtime'] = len(resolved_deps)
        return result
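# A sketch of the dependency-declaration shapes `_resolve_dependency` handles.
# `gh_dep` is a module-level regex defined elsewhere, so a hypothetical
# stand-in that matches "owner/repo"-style GitHub specs is used here:
import re

gh_dep_sketch = re.compile(r'^[\w.-]+/[\w.-]+$')

dep = 'lodash lodash/lodash'  # made-up "name spec" declaration
name, spec = dep.split(' ', 1)
if gh_dep_sketch.match(spec):
    resolved = {'name': name, 'version': 'https://github.com/' + spec}
assert resolved == {'name': 'lodash', 'version': 'https://github.com/lodash/lodash'}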
class PackageKeywordsTaggingTask(KeywordsTaggingTaskBase):
    """Compute tags based on gathered natural text - strictly package level keywords."""

    _analysis_name = 'package_keywords_tagging'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _package_level_keywords(self, keywords_file_name, stopwords_file_name, arguments):
        # Keep f8a_tagger import local as other components dependent on
        # f8a_worker do not require it installed.
        from f8a_tagger import lookup_readme as keywords_lookup_readme
        from f8a_tagger import lookup_text as keywords_lookup_text

        details = {}
        package_postgres = StoragePool.get_connected_storage('PackagePostgres')
        gh_info = package_postgres.get_task_result_by_analysis_id(arguments['ecosystem'],
                                                                  arguments['name'],
                                                                  'github_details',
                                                                  arguments['document_id'])
        if gh_info:
            self.log.debug("Aggregating explicitly stated keywords (topics) on GitHub")
            details['gh_topics'] = gh_info.get('details', {}).get('topics', [])

        s3_readme = StoragePool.get_connected_storage('S3Readme')
        try:
            readme_json = s3_readme.retrieve_readme_json(arguments['ecosystem'],
                                                         arguments['name'])
            if readme_json:
                self.log.debug("Computing keywords from README.json")
                details['README'] = keywords_lookup_readme(readme_json,
                                                           keywords_file=keywords_file_name,
                                                           stopwords_file=stopwords_file_name,
                                                           **self._LOOKUP_CONF)
        except Exception as exc:
            self.log.info("Failed to retrieve README: %s", str(exc))

        s3_rd = StoragePool.get_connected_storage('S3RepositoryDescription')
        try:
            description = s3_rd.retrieve_repository_description(arguments['ecosystem'],
                                                                arguments['name'])
            if description:
                self.log.debug("Computing keywords on description from repository")
                details['repository_description'] = keywords_lookup_text(
                    description,
                    keywords_file=keywords_file_name,
                    stopwords_file=stopwords_file_name,
                    **self._LOOKUP_CONF)
        except Exception as exc:
            self.log.info("Failed to retrieve repository description: %s", str(exc))

        if self.task_name == 'package_keywords_tagging':
            # We are tagging on package level; also add tags found in the package name
            name_parts = re.split(r'[\.\-_:]', arguments['name'])
            self.log.debug("Computing keywords from package name %s", name_parts)
            details['package_name'] = keywords_lookup_text(" ".join(name_parts),
                                                           keywords_file=keywords_file_name,
                                                           stopwords_file=stopwords_file_name,
                                                           **self._LOOKUP_CONF)

        return details

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        keywords_file_name, stopwords_file_name = self._get_config_files(
            arguments['ecosystem'])
        details = self._package_level_keywords(keywords_file_name, stopwords_file_name,
                                               arguments)

        return {'status': 'success', 'summary': [], 'details': details}
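# The package-name split used above, applied to a made-up Maven artifact id;
# the character class splits on '.', '-', '_', and ':':
import re

assert re.split(r'[\.\-_:]', 'org.apache.commons:commons-lang3') == \
    ['org', 'apache', 'commons', 'commons', 'lang3']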
class CVEcheckerTask(BaseTask):
    """Security issues scanner."""

    _analysis_name = 'security_issues'
    schema_ref = SchemaRef(_analysis_name, '3-0-1')

    dependency_check_jvm_mem_limit = '-Xmx768m'

    @staticmethod
    def get_cve_impact(cve_id):
        """Get more details about cve_id from NVD."""
        score = 0
        vector = ''
        severity = ''
        if cve_id:
            url = "https://nvd.nist.gov/vuln/detail/{cve_id}".format(cve_id=cve_id)
            response = requests.get(url)
            if response.status_code != 200:
                raise IOError('Unable to reach URL: {url}'.format(url=url))

            score_v3 = score_v2 = 0
            severity_v3 = severity_v2 = vector_v3 = vector_v2 = ''
            page = BeautifulSoup(response.text, 'html.parser')
            for tag in page.find_all(href=re_compile('calculator')):
                if tag.attrs.get('data-testid') == 'vuln-cvssv3-base-score-link':
                    score_v3 = float(tag.text.strip())
                    severity_v3 = tag.find_next().text.lower()
                elif tag.attrs.get('data-testid') == 'vuln-cvssv3-vector-link':
                    vector_v3 = tag.text.strip()
                elif tag.attrs.get('data-testid') == 'vuln-cvssv2-base-score-link':
                    score_v2 = float(tag.text.strip())
                    severity_v2 = tag.find_next().text.lower()
                elif tag.attrs.get('data-testid') == 'vuln-cvssv2-vector-link':
                    vector_v2 = tag.text.strip().lstrip('(').rstrip(')')

            # Prefer CVSS v3.0 over v2
            score = score_v3 or score_v2
            severity = severity_v3 or severity_v2
            vector = vector_v3 or vector_v2

        return score, vector, severity

    @staticmethod
    def _filter_ossindex_fields(entry):
        """Create a result record for an ossindex entry."""
        score, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        result = {
            'id': entry.get('cve') or entry.get('title'),
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                'score': score,
                'vector': vector
            },
            'severity': severity
        }
        return result

    @staticmethod
    def _filter_victims_db_entry(entry):
        """Create a result record for a Victims CVE DB entry."""
        if 'cve' not in entry:
            return None
        _, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        result = {
            'id': 'CVE-' + entry['cve'],
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                'score': entry.get('cvss_v3') or entry.get('cvss_v2'),
                'vector': vector
            },
            'severity': severity,
            'attribution': "https://github.com/victims/victims-cve-db, CC BY-SA 4.0, modified"
        }
        return result

    @staticmethod
    def query_url(url):
        """Query url and return json."""
        response = requests.get(url)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _query_ossindex_package(ecosystem, name):
        """Get vulnerabilities for a given package ecosystem:name from OSSIndex."""
        url = "https://ossindex.net/v2.0/package/{pm}/{package}".format(pm=ecosystem,
                                                                        package=name)
        return CVEcheckerTask.query_url(url)

    @staticmethod
    def query_ossindex_vulnerability_fromtill(ecosystem, from_time=0, till_time=-1):
        """From OSSIndex get vulnerabilities which changed between from_time and till_time."""
        # OSS Index uses timestamps in milliseconds
        from_time = int(from_time * 1000)
        till_time = int(till_time * 1000)
        url = "https://ossindex.net/v2.0/vulnerability/pm/{pm}/fromtill/{from_time}/{till_time}".\
            format(pm=ecosystem, from_time=from_time, till_time=till_time)
        packages = []
        while url:
            response = CVEcheckerTask.query_url(url)

            for package in response.get('packages', []):
                # Sanity check: the response always contains at least one entry, even
                # if it should be empty (when 'from_time' is higher than the 'updated'
                # time of all entries in db). Rebuild the list instead of removing
                # entries while iterating over it.
                package['vulnerabilities'] = [
                    v for v in package.get('vulnerabilities', [])
                    if int(v.get('updated')) >= from_time
                ]
                if package.get('vulnerabilities', []):
                    packages.append(package)

            url = response.get('next')
        return packages

    def _query_ossindex(self, arguments):
        """Query OSS Index REST API."""
        entries = {}
        solver = get_ecosystem_solver(self.storage.get_ecosystem(arguments['ecosystem']),
                                      with_parser=OSSIndexDependencyParser())
        for package in self._query_ossindex_package(arguments['ecosystem'],
                                                    arguments['name']):
            for vulnerability in package.get('vulnerabilities', []):
                for version_string in vulnerability.get('versions', []):
                    try:
                        affected_versions = solver.solve(
                            ["{} {}".format(arguments['name'], version_string)],
                            all_versions=True)
                    except Exception:
                        self.log.exception("Failed to resolve %r for %s:%s",
                                           version_string, arguments['ecosystem'],
                                           arguments['name'])
                        continue
                    if arguments['version'] in affected_versions.get(arguments['name'], []):
                        entry = self._filter_ossindex_fields(vulnerability)
                        if entry.get('id'):
                            entries[entry['id']] = entry

        return {'summary': list(entries.keys()),
                'status': 'success',
                'details': list(entries.values())}

    def _npm_scan(self, arguments):
        """Get vulnerabilities info about given npm package."""
        return self._query_ossindex(arguments)

    @staticmethod
    def update_depcheck_db_on_s3():
        """Update OWASP Dependency-Check DB on S3."""
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        depcheck = configuration.dependency_check_script_path
        with TemporaryDirectory() as temp_data_dir:
            s3.retrieve_depcheck_db_if_exists(temp_data_dir)
            old_java_opts = os.getenv('JAVA_OPTS', '')
            os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
            # give DependencyCheck 25 minutes to download the DB
            if TimedCommand.get_command_output([depcheck, '--updateonly',
                                                '--data', temp_data_dir],
                                               timeout=1500):
                s3.store_depcheck_db(temp_data_dir)
            os.environ['JAVA_OPTS'] = old_java_opts

    def _run_owasp_dep_check(self, scan_path, experimental=False):
        """Run OWASP Dependency-Check."""
        def _clean_dep_check_tmp():
            for dcdir in glob(os.path.join(gettempdir(), 'dctemp*')):
                rmtree(dcdir)

        s3 = StoragePool.get_connected_storage('S3VulnDB')
        depcheck = configuration.dependency_check_script_path
        with TemporaryDirectory() as temp_data_dir:
            if not s3.retrieve_depcheck_db_if_exists(temp_data_dir):
                self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
                self.update_depcheck_db_on_s3()
                s3.retrieve_depcheck_db_if_exists(temp_data_dir)

            report_path = os.path.join(temp_data_dir, 'report.xml')
            command = [depcheck,
                       '--noupdate',
                       '--format', 'XML',
                       '--project', 'CVEcheckerTask',
                       '--data', temp_data_dir,
                       '--scan', scan_path,
                       '--out', report_path]
            if experimental:
                command.extend(['--enableExperimental'])
            for suppress_xml in glob(os.path.join(os.environ['OWASP_DEP_CHECK_SUPPRESS_PATH'],
                                                  '*.xml')):
                command.extend(['--suppress', suppress_xml])

            output = []
            old_java_opts = os.getenv('JAVA_OPTS', '')
            try:
                self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities',
                               scan_path)
                os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         timeout=600)  # 10 minutes
                with open(report_path) as r:
                    report_dict = anymarkup.parse(r.read())
            except (TaskError, FileNotFoundError) as e:
                _clean_dep_check_tmp()
                for line in output:
                    self.log.warning(line)
                self.log.exception(str(e))
                raise FatalTaskError('OWASP Dependency-Check scan failed') from e
            finally:
                os.environ['JAVA_OPTS'] = old_java_opts
            _clean_dep_check_tmp()

        results = []
        dependencies = report_dict.get('analysis', {}).get('dependencies')  # can be None
        dependencies = dependencies.get('dependency', []) if dependencies else []
        if not isinstance(dependencies, list):
            dependencies = [dependencies]
        for dependency in dependencies:
            vulnerabilities = dependency.get('vulnerabilities')  # value can be None
            vulnerabilities = vulnerabilities.get('vulnerability', []) if vulnerabilities \
                else []
            if not isinstance(vulnerabilities, list):
                vulnerabilities = [vulnerabilities]
            for vulnerability in vulnerabilities:
                av = vulnerability.get('cvssAccessVector')
                av = av[0] if av else '?'
                ac = vulnerability.get('cvssAccessComplexity')
                ac = ac[0] if ac else '?'
                au = vulnerability.get('cvssAuthentication')
                au = au[0] if au else '?'
                c = vulnerability.get('cvssConfidentialImpact')
                c = c[0] if c else '?'
                i = vulnerability.get('cvssIntegrityImpact')
                i = i[0] if i else '?'
                a = vulnerability.get('cvssAvailabilityImpact')
                a = a[0] if a else '?'
                vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".\
                    format(AV=av, AC=ac, Au=au, C=c, Integrity=i, A=a)
                result = {
                    'cvss': {
                        'score': vulnerability.get('cvssScore'),
                        'vector': vector
                    }
                }
                references = vulnerability.get('references', {}).get('reference', [])
                if not isinstance(references, list):
                    references = [references]
                result['references'] = [r.get('url') for r in references]
                for field in ['severity', 'description']:
                    result[field] = vulnerability.get(field)
                result['id'] = vulnerability.get('name')
                results.append(result)

        return {'summary': [r['id'] for r in results],
                'status': 'success',
                'details': results}

    @staticmethod
    def update_victims_cve_db_on_s3():
        """Update Victims CVE DB on S3."""
        repo_url = 'https://github.com/victims/victims-cve-db.git'
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        with TemporaryDirectory() as temp_dir:
            Git.clone(repo_url, temp_dir, depth="1")
            s3.store_victims_db(temp_dir)

    def _run_victims_cve_db_cli(self, arguments):
        """Run Victims CVE DB CLI."""
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        output = []
        with TemporaryDirectory() as temp_victims_db_dir:
            if not s3.retrieve_victims_db_if_exists(temp_victims_db_dir):
                self.log.debug('No Victims CVE DB found on S3, cloning from github')
                self.update_victims_cve_db_on_s3()
                s3.retrieve_victims_db_if_exists(temp_victims_db_dir)

            try:
                cli = os.path.join(temp_victims_db_dir, 'victims-cve-db-cli.py')
                command = [cli, 'search',
                           '--ecosystem', 'java',
                           '--name', arguments['name'],
                           '--version', arguments['version']]
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         is_json=True,
                                                         timeout=60)  # 1 minute
            except TaskError as e:
                self.log.exception(e)

        return output

    def _maven_scan(self, arguments):
        """Run OWASP dependency-check & Victims CVE DB CLI."""
        jar_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        results = self._run_owasp_dep_check(jar_path, experimental=False)
        if results.get('status') != 'success':
            return results

        # merge with Victims CVE DB results
        victims_cve_db_results = self._run_victims_cve_db_cli(arguments)
        for vulnerability in victims_cve_db_results:
            vulnerability = self._filter_victims_db_entry(vulnerability)
            if not vulnerability:
                continue
            if vulnerability['id'] not in results['summary']:
                results['summary'].append(vulnerability['id'])
                results['details'].append(vulnerability)

        return results

    def _python_scan(self, arguments):
        """Run OWASP dependency-check experimental analyzer for Python artifacts.

        https://jeremylong.github.io/DependencyCheck/analyzers/python.html
        """
        extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        # depcheck needs to be pointed to a specific file, we can't just scan
        # the whole directory
        egg_info = pkg_info = metadata = None
        for root, _, files in os.walk(extracted_tarball):
            if root.endswith('.egg-info') or root.endswith('.dist-info'):
                egg_info = root
            if 'PKG-INFO' in files:
                pkg_info = os.path.join(root, 'PKG-INFO')
            if 'METADATA' in files:
                metadata = os.path.join(root, 'METADATA')

        scan_path = egg_info or pkg_info or metadata
        if pkg_info and not egg_info:
            # Work-around for dependency-check ignoring PKG-INFO outside .dist-info/
            # https://github.com/jeremylong/DependencyCheck/issues/896
            egg_info_dir = os.path.join(extracted_tarball, arguments['name'] + '.egg-info')
            try:
                os.mkdir(egg_info_dir)
                copy(pkg_info, egg_info_dir)
                scan_path = egg_info_dir
            except os.error:
                self.log.warning('Failed to copy %s to %s', pkg_info, egg_info_dir)

        if not scan_path:
            raise FatalTaskError('File types not supported by OWASP dependency-check')

        return self._run_owasp_dep_check(scan_path, experimental=True)

    def _nuget_scan(self, arguments):
        """Get vulnerabilities info about given nuget package."""
        return self._query_ossindex(arguments)

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        if arguments['ecosystem'] == 'maven':
            return self._maven_scan(arguments)
        elif arguments['ecosystem'] == 'npm':
            return self._npm_scan(arguments)
        elif arguments['ecosystem'] == 'pypi':
            return self._python_scan(arguments)
        elif arguments['ecosystem'] == 'nuget':
            return self._nuget_scan(arguments)
        else:
            raise RequestError('Unsupported ecosystem')
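# The OWASP-report fields are reduced to single letters to build a CVSS v2
# vector string; a standalone rendering of that step on made-up field values:
fields = {'cvssAccessVector': 'NETWORK', 'cvssAccessComplexity': 'LOW',
          'cvssAuthentication': 'NONE', 'cvssConfidentialImpact': 'PARTIAL',
          'cvssIntegrityImpact': 'PARTIAL', 'cvssAvailabilityImpact': 'PARTIAL'}

def first_letter(key):
    # missing fields render as '?', matching the task's fallback
    return (fields.get(key) or '?')[0]

vector = "AV:{}/AC:{}/Au:{}/C:{}/I:{}/A:{}".format(
    first_letter('cvssAccessVector'), first_letter('cvssAccessComplexity'),
    first_letter('cvssAuthentication'), first_letter('cvssConfidentialImpact'),
    first_letter('cvssIntegrityImpact'), first_letter('cvssAvailabilityImpact'))
assert vector == "AV:N/AC:L/Au:N/C:P/I:P/A:P"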
class CodeMetricsTask(BaseTask): """Compute various code metrics for a project.""" _analysis_name = 'code_metrics' schema_ref = SchemaRef(_analysis_name, '1-0-0') _CLI_TIMEOUT = 300 def _run_analyzer(self, command, json_output=True): """Run command (analyzer), if a JSON output is expected, parse it. :param command: command to be run (command with argument vector as array) :param json_output: True if output should be parsed :return: status, output, error triplet """ self.log.debug("Executing command, timeout={timeout}: {cmd}".format( timeout=self._CLI_TIMEOUT, cmd=command)) cmd = TimedCommand(command) status, output, error = cmd.run(timeout=self._CLI_TIMEOUT) self.log.debug("status: %d, output: %s, error: %s", status, output, error) if status != 0: self.log.warning( "Executing command failed, return value: %d, stderr: '%s' ", status, error) # Some tools such as complexity-report write zero bytes to output (they # are propagated from sources like for npm/glob/7.0.3). This caused # failures when pushing results to Postgres as Postgres cannot store # null bytes in results. Let's be safe here. output = list(line.replace('\\u0000', '\\\\0') for line in output) if json_output: if output: output = "".join(output) output = json.loads(output) else: output = {} return status, output, error def _get_generic_result(self, source_path): """Get core result of CodeMetricsTask task that is based on cloc tool. This output is later enriched with output of tools based on languages that were found by cloc :param source_path: path to sources where analyzed artefact resists :return: tuple where generic information with ecosystem specific dict """ command = ['cloc', '--json', source_path] status, output, error = self._run_analyzer(command) if status != 0: # Let the whole task fail raise RuntimeError("Running cloc command failed: '%s'" % error) # cloc places generic summary here, we will maintain it in top level so # remove misleading key header = { 'total_files': output['header'].pop('n_files'), 'total_lines': output['header'].pop('n_lines') } output.pop('header') if 'SUM' in output: header['blank_lines'] = output['SUM']['blank'] header['comment_lines'] = output['SUM']['comment'] header['code_lines'] = output['SUM']['code'] output.pop('SUM', None) # rename to be more precise with naming wanted_keys = (('blank', 'blank_lines'), ('code', 'code_lines'), ('comment', 'comment_lines'), ('nFiles', 'files_count')) for key in output.keys(): # filter only language-specific results, leave statistics untouched if isinstance(output[key], dict): output[key] = DataNormalizer.transform_keys( output[key], wanted_keys) return header, output @staticmethod def _normalize_complexity_report_output(output, source_path): """Normalize complexity_report output. 
See https://github.com/escomplex/escomplex/blob/master/README.md#metrics :param output: output dict to be normalized :param source_path: path to sources that was used :return: normalized output """ # For metrics meaning see: wanted_keys = (('maintainability', 'project_maintainability'), ('changeCost', 'cost_change'), ('cyclomatic', 'average_cyclomatic_complexity'), ('effort', 'average_halstead_effort'), ('firstOrderDensity', 'first_order_density'), ('loc', 'average_function_lines_of_code'), ('params', 'average_function_parameters_count'), ('reports', 'modules')) output = DataNormalizer.transform_keys(output, wanted_keys) wanted_module_keys = (('maintainability', 'module_maintainability'), ('dependencies', ), ('loc', 'average_function_lines_of_code'), ('path', ), ('params', 'average_function_parameters_count'), ('functions', )) for idx, module in enumerate(output.get('modules', [])): output['modules'][idx] = DataNormalizer.transform_keys( module, wanted_module_keys) source_path_len = len(source_path) + 1 if 'path' in module: output['modules'][idx]['path'] = module['path'][ source_path_len:] for fun_idx, function in enumerate(module.get('functions')): if 'cyclomaticDensity' in function: function['cyclomatic_density'] = function.pop( 'cyclomaticDensity') return output @staticmethod def _normalize_javancss_output(output): """Parse and normalize JavaNCSS ASCII output. :param output: output dict to be normalized :return: normalized output """ output = output.get('javancss', {}) result = {'functions': {}, 'objects': {}, 'packages': {}} # The output of JavaNCSS is an XML, which is parsed using anymarkup. # This can introduce some pitfalls here if there is found exactly one # item of a type. E.g.: # # <functions> # <function>...<function/> # <functions> # # Is parsed as object 'functions' containing *one object* 'function', whereas: # # <functions> # <function>...<function/> # <function>...<function/> # <functions> # # Is parsed as object 'functions' containing a *list of objects* # 'function'. Thus the isinstance(.., list) checks. 
# Parse functions section if 'functions' in output: functions = output['functions'] wanted_function_keys = (('ccn', 'cyclomatic_complexity'), ('javadocs', ), ('name', )) result['functions']['function'] = [] if 'function' in functions: if not isinstance(functions['function'], list): functions['function'] = [functions['function']] for function in functions['function']: result['functions']['function'].append( DataNormalizer.transform_keys(function, wanted_function_keys)) function_averages = functions.get('function_averages', {}) result['functions'][ 'average_cyclomatic_complexity'] = function_averages.get('ccn') result['functions']['average_javadocs'] = function_averages.get( 'javadocs') # Parse objects section if 'objects' in output: objects = output['objects'] wanted_objects_keys = (('classes', ), ('functions', ), ('name', ), ('javadocs', )) result['objects']['object'] = [] if 'object' in objects: if not isinstance(objects['object'], list): objects['object'] = [objects['object']] for obj in objects['object']: result['objects']['object'].append( DataNormalizer.transform_keys(obj, wanted_objects_keys)) object_averages = objects.get('averages', {}) result['objects']['average_classes'] = object_averages.get( 'classes') result['objects']['average_functions'] = object_averages.get( 'functions') result['objects']['average_javadocs'] = object_averages.get( 'javadocs') # Parse packages section if 'packages' in output: packages = output['packages'] packages_total = packages.get('total', {}) result['packages']['classes'] = packages_total.get('classes') result['packages']['functions'] = packages_total.get('functions') result['packages']['javadoc_lines'] = packages_total.get( 'javadoc_lines') result['packages']['javadocs'] = packages_total.get('javadocs') result['packages']['multi_comment_lines'] = packages_total.get( 'multi_comment_lines') result['packages']['single_comment_lines'] = packages_total.get( 'single_comment_lines') return result def _normalize_mccabe_output(self, output): result = [] for line in output: # NOTE: due to the way print works in python 2 vs python 3, the mccabe under # python 2 returns `(<coords> <name> <complexity>)`, while the python 3 # version returns the same without the brackets coords, func_name, complexity = line.split() result.append({ 'name': func_name.strip("'"), 'complexity': int(complexity.strip(')')) }) return result def complexity_report(self, source_path): """Run complexity_report tool https://www.npmjs.com/package/complexity-report . :param source_path: path to source codes :return: normalized output """ command = ['cr', '--format=json', source_path] status, output, error = self._run_analyzer(command) if status != 0: self.log.warning("Runing complexity report tool failed: %s", error) return {} if output: output = self._normalize_complexity_report_output( output, source_path) return output def javancss(self, source_path): """Run JavaNCSS tool http://www.kclee.de/clemens/java/javancss . :param source_path: path to source codes :return normalized output """ javancss_path = os.path.join(self.configuration.JAVANCSS_PATH, 'bin', 'javancss') command = [javancss_path, '-all', '-xml', source_path] status, output, error = self._run_analyzer(command, json_output=False) if status != 0: self.log.warning("JavaNCSS tool reported some errors: %s", error) if output: output = anymarkup.parse("".join(output)) output = self._normalize_javancss_output(output) return output def python_mccabe(self, source_path): """Run mccabe tool https://pypi.python.org/pypi/mccabe . 
        :param source_path: path to the sources
        :return: normalized output
        """
        result = {'files': []}
        # we compute the total average cyclomatic complexity manually, as
        # <total complexity> / <total number of functions>
        command = ['python3', '-m', 'mccabe']
        # mccabe has to be run on individual files; it doesn't work recursively on directories
        for root, dirs, files in os.walk(source_path):
            for f in files:
                if f.endswith('.py'):
                    to_run = command + [os.path.join(root, f)]
                    status, output, error = self._run_analyzer(to_run, json_output=False)
                    if status != 0:
                        self.log.info('Analyzing with Py3 failed, trying to analyze with Py2 ...')
                        to_run[0] = 'python2'
                        status, output, error = self._run_analyzer(to_run, json_output=False)
                        if status != 0:
                            self.log.error('Failed to analyze with both Py2 and Py3')
                            continue
                    normalized = self._normalize_mccabe_output(output)

                    # compute the file's average cyclomatic complexity and add
                    # the numbers to the overall package complexity
                    f_complexity = functools.reduce(lambda x, y: x + y['complexity'],
                                                    normalized, 0)
                    f_functions = len(normalized)
                    f_acc = round(f_complexity / f_functions, 1) if f_functions > 0 else 0
                    result['files'].append({
                        'name': os.path.join(root, f)[len(source_path):].strip('/'),
                        'functions': normalized,
                        'average_cyclomatic_complexity': f_acc
                    })
        return result

    # A table of handler functions that are called based on the languages cloc
    # found; the keys have to match the keys in the cloc output. Each handler
    # expects one argument - the path to the directory where the sources sit -
    # and returns a dict. When you write new analyzer handlers, make sure their
    # result keys don't collide, as the results are aggregated under the
    # "metrics" key.
    # See the 'Recognized languages' section at http://cloc.sourceforge.net/
    _LANGUAGE_ANALYZER_HANDLERS = {
        "JavaScript": [complexity_report],
        "Ruby": [],
        "Java": [javancss],
        "Python": [python_mccabe],
        "Go": [],
        "Rust": []
    }

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        source_path = ObjectCache.get_from_dict(arguments).get_sources()
        header, language_stats = self._get_generic_result(source_path)

        for language in language_stats.keys():
            for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
                metrics_data = handler(self, source_path)
                if not metrics_data:
                    continue
                if 'metrics' not in language_stats[language]:
                    language_stats[language]['metrics'] = {}
                language_stats[language]['metrics'].update(metrics_data)

        # We don't want arbitrary language names as dict keys in the schema, and
        # we want to avoid enumerating all languages that cloc supports, so
        # convert the dict to a list of language-specific entries.
        result = {'languages': []}
        for language in language_stats.keys():
            record = language_stats.get(language)
            record['language'] = language
            result['languages'].append(record)

        return {'summary': header, 'status': 'success', 'details': result}

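# A minimal, self-contained sketch (illustrative only, not part of the task
# above) of the dispatch pattern _LANGUAGE_ANALYZER_HANDLERS relies on: the
# handlers are stored as plain (unbound) functions in a class-level dict, so
# they are invoked with the instance passed explicitly, as handler(self, path).


class _HandlerTableDemo:
    def _analyze_js(self, source_path):
        # stand-in for a real handler such as complexity_report()
        return {'complexity_report': {'source': source_path}}

    _HANDLERS = {
        "JavaScript": [_analyze_js],  # the function object itself, not a bound method
        "Ruby": [],
    }

    def run(self, language, source_path):
        metrics = {}
        for handler in self._HANDLERS.get(language, []):
            metrics.update(handler(self, source_path))
        return metrics


assert _HandlerTableDemo().run("JavaScript", "/tmp/src") == \
    {'complexity_report': {'source': '/tmp/src'}}
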
class DownstreamUsageTask(BaseTask):
    """Queries Red Hat's internal toolchain for downstream component usage.

    - queries Anitya for downstream package names
    - uses the package name and component version to query:
      - Brew for internal SRPM and build details
      - the Pulp CDN for redistribution details
    """

    _analysis_name = 'redhat_downstream'
    schema_ref = SchemaRef(_analysis_name, '2-2-1')
    _backend_to_anitya_ecosystem = {
        EcosystemBackend.npm: 'npm',
        EcosystemBackend.maven: 'maven',
        EcosystemBackend.pypi: 'pypi',
        EcosystemBackend.rubygems: 'rubygems',
        EcosystemBackend.nuget: 'nuget'
    }
    _ecosystem_to_prefix = {
        'npm': 'nodejs',
        'pypi': 'python',
        'rubygems': 'rubygem'
    }
    # Give the CLI 10 minutes to retrieve results
    _BREWUTILS_CLI_TIMEOUT = 600

    def _get_artifact_hash(self, algorithm=None):
        wr = self.parent_task_result('digests')
        if wr:
            for details in wr['details']:
                if details.get('artifact'):
                    return details[algorithm or 'md5']
        return None

    @staticmethod
    def _prefix_package_name(name, ecosystem):
        prefix = DownstreamUsageTask._ecosystem_to_prefix.get(ecosystem, '')
        if prefix:
            return '{p}-{n}'.format(p=prefix, n=name)
        return name

    def _fetch_anitya_project(self, ecosystem, package):
        eco_model = self.storage.get_ecosystem(ecosystem)
        backend = self._backend_to_anitya_ecosystem.get(eco_model.backend, None)
        if backend is None:
            raise ValueError('Don\'t know how to map ecosystem {e} with backend {b} to Anitya'
                             .format(e=ecosystem, b=eco_model.backend))
        api_path = '/api/by_ecosystem/{e}/{p}/'.format(e=ecosystem, p=package)
        anitya_url = self.configuration.ANITYA_URL
        try:
            return _query_anitya_url(anitya_url, api_path)
        except (requests.HTTPError, requests.ConnectionError):
            msg = 'Failed to contact Anitya server at {}'
            self.log.exception(msg.format(self.configuration.ANITYA_URL))
        return None

    def _get_cdn_metadata(self, srpm_filename):
        """Try to retrieve Pulp CDN metadata."""
        try:
            pulp = Pulp()
        except ValueError as e:
            self.log.error(e)
            return None
        try:
            metadata = pulp.get_cdn_metadata_for_srpm(srpm_filename)
        except Exception as e:
            self.log.exception(e)
            return None
        return metadata

    def _add_mvn_results(self, result_summary, anitya_mvn_names, version):
        def _compare_version(downstream, upstream):
            dv = downstream
            if 'redhat' in dv:
                # remove ".redhat-X" or "-redhat-X" suffix
                dv = dv[:dv.find('redhat') - 1]
            return dv == upstream

        downstream_rebuilds = []

        for name in anitya_mvn_names:
            ga = MavenCoordinates.from_str(name).to_repo_url(ga_only=True)
            metadata_url = '{repo}/{pkg}/maven-metadata.xml'.format(repo=RH_MVN_GA_REPO, pkg=ga)
            res = requests.get(metadata_url)
            if res.status_code != 200:
                self.log.info('Metadata for package {pkg} not found in {repo} (status {code})'
                              .format(pkg=name, repo=RH_MVN_GA_REPO, code=res.status_code))
                continue
            versions = anymarkup.parse(res.text)['metadata']['versioning']['versions']['version']
            # make sure 'versions' is a list (it's a string if there is just one version)
            if not isinstance(versions, list):
                versions = [versions]
            self.log.info('Found versions {v} for package {p}'.format(v=versions, p=name))
            for v in versions:
                if _compare_version(v, version):
                    downstream_rebuilds.append(v)

        result_summary['rh_mvn_matched_versions'] = downstream_rebuilds
        if downstream_rebuilds:
            # For now, we don't distinguish products; we just use the general
            # "Middleware" for all Maven artifacts
            result_summary['all_rhsm_product_names'].append('Middleware')

    @staticmethod
    def _is_inside_rh():
        """Return True if running on the RH network, False otherwise."""
        is_inside = False
        try:
            is_inside = int(os.environ.get("OPENSHIFT_DEPLOYMENT", 0)) == 0
        except ValueError:
            pass
        return is_inside

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        tool_responses = {}
        result_summary = {
            'package_names': [],
            'registered_srpms': [],
            'all_rhn_channels': [],
            'all_rhsm_content_sets': [],
            'all_rhsm_product_names': []
        }
        result_data = {'status': 'error',
                       'summary': result_summary,
                       'details': tool_responses}

        # bail out early; we need access to internal services, or the package
        # has to come from the Maven ecosystem, otherwise we can't comment on
        # downstream usage
        is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
        if not self._is_inside_rh() and not is_maven:
            return result_data

        self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
        res = self._fetch_anitya_project(eco, pkg)
        anitya_rpm_names = []
        anitya_mvn_names = []
        if res is None:
            result_data['status'] = 'error'
        elif res.status_code == 200:
            self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
            anitya_response = res.json()
            tool_responses['redhat_anitya'] = anitya_response
            # For now, we assume all downstreams are ones we care about
            for entry in anitya_response['packages']:
                if entry['distro'] == RH_RPM_DISTRO_NAME:
                    anitya_rpm_names.append(entry['package_name'])
                elif entry['distro'] == RH_MVN_DISTRO_NAME:
                    anitya_mvn_names.append(entry['package_name'])
                else:
                    self.log.warning(
                        'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'
                        .format(d=entry['distro'], o=entry['package_name'], p=pkg))
            self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
            self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
            # TODO: Report 'partial' here and switch to 'success' at the end
            result_data['status'] = 'success'
        else:
            msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
            self.log.error(msg.format(e=eco, p=pkg, r=res.text))
            result_data['status'] = 'error'

        if self._is_inside_rh():
            # we have candidate downstream name mappings, check them against Brew
            seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
            self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

            args = ['brew-utils-cli', '--version', arguments['version']]
            artifact_hash = self._get_artifact_hash(algorithm='sha256')
            if artifact_hash:
                args += ['--digest', artifact_hash]
            args += seed_names

            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
                timeout=self._BREWUTILS_CLI_TIMEOUT, cmd=args))
            tc = TimedCommand(args)
            status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
            self.log.debug("status = %s, error = %s", status, error)
            output = ''.join(output)
            self.log.debug("output = %s", output)
            if not output:
                raise TaskError("Error running command %s" % args)
            brew = json.loads(output)

            result_summary['package_names'] = brew['packages']
            result_summary['registered_srpms'] = brew['response']['registered_srpms']
            tool_responses['brew'] = brew['response']['brew']

            # we have SRPM details, fetch details on where the RPMs are shipped
            tool_responses['pulp_cdn'] = pulp_responses = []
            rhn_channels = set()
            rhsm_content_sets = set()
            rhsm_product_names = set()
            for srpm_summary in result_summary['registered_srpms']:
                srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                             v=srpm_summary['version'],
                                                             r=srpm_summary['release'])
                cdn_metadata = self._get_cdn_metadata(srpm_filename)
                if cdn_metadata is None:
                    msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                    self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                    continue
                pulp_responses.append(cdn_metadata)
                srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
                rhn_channels.update(cdn_metadata['rhn_channels'])
                rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
                rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
            result_summary['all_rhn_channels'] = sorted(rhn_channels)
            result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
            result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

        self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

        return result_data

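# A small, standalone illustration of the suffix stripping done by
# _compare_version above: Red Hat rebuilds tag Maven versions with a
# ".redhat-X" or "-redhat-X" suffix, which is cut off (including the separator
# character before "redhat") prior to comparing against the upstream version.
# The sample versions below are made up.


def _strip_redhat_suffix(version):
    if 'redhat' in version:
        return version[:version.find('redhat') - 1]
    return version


assert _strip_redhat_suffix('1.2.3.redhat-4') == '1.2.3'
assert _strip_redhat_suffix('1.2.3-redhat-4') == '1.2.3'
assert _strip_redhat_suffix('1.2.3') == '1.2.3'
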
class CVEcheckerTask(BaseTask):
    """Security issues scanner."""

    _analysis_name = 'security_issues'
    schema_ref = SchemaRef(_analysis_name, '3-0-1')
    dependency_check_jvm_mem_limit = '-Xmx768m'

    @staticmethod
    def _parse_severity_and_score(input_tag):
        """Parse a BeautifulSoup tag and return the CVE's score and severity from it."""
        score, severity = input_tag.text.strip().split()
        return float(score), severity.lower()

    @staticmethod
    def _parse_vector(input_tag):
        """Parse a BeautifulSoup tag and return the CVE vector from it."""
        vector, *_ = input_tag.text.split()
        return vector.strip().lstrip('(').rstrip(')')

    @staticmethod
    def get_cve_impact(cve_id):
        """Get more details about cve_id from NVD."""
        # TODO: reduce cyclomatic complexity
        score = 0
        vector = ''
        severity = ''
        if cve_id:
            url = "https://nvd.nist.gov/vuln/detail/{cve_id}".format(cve_id=cve_id)
            response = requests.get(url)
            if response.status_code != 200:
                raise IOError('Unable to reach URL: {url}'.format(url=url))

            score_v3 = score_v2 = 0
            severity_v3 = severity_v2 = vector_v3 = vector_v2 = ''
            page = BeautifulSoup(response.text, 'html.parser')
            for tag in page.find_all():
                if tag.attrs.get('data-testid') == 'vuln-cvssv3-base-score-link':
                    score_v3, severity_v3 = CVEcheckerTask._parse_severity_and_score(tag)
                elif tag.attrs.get('data-testid') == 'vuln-cvssv3-vector':
                    # prefix with CVSS:3.0 to preserve compatibility
                    vector_v3 = "CVSS:3.0/{}".format(CVEcheckerTask._parse_vector(tag))
                elif tag.attrs.get('data-testid') == 'vuln-cvssv2-base-score-link':
                    score_v2, severity_v2 = CVEcheckerTask._parse_severity_and_score(tag)
                elif tag.attrs.get('data-testid') == 'vuln-cvssv2-vector':
                    vector_v2 = CVEcheckerTask._parse_vector(tag)

            # Prefer CVSS v3.0 over v2
            score = score_v3 or score_v2
            severity = severity_v3 or severity_v2
            vector = vector_v3 or vector_v2

        return score, vector, severity

    @staticmethod
    def _filter_ossindex_fields(entry):
        """Create a result record for an ossindex entry."""
        score, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        result = {
            'id': entry.get('cve') or entry.get('title'),
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                'score': score,
                'vector': vector
            },
            'severity': severity
        }
        return result

    @staticmethod
    def _filter_victims_db_entry(entry):
        """Create a result record for a victims-cve-db entry."""
        if 'cve' not in entry:
            return None
        _, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        result = {
            'id': 'CVE-' + entry['cve'],
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                'score': entry.get('cvss_v3') or entry.get('cvss_v2'),
                'vector': vector
            },
            'severity': severity,
            'attribution': "https://github.com/victims/victims-cve-db, CC BY-SA 4.0, modified"
        }
        return result

    @staticmethod
    def query_url(url):
        """Query the url and return json."""
        response = requests.get(url)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _query_ossindex_package(ecosystem, name):
        """Get vulnerabilities for a given package ecosystem:name from OSS Index."""
        url = "https://ossindex.net/v2.0/package/{pm}/{package}".format(pm=ecosystem,
                                                                        package=name)
        return CVEcheckerTask.query_url(url)

    @staticmethod
    def query_ossindex_vulnerability_fromtill(ecosystem, from_time=0, till_time=-1):
        """From OSS Index get vulnerabilities which changed between from_time and till_time."""
        # OSS Index uses timestamps in milliseconds
        from_time = int(from_time * 1000)
        till_time = int(till_time * 1000)
        url = "https://ossindex.net/v2.0/vulnerability/pm/{pm}/fromtill/{from_time}/{till_time}"\
            .format(pm=ecosystem, from_time=from_time, till_time=till_time)
        packages = []
        while url:
            response = CVEcheckerTask.query_url(url)
            for package in response.get('packages', []):
                # Sanity check: the response always contains at least one entry,
                # even when it should be empty (e.g. when 'from_time' is higher
                # than the 'updated' time of all entries in the DB). Keep only
                # vulnerabilities updated at 'from_time' or later; build a new
                # list instead of removing items while iterating.
                package['vulnerabilities'] = [
                    vulnerability for vulnerability in package.get('vulnerabilities', [])
                    if int(vulnerability.get('updated')) >= from_time
                ]
                if package.get('vulnerabilities', []):
                    packages.append(package)
            url = response.get('next')
        return packages

    def _query_ossindex(self, arguments):
        """Query the OSS Index REST API."""
        entries = {}
        solver = get_ecosystem_solver(self.storage.get_ecosystem(arguments['ecosystem']),
                                      with_parser=OSSIndexDependencyParser())
        for package in self._query_ossindex_package(arguments['ecosystem'], arguments['name']):
            for vulnerability in package.get('vulnerabilities', []):
                for version_string in vulnerability.get('versions', []):
                    try:
                        affected_versions = solver.solve(
                            ["{} {}".format(arguments['name'], version_string)],
                            all_versions=True)
                    except Exception:
                        self.log.exception("Failed to resolve %r for %s:%s", version_string,
                                           arguments['ecosystem'], arguments['name'])
                        continue
                    if arguments['version'] in affected_versions.get(arguments['name'], []):
                        entry = self._filter_ossindex_fields(vulnerability)
                        if entry.get('id'):
                            entries[entry['id']] = entry

        return {'summary': list(entries.keys()),
                'status': 'success',
                'details': list(entries.values())}

    @staticmethod
    def update_victims_cve_db_on_s3():
        """Update the victims CVE DB on S3."""
        with VictimsDB.build_from_git() as db:
            db.store_on_s3()

    def _query_victims(self, arguments, ecosystem):
        """Check the EPV against VictimsDB."""
        db = None
        try:
            db = VictimsDB.from_s3()
            if not db:
                self.log.debug('No Victims CVE DB found on S3, cloning from GitHub')
                db = VictimsDB.build_from_git()
                db.store_on_s3()
            return db.get_vulnerabilities_for_epv(ecosystem, arguments['name'],
                                                  arguments['version'])
        finally:
            if db:
                db.close()

    def _victims_scan(self, arguments, ecosystem):
        """Run the victims CVE DB CLI."""
        results = {'summary': [], 'status': 'success', 'details': []}
        victims_cve_db_results = self._query_victims(arguments, ecosystem)
        for vulnerability in victims_cve_db_results:
            vulnerability = self._filter_victims_db_entry(vulnerability)
            if not vulnerability:
                continue
            if vulnerability['id'] not in results['summary']:
                results['summary'].append(vulnerability['id'])
            results['details'].append(vulnerability)
        return results

    def _nuget_scan(self, arguments):
        """Get vulnerability info about a given nuget package."""
        return self._query_ossindex(arguments)

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        rdb = StoragePool.get_connected_storage('BayesianPostgres')
        ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))
        if arguments['ecosystem'] in ('maven', 'pypi', 'npm'):
            return self._victims_scan(arguments, ecosystem)
        elif arguments['ecosystem'] == 'nuget':
            return self._nuget_scan(arguments)
        else:
            raise RequestError('Unsupported ecosystem')

class GithubTask(BaseTask):
    """Collects statistics using the GitHub API."""

    _analysis_name = "github_details"
    schema_ref = SchemaRef(_analysis_name, '1-0-4')

    # used for testing
    _repo_name = None
    _repo_url = None

    @classmethod
    def create_test_instance(cls, repo_name, repo_url):
        instance = super().create_test_instance()
        # set for testing, as we are not querying the DB for Mercator results
        instance._repo_name = repo_name
        instance._repo_url = repo_url
        return instance

    @staticmethod
    def _retry_no_cached(call, sleep_time=2, retry_count=10):
        """Deal with cached results from GitHub, which PyGithub does not check.

        See https://developer.github.com/v3/repos/statistics/#a-word-about-caching
        """
        result = None
        for _ in range(retry_count):
            result = call()
            if result:
                break
            time.sleep(sleep_time)
        return result

    @classmethod
    def _get_last_years_commits(cls, repo):
        activity = cls._retry_no_cached(repo.get_stats_commit_activity)
        if not activity:
            return []
        return [x.total for x in activity]

    @staticmethod
    def _rate_limit_exceeded(gh):
        return gh.rate_limiting[0] == 0

    @classmethod
    def _get_repo_stats(cls, repo):
        # len(list(...)) is a workaround for totalCount being None,
        # https://github.com/PyGithub/PyGithub/issues/415
        contributors = cls._retry_no_cached(repo.get_contributors)
        d = {'contributors_count': len(list(contributors)) if contributors is not None else 'N/A'}
        for prop in REPO_PROPS:
            d[prop] = repo.raw_data.get(prop, -1)
        return d

    def _get_repo_name(self, url):
        """Parse the GitHub repo name from a URL found by a preceding Mercator scan."""
        parsed = parse_gh_repo(url)
        if not parsed:
            self.log.debug('Could not parse GitHub repo URL %s', url)
        else:
            self._repo_url = 'https://github.com/' + parsed
        return parsed

    def _get_topics(self):
        if not self._repo_url:
            return []

        pop = requests.get(self._repo_url)
        poppage = BeautifulSoup(pop.text, 'html.parser')

        topics = []
        for link in poppage.find_all("a", class_="topic-tag"):
            topics.append(link.text.strip())

        return topics

    def execute(self, arguments):
        result_data = {'status': 'unknown',
                       'summary': [],
                       'details': {}}
        # For testing purposes, a repo may be specified at task creation time
        if self._repo_name is None:
            # Otherwise, get the repo name from earlier Mercator scan results
            self._repo_name = self._get_repo_name(arguments['url'])
            if self._repo_name is None:
                # Not a GitHub-hosted project
                return result_data

        token = self.configuration.GITHUB_TOKEN
        if not token:
            if self._rate_limit_exceeded(github.Github()):
                self.log.error("No GitHub API token provided (GITHUB_TOKEN env variable), "
                               "and rate limit exceeded! "
                               "Ending now to not wait endlessly")
                result_data['status'] = 'error'
                return result_data
            else:
                self.log.warning("No GitHub API token provided (GITHUB_TOKEN env variable), "
                                 "requests will be unauthenticated, "
                                 "i.e. limited to 60 per hour")
        else:
            # there might be more comma-separated tokens, randomly select one
            token = random.choice(token.split(',')).strip()

        gh = github.Github(login_or_token=token)
        try:
            repo = gh.get_repo(full_name_or_id=self._repo_name, lazy=False)
        except github.GithubException:
            self.log.error("Failed to get repo %s", self._repo_name)
            result_data['status'] = 'error'
            return result_data

        result_data['status'] = 'success'

        issues = {}
        # Get repo statistics
        notoriety = self._get_repo_stats(repo)
        if notoriety:
            issues.update(notoriety)
        issues['topics'] = self._get_topics()

        # Get commit statistics
        last_year_commits = self._get_last_years_commits(repo)
        commits = {'last_year_commits': {'sum': sum(last_year_commits),
                                         'weekly': last_year_commits}}
        issues.update(commits)
        result_data['details'] = issues

        return result_data

class StackAnalysesById(ResourceWithSchema):
    schema_ref = SchemaRef('stack_analyses', '2-1-3')

    def get(self, external_request_id):
        manifest_appstackid_map = {}
        try:
            results = rdb.session.query(StackAnalysisRequest)\
                .filter(StackAnalysisRequest.id == external_request_id)
            if results.count() <= 0:
                raise HTTPError(404, "Invalid request ID '{id}' received"
                                .format(id=external_request_id))
            row = results.first().to_dict()
            submitted_at = row["submitTime"]
            request_json = json.loads(row["requestJson"])
            for manifest in request_json["manifest"]:
                if manifest.get('appstack_id', 0):
                    manifest_appstackid_map[manifest["filename"]] = manifest["appstack_id"]
        except SQLAlchemyError:
            raise HTTPError(500, "Error fetching data for request ID '{id}'"
                            .format(id=external_request_id))

        try:
            results = rdb.session.query(WorkerResult)\
                .filter(WorkerResult.external_request_id == external_request_id,
                        WorkerResult.worker == "dependency_aggregator")
            if results.count() <= 0:
                raise HTTPError(202, "Analysis for request ID '{t}' is in progress"
                                .format(t=external_request_id))
        except SQLAlchemyError:
            raise HTTPError(500, "Worker result for request ID '{t}' doesn't exist yet"
                            .format(t=external_request_id))

        try:
            if results.count() > 0:
                result = results.first().to_dict()
                audit = result["task_result"]["_audit"]

                manifest_response = []
                # TODO: this will probably need some refactoring
                for manifest in result["task_result"]["result"]:
                    for component in manifest["components"]:
                        component["latest_version"] = safe_get_latest_version(
                            component["ecosystem"], component["name"])
                        component["dependents_count"] = get_dependents_count(
                            component["ecosystem"], component["name"],
                            component["version"], rdb.session)
                        rank = get_component_percentile_rank(component["ecosystem"],
                                                             component["name"],
                                                             component["version"],
                                                             rdb.session)
                        component["relative_usage"] = usage_rank2str(rank)
                    manifest_appstack_id = manifest_appstackid_map.get(manifest["manifest_name"], '')
                    if manifest_appstack_id != '':
                        url = current_app.config['BAYESIAN_ANALYTICS_URL']
                        endpoint = "{analytics_baseurl}/api/v1.0/recommendation/{appstack_id}"\
                            .format(analytics_baseurl=url, appstack_id=manifest_appstack_id)
                        resp = requests.get(endpoint)
                        if resp.status_code == 200:
                            recommendation = resp.json()

                            # Add the URIs of the stacks to the recommendation
                            if recommendation.get("input_stack", {}).get("appstack_id", "") != "":
                                uri = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}"\
                                    .format(analytics_baseurl=url,
                                            appstack_id=recommendation["input_stack"]["appstack_id"])
                                recommendation["input_stack"]["uri"] = uri

                            if recommendation.get("recommendations", {}).get("similar_stacks", "") != "":
                                for r in recommendation["recommendations"]["similar_stacks"]:
                                    if r["stack_id"] != "":
                                        r["uri"] = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}"\
                                            .format(analytics_baseurl=url,
                                                    appstack_id=r["stack_id"])
                            manifest["recommendation"] = recommendation
                        else:
                            current_app.logger.warning("{status}: {error}"
                                                       .format(status=resp.status_code,
                                                               error=resp.content))
                    manifest_response.append(manifest)
                response = {
                    "status": result["task_result"]["status"],
                    "submitted_at": submitted_at,
                    "started_at": audit["started_at"],
                    "finished_at": audit["ended_at"],
                    "request_id": result["external_request_id"],
                    "result": manifest_response
                }
                return response
        except Exception:
            raise HTTPError(500, "Error creating response for request {t}"
                            .format(t=external_request_id))

    def test_next_model(self):
        schema_ref = SchemaRef("example", "1-0-0")
        assert schema_ref.next_model() == SchemaRef("example", "2-0-0")
    def test_next_revision(self):
        schema_ref = SchemaRef("example", "1-0-0")
        assert schema_ref.next_revision() == SchemaRef("example", "1-1-0")
    def test_next_addition(self):
        schema_ref = SchemaRef("example", "1-0-0")
        assert schema_ref.next_addition() == SchemaRef("example", "1-0-1")

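# A minimal sketch of the SchemaRef behaviour exercised by the tests above,
# assuming a namedtuple-style type and "model-revision-addition" version
# strings. Whether next_model()/next_revision() reset the lower fields to zero
# is an assumption here -- the tests only exercise a "1-0-0" starting point,
# where resetting and not resetting coincide.

from collections import namedtuple


class SchemaRefSketch(namedtuple('SchemaRefSketch', 'name version')):
    def _bump(self, index):
        parts = [int(p) for p in self.version.split('-')]
        parts[index] += 1
        for i in range(index + 1, 3):
            parts[i] = 0  # assumed: lower fields reset on a higher-level bump
        return self._replace(version='-'.join(str(p) for p in parts))

    def next_model(self):
        return self._bump(0)

    def next_revision(self):
        return self._bump(1)

    def next_addition(self):
        return self._bump(2)


assert SchemaRefSketch('example', '1-0-0').next_model() == SchemaRefSketch('example', '2-0-0')
assert SchemaRefSketch('example', '1-0-0').next_revision() == SchemaRefSketch('example', '1-1-0')
assert SchemaRefSketch('example', '1-0-0').next_addition() == SchemaRefSketch('example', '1-0-1')
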
class LicenseCheckTask(BaseTask):
    """Check licenses of all files of a package."""

    _analysis_name = 'source_licenses'
    schema_ref = SchemaRef(_analysis_name, '3-0-0')

    @staticmethod
    def process_output(data):
        # we are not interested in these keys
        keys_to_remove = ['start_line', 'end_line', 'matched_rule', 'score', 'key']

        # 'files' is a list of file paths along with info about the licenses
        # detected in them. If the same license text appears in most files,
        # almost the same license info accompanies each file path. Therefore
        # transform the list into a dict of licenses (keys) carrying the info
        # about each license, plus the paths of the files where it was detected.
        licenses = {}
        for file in data.pop('files'):
            for _license in file['licenses']:
                # short_name becomes the key
                short_name = _license.pop('short_name')
                if short_name not in licenses:
                    for key in keys_to_remove:
                        del _license[key]
                    _license['paths'] = {file['path']}
                    licenses[short_name] = _license
                else:
                    licenses[short_name]['paths'].add(file['path'])
        for license_info in licenses.values():
            license_info['paths'] = list(license_info['paths'])  # set -> list
        data['licenses'] = licenses
        del data['scancode_options']
        return data

    @staticmethod
    def run_scancode(scan_path):
        result_data = {'status': 'unknown',
                       'summary': {},
                       'details': {}}
        command = [path.join(configuration.SCANCODE_PATH, 'scancode'),
                   # Scan for licenses
                   '--license',
                   # Do not return license matches with a score lower than this
                   '--license-score', configuration.SCANCODE_LICENSE_SCORE,
                   # Files without findings are omitted
                   '--only-findings',
                   # Use n parallel processes
                   '--processes', configuration.SCANCODE_PROCESSES,
                   # Do not print summary or progress messages
                   '--quiet',
                   # Strip the root directory segment of all paths
                   '--strip-root',
                   # Stop scanning a file if scanning takes longer than a timeout in seconds
                   '--timeout', configuration.SCANCODE_TIMEOUT,
                   scan_path]
        for ignore_pattern in configuration.SCANCODE_IGNORE:
            command += ['--ignore', '{}'.format(ignore_pattern)]
        with username():
            tc = TimedCommand(command)
            status, output, error = tc.run(is_json=True, timeout=1200)
        if status != 0:
            raise FatalTaskError("Error (%s) during running command %s: %r" %
                                 (str(status), command, error))

        details = LicenseCheckTask.process_output(output)
        result_data['details'] = details
        result_data['status'] = 'success'
        result_data['summary'] = {'sure_licenses': list(details['licenses'].keys())}
        return result_data

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        ver = arguments['version']
        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            if arguments['ecosystem'] != 'maven':
                self.log.error('Could not get sources for package {e}/{p}/{v}'
                               .format(e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on the binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        result_data = self.run_scancode(cache_path)
        return result_data

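# A small, made-up example of the pivot process_output performs: per-file
# license findings become a per-license mapping carrying the file paths. The
# field values below are illustrative, not real scancode output; only the keys
# process_output actually touches are included.

_scan = {
    'scancode_options': ['--license'],
    'files': [
        {'path': 'a.py',
         'licenses': [{'short_name': 'MIT', 'category': 'Permissive',
                       'start_line': 1, 'end_line': 5, 'matched_rule': 'r',
                       'score': 100, 'key': 'mit'}]},
        {'path': 'b.py',
         'licenses': [{'short_name': 'MIT', 'category': 'Permissive',
                       'start_line': 3, 'end_line': 8, 'matched_rule': 'r',
                       'score': 99, 'key': 'mit'}]},
    ],
}
_result = LicenseCheckTask.process_output(_scan)
assert _result['licenses']['MIT']['category'] == 'Permissive'
assert sorted(_result['licenses']['MIT']['paths']) == ['a.py', 'b.py']
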
class BlackDuckHub(object):
    """Provides access to the Black Duck Hub APIs."""

    # The authentication token is returned in a cookie with this name
    COOKIE_NAME = 'JSESSIONID'

    def __init__(self, url):
        self._url = url
        self._session = None

    @property
    def url(self):
        """URL of the Hub with trailing slash, e.g. `https://hub.blackducksoftware.com/`."""
        return self._url

    def _api(self, param):
        """Format a new API call.

        :param param: str, parameters to append to the base url
        :return: str, formatted API call
        """
        return "{}{}".format(self.url, param)

    def _api_get(self, param):
        """Perform a GET request against the API using the local `_session`.

        :param param: str, full request URL
        :return: requests.Response, a response object
        """
        return get(self._api(param),
                   cookies={self.COOKIE_NAME: self._session.api_token.token},
                   verify=False)

    def connect_session(self, username, password):
        """Establish a new session with the Hub using the provided credentials.

        :param username: str
        :param password: str
        :return: BlackDuckSession, a session object
        :raises: BlackDuckSessionException
        """
        req = post(self._api("j_spring_security_check"),
                   data={'j_username': username, 'j_password': password},
                   verify=False)
        if req.status_code != 204:
            raise BlackDuckSessionException("Black Duck authentication error")
        token = req.cookies.get(self.COOKIE_NAME)
        self._session = BlackDuckSession(BlackDuckApiToken(token))
        return self._session

    @needs_session
    def find_project(self, name):
        """Find a project by name.

        :param name: str, name of the project
        :return: BlackDuckProject, found project or `None`
        :raises: BlackDuckSessionException
        """
        preq = self._api_get('api/v1/projects?name=' + name)
        if preq.status_code == 200:
            pdata = preq.json()
            return BlackDuckProject(pdata)
        else:
            return None

    @needs_session
    @schema.result(SchemaRef("blackduck-project-list", "1-0-0"))
    def _list_projects_json(self):
        req = self._api_get('api/projects/')
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to list projects')

    def list_projects(self):
        """List all projects valid for the current session.

        :return: List[BlackDuckProject], list of projects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        names = [project['name'] for project in self._list_projects_json()]
        projects = []
        for name in names:
            projects.append(self.find_project(name))
        return projects

    @needs_session
    def get_releases(self, project_id):
        """Get all releases of the given project.

        :param project_id: BlackDuckProject or str, project reference or ID
        :return: Dict[str, BlackDuckRelease], a map of version strings to release objects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        if isinstance(project_id, BlackDuckProject):
            project_id = project_id.id
        req = self._api_get('api/v1/projects/{id}/version-summaries'.format(id=project_id))
        if req.status_code == 200:
            data = req.json()
            return {obj['version']: BlackDuckRelease(obj, project_id)
                    for obj in data['items']}
        else:
            raise BlackDuckException('Unable to fetch releases for ' + project_id)

    @needs_session
    @schema.result(SchemaRef("blackduck-vulnerable-bom", "1-0-0"))
    def get_release_bom_json(self, release_id):
        """Get the Bill of Materials for a specific release.

        :param release_id: BlackDuckRelease or str, release reference or ID
        :return: dict, the BOM JSON as a dictionary
        :raises: BlackDuckException, BlackDuckSessionException
        """
        release = release_id
        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id
        req = self._api_get('api/projects/{p}/versions/{i}/vulnerable-bom-components'
                            .format(i=release_id, p=release.project))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to fetch release information ' +
                                     release_id + " " + release.project)

    @needs_session
    def get_release_code_locations(self, release_id):
        """Get code locations for the given release.

        :param release_id: BlackDuckRelease or str, release reference or ID
        :return: dict, response json containing the retrieved code locations list
        :raises: BlackDuckException, BlackDuckSessionException
        """
        release = release_id
        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id
        req = self._api_get('api/projects/{p}/versions/{i}/codelocations'
                            .format(i=release_id, p=release.project))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to fetch code locations for {relid} {relproj}'
                                     .format(relid=release_id, relproj=release.project))

    @needs_session
    def get_code_location_scan_summary(self, location_id):
        """Get scan summary for the given code location ID.

        :param location_id: str
        :return: dict, the code location scan summary
        :raises: BlackDuckException, BlackDuckSessionException
        """
        req = self._api_get('api/codelocations/{locid}/scan-summaries'.format(locid=location_id))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to fetch scan summary for code location {locid}'
                                     .format(locid=location_id))
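# A hedged usage sketch of the session flow above (hub URL, credentials and
# project name are made up; BlackDuckHub and its collaborators come from this
# module). Every API call after connect_session() reuses the stored session
# cookie via the @needs_session-guarded methods, so the sketch is left as
# comments rather than live network calls:
#
#   hub = BlackDuckHub('https://hub.example.com/')
#   hub.connect_session('user', 'secret')          # obtains the JSESSIONID cookie
#   project = hub.find_project('my-project')       # None if the project is unknown
#   if project is not None:
#       releases = hub.get_releases(project)       # {'1.0.0': BlackDuckRelease, ...}
#       bom = hub.get_release_bom_json(releases['1.0.0'])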