def get_response(url, headers=None, sleep_time=2, retry_count=10):
    """Wrap requests.get() with retries and return the decoded JSON response.

    :param url: URL where to do the request
    :param headers: additional headers for request
    :param sleep_time: sleep time between retries
    :param retry_count: number of retries
    :return: content of response's json
    """
    try:
        for _ in range(retry_count):
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            if response.status_code == 204:
                # json() below would otherwise fail with JSONDecodeError
                raise HTTPError('No content')

            response = response.json()
            if response:
                return response

            time.sleep(sleep_time)
        else:
            raise TaskError("Number of retries exceeded")
    except HTTPError as err:
        message = "Failed to get results from {url} with {err}".format(url=url, err=err)
        logger.error(message)
        raise TaskError(message) from err
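# Usage sketch for get_response() (illustrative only, not part of the task code;
# it assumes this module already imports `requests`, `time`, a module-level
# `logger`, `TaskError`, and `requests.exceptions.HTTPError`; the URL and header
# below are made up):
#
#   tags = get_response(
#       url='https://api.github.com/repos/some-org/some-repo/tags',
#       headers={'Accept': 'application/vnd.github.v3+json'},
#       sleep_time=5,
#       retry_count=3)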
def create_repo_node_and_get_cve(self, github_repo, deps_list):
    """Create a repository node in the graphdb and create its edges to all deps.

    :param github_repo: repository url
    :param deps_list: dict with lists of direct and transitive dependencies,
           each given as an 'ecosystem:name:version' string
    :return: {}, gremlin_response
    """
    gremlin_str = (
        "repo=g.V().has('repo_url', '{repo_url}').tryNext().orElseGet{{"
        "graph.addVertex('vertex_label', 'Repo', 'repo_url', '{repo_url}')}};"
        "g.V(repo).outE('has_dependency').drop().iterate();"
        "g.V(repo).outE('has_transitive_dependency').drop().iterate();".format(
            repo_url=github_repo))

    # Create an edge between repo -> direct dependencies
    for pkg in deps_list.get('direct'):
        ecosystem = pkg.split(':')[0]
        version = pkg.split(':')[-1]
        name = pkg.replace(ecosystem + ':', '').replace(':' + version, '')
        gremlin_str += (
            "ver=g.V().has('pecosystem', '{ecosystem}').has('pname', '{name}')."
            "has('version', '{version}');ver.hasNext() && "
            "g.V(repo).next().addEdge('has_dependency', ver.next());".format(
                ecosystem=ecosystem, name=name, version=version))

    # Create an edge between repo -> transitive dependencies
    for pkg in deps_list.get('transitive'):
        ecosystem = pkg.split(':')[0]
        version = pkg.split(':')[-1]
        name = pkg.replace(ecosystem + ':', '').replace(':' + version, '')
        gremlin_str += (
            "ver=g.V().has('pecosystem', '{ecosystem}').has('pname', '{name}')."
            "has('version', '{version}');ver.hasNext() && "
            "g.V(repo).next().addEdge('has_transitive_dependency', ver.next());".format(
                ecosystem=ecosystem, name=name, version=version))

    # Traverse the Repo to Direct/Transitive dependencies that have CVE's and report them
    gremlin_str += (
        "g.V(repo).as('rp').outE('has_dependency','has_transitive_dependency')"
        ".as('ed').inV().as('epv').select('rp','ed','epv').by(valueMap(true));")

    payload = {"gremlin": gremlin_str}
    try:
        rawresp = requests.post(url=GREMLIN_SERVER_URL_REST, json=payload)
        resp = rawresp.json()
        self.log.info('######## Gremlin Response %r' % resp)
        if rawresp.status_code != 200:
            raise TaskError(
                "Error creating repository node for {repo_url} - "
                "{resp}".format(repo_url=github_repo, resp=resp))
    except Exception:
        self.log.error(traceback.format_exc())
        raise TaskError(
            "Error creating repository node for {repo_url}".format(
                repo_url=github_repo))

    return resp
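# Shape of deps_list as consumed above (a sketch inferred from the parsing
# logic, not an authoritative schema): each entry is an
# 'ecosystem:name:version' string, and the name itself may contain ':'
# separators (e.g. a maven 'groupId:artifactId'):
#
#   deps_list = {
#       'direct': ['maven:io.vertx:vertx-core:3.5.1'],
#       'transitive': ['maven:io.netty:netty-common:4.1.19.Final'],
#   }
#
# For the direct entry above, the loop yields ecosystem='maven',
# name='io.vertx:vertx-core' and version='3.5.1'.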
def _find_blackduck_cli_root(self):
    """Find the base directory where the BlackDuck CLI got extracted.

    :return: str, path to the CLI root
    """
    base = self.configuration.BLACKDUCK_PATH
    dirs = listdir(base)
    if not dirs:
        raise TaskError("Unable to find BlackDuck CLI directory")
    if len(dirs) > 1:
        raise TaskError("More than 1 BlackDuck CLI directory")
    return path.join(base, dirs.pop())
def clone(cls, url, path, depth=None, branch=None, single_branch=False):
    """Clone repository provided as url to specific path.

    :param url: str
    :param path: str
    :param depth: str
    :param branch: str
    :param single_branch: bool, only checkout single branch
    :return: instance of Git()
    """
    orig_url = url
    cls.config()
    # git clone doesn't understand urls starting with: git+ssh, git+http, git+https
    url = url2git_repo(url)
    cmd = ["git", "clone", url, path]
    if depth is not None:
        cmd.extend(["--depth", depth])
    if branch is not None:
        cmd.extend(["--branch", branch])
    if single_branch:
        cmd.extend(["--single-branch"])
    try:
        TimedCommand.get_command_output(cmd, graceful=False)
    except TaskError as exc:
        raise TaskError("Unable to clone: %s" % orig_url) from exc
    return cls(path=path)
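# Usage sketch for clone() (illustrative; the repository URL is made up, and
# `url2git_repo()` is assumed to translate 'git+https'-style URLs to plain
# git URLs):
#
#   repo = Git.clone(url='git+https://github.com/some-org/some-repo.git',
#                    path='/tmp/some-repo',
#                    depth='1',
#                    branch='master',
#                    single_branch=True)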
def get_command_output(args, graceful=True, is_json=False, **kwargs):
    """Improved version of subprocess.check_output.

    :param args: list of strings, command to run
    :param graceful: bool, if False, raise Exception when command fails
    :param is_json: bool, if True, return decoded json
    :return: list of strings, output which command emitted
    """
    logger.debug("running command %s", args)
    try:
        # Using universal_newlines mostly for the side-effect of decoding
        # the output as UTF-8 text on Python 3.x
        out = check_output(args, universal_newlines=True, **kwargs)
    except (CalledProcessError, TimeoutExpired) as ex:
        # TODO: we may want to use subprocess.Popen to be able to also print stderr here
        # (while not mixing it with stdout that is returned if the subprocess succeeds)
        if isinstance(ex, TimeoutExpired):
            logger.warning("command %s timed out:\n%s", args, ex.output)
        else:
            logger.warning("command %s ended with %s\n%s", args, ex.returncode, ex.output)

        if not graceful:
            logger.error("exception is fatal")
            raise TaskError("Error during running command %s: %r" % (args, ex.output))
        else:
            logger.debug("Ignoring because graceful flag is set")
        return []
    else:
        if is_json:
            # FIXME: some error handling here would be great
            return json.loads(out)
        else:
            return [f for f in out.split('\n') if f]  # py2 & 3 compat
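# Usage sketch for get_command_output() (illustrative; assumes it is exposed as
# a static method on TimedCommand, as the clone() snippet above calls it):
#
#   # list of stdout lines; an empty list on failure since graceful defaults to True
#   lines = TimedCommand.get_command_output(['git', '--version'])
#
#   # decoded JSON; raises TaskError on failure because graceful=False
#   data = TimedCommand.get_command_output(['npm', 'view', 'lodash', '--json'],
#                                          graceful=False, is_json=True)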
def extract_dependencies(github_repo, github_sha):
    """Extract the dependencies information.

    Currently assuming the repository is a maven/npm/python/Go repository.

    :param github_repo: repository url
    :param github_sha: commit hash
    :return: set of direct (and indirect) dependencies
    """
    with TemporaryDirectory() as workdir:
        repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
        repo.reset(revision=github_sha, hard=True)
        with cwd(repo.repo_path):
            # TODO: Make this task also work for files not present in root directory.

            # First change the package-lock.json to npm-shrinkwrap.json
            GithubDependencyTreeTask.change_package_lock_to_shrinkwrap()

            if peek(Path.cwd().glob("pom.xml")):
                return GithubDependencyTreeTask.get_maven_dependencies()
            elif peek(Path.cwd().glob("npm-shrinkwrap.json")) \
                    or peek(Path.cwd().glob("package.json")):
                return GithubDependencyTreeTask.get_npm_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("requirements.txt")):
                return GithubDependencyTreeTask.get_python_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("glide.lock")):
                return GithubDependencyTreeTask.get_go_glide_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("Gopkg.lock")):
                return GithubDependencyTreeTask.get_go_pkg_dependencies()
            else:
                raise TaskError("Please provide a maven, npm, python, "
                                "or Go repository for scanning!")
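# Usage sketch for extract_dependencies() (illustrative; the repository URL and
# commit hash are made up, and judging by the callers elsewhere in this section
# the returned set holds 'ecosystem:name:version' strings):
#
#   deps = GithubDependencyTreeTask.extract_dependencies(
#       github_repo='https://github.com/some-org/some-maven-project',
#       github_sha='0123456789abcdef0123456789abcdef01234567')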
def execute(self, arguments=None):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.info("Arguments passed from flow: {}".format(arguments))
    self._strict_assert(arguments.get('service_token'))
    github_repo = arguments.get('github_repo').strip()
    dependencies = []
    repo_cves = []
    if arguments.get('epv_list'):
        # self._strict_assert(arguments.get('epv_list'))
        for epv in arguments.get('epv_list'):
            dependencies.append('{ecosystem}:{package}:{version}'.format(
                ecosystem=epv.get('ecosystem'),
                package=epv.get('name'),
                version=epv.get('version')))
        self.log.info('######## Dependencies list: %r' % dependencies)
        try:
            repo_cves = self.get_cve(dependencies)
        except TaskError:
            raise TaskError('Failed to get CVEs')
    else:
        dependencies = GithubDependencyTreeTask.extract_dependencies(
            github_repo=github_repo, user_flow=True)
        self.log.info('######## Deps list %r' % dependencies)
        try:
            # Forward only the dependencies already available in the system. Unknown
            # dependencies are not going to be ingested for osioUserNotificationFlow.
            repo_cves = self.create_repo_node_and_get_cve(github_repo, dependencies)
            self.log.info('######## repo_cves %r' % repo_cves)
        except TaskError:
            raise TaskError('Failed to Create Repo Node')

    report = self.generate_report(repo_cves=repo_cves, deps_list=dependencies)
    return {
        'report': report,
        'service_token': arguments['service_token'],
        'dependencies': dependencies
    }
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: None, no data is saved
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))

    eco = arguments['ecosystem']
    pkg = arguments['name']

    homepage = self._get_project_homepage(eco, pkg)
    self.log.info('Registering project {e}/{p} to Anitya'.format(e=eco, p=pkg))
    res = self._create_anitya_project(eco, pkg, homepage)

    if res.status_code == 200:
        self.log.info(
            'Project {e}/{p} had already been registered to Anitya'.format(e=eco, p=pkg))
    elif res.status_code == 201:
        self.log.info(
            'Project {e}/{p} was successfully registered to Anitya'.format(e=eco, p=pkg))
    else:
        self.log.error(
            'Failed to create Anitya project {e}/{p}. Anitya response: {r}'.format(
                e=eco, p=pkg, r=res.text))
        return None
        # TODO: When we move to a proper workflow manager, we'll want to raise TaskError
        #   here instead of just logging an error. Right now we don't want a problem
        #   in AnityaTask to shut down the rest of analysis phases.
        # raise TaskError('Failed to create Anitya project {e}/{p}. Anitya response: {r}'.
        #                 format(e=eco, p=pkg, r=res.text))

    self.log.info('Project {e}/{p} created successfully'.format(e=eco, p=pkg))

    self.log.debug('About to add downstream mapping for %s/%s to Anitya' % (eco, pkg))
    distro_pkgs = {}
    distro_pkgs.update([self._get_downstream_rpm_pkgs(eco, pkg)])
    if self.storage.get_ecosystem(eco).is_backed_by(EcosystemBackend.maven):
        distro_pkgs.update([self._get_downstream_mvn_pkgs(eco, pkg)])

    for distro, package_names in distro_pkgs.items():
        for package_name in package_names:
            res = self._add_downstream_mapping(eco, pkg, distro, package_name)
            if res.status_code == 200:
                self.log.info('Downstream mapping %s/%s has already been added to project %s' %
                              (distro, package_name, pkg))
            elif res.status_code == 201:
                self.log.info('Downstream mapping %s/%s was added to project %s' %
                              (distro, package_name, pkg))
            else:
                raise TaskError('Failed to add downstream mapping %s/%s to project %s' %
                                (distro, package_name, pkg))

    # we don't want to save any data, so return None
    return None
def _resolve_dependency(ecosystem, dep):
    """Resolve a single dependency declaration to a concrete name and version.

    :param ecosystem: ecosystem the dependency belongs to
    :param dep: str, dependency declaration
    :return: dict with name, version and resolution metadata
    """
    ret = {
        'ecosystem': ecosystem.name,
        'declaration': dep,
        'resolved_at': json_serial(datetime.datetime.utcnow())
    }

    # first, if this is a Github dependency, return it right away (we don't resolve these yet)
    if ' ' in dep:
        # we have both package name and version (version can be an URL)
        name, spec = dep.split(' ', 1)
        if gh_dep.match(spec):
            ret['name'] = name
            ret['version'] = 'https://github.com/' + spec
        elif urllib.parse.urlparse(spec).scheme != '':
            ret['name'] = name
            ret['version'] = spec
    else:
        if gh_dep.match(dep):
            ret['name'] = 'https://github.com/' + dep
            ret['version'] = None
        elif urllib.parse.urlparse(dep).scheme != '':
            ret['name'] = dep
            ret['version'] = None
    if 'name' in ret:
        return ret

    # second, figure out what is the latest upstream version matching the spec and return it
    solver = get_ecosystem_solver(ecosystem)
    pkgspec = solver.solve([dep])

    if not pkgspec:
        raise TaskError("invalid dependency: {}".format(dep))

    package, version = pkgspec.popitem()
    if not version:
        raise TaskError("could not resolve {}".format(dep))

    ret['name'] = package
    ret['version'] = version
    return ret
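# Sketch of the three resolution paths above (inferred from the code, not an
# authoritative spec; `gh_dep` is assumed to be a regex matching
# 'owner/repo'-style GitHub references, and `eco` is whatever ecosystem object
# get_ecosystem_solver() accepts):
#
#   _resolve_dependency(eco, 'serve-static ^1.7.1')
#   # -> name='serve-static', version=<latest version matching the spec>
#
#   _resolve_dependency(eco, 'some-pkg some-org/some-repo')
#   # -> name='some-pkg', version='https://github.com/some-org/some-repo'
#
#   _resolve_dependency(eco, 'some-org/some-repo')
#   # -> name='https://github.com/some-org/some-repo', version=None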
def run_timed_command(cmd, file):
    """Run timed command and write output to file.

    :param cmd: command to run
    :param file: output file
    :return: None
    """
    timed_cmd = TimedCommand(cmd)
    status, output, _ = timed_cmd.run(timeout=3600)
    if status != 0 or not file.is_file():
        # all errors are in stdout, not stderr
        raise TaskError(output)
def clone(cls, url, path, timeout=300, depth=None, branch=None, single_branch=False):
    """Clone repository provided as url to specific path.

    :param url: str
    :param path: str
    :param timeout: int
    :param depth: str
    :param branch: str
    :param single_branch: bool, only checkout single branch
    :return: instance of Git()
    """
    orig_url = url
    # git clone doesn't understand urls starting with: git+ssh, git+http, git+https
    url = url2git_repo(url)
    orig_path = path
    path = Path(path)
    mode = 0
    if path.is_dir():
        mode = path.stat().st_mode

    cmd = ["git", "clone", url, orig_path]
    if depth is not None:
        cmd.extend(["--depth", depth])
    if branch is not None:
        cmd.extend(["--branch", branch])
    if single_branch:
        cmd.extend(["--single-branch"])
    try:
        cls.config()
        TimedCommand.get_command_output(cmd, graceful=False, timeout=timeout)
    except TaskError as exc:
        if not path.is_dir() and mode:
            # 'git clone repo dir/' deletes (no way to turn this off) dir/ if cloning fails.
            # This might confuse caller of this method, so we recreate the dir on error here.
            try:
                path.mkdir(mode)
            except OSError:
                logger.error("Unable to re-create dir: %s", str(path))
        raise TaskError("Unable to clone: %s" % orig_url) from exc

    return cls(path=orig_path)
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    hub = self._get_hub()

    self.log.info('Determining if data is already available at BD Hub ...')
    if not self._data_ready(hub, self._get_project_name(arguments), arguments['version']):
        self.log.info('Data not available yet at BD Hub, retrying ...')
        raise BlackDuckDataNotReady(self._get_project_name(arguments), arguments['version'])

    self.log.info('Data is available at BD Hub, extracting ...')
    data = super().execute(arguments)
    if not data['details']:
        raise TaskError("No data from Hub")
    return data
def _collect_dependencies(self):
    """Return all dependencies for current analysis flow (operates on parent mercator result).

    :return: List[str], list of dependencies
    """
    wr = self.parent_task_result('metadata')
    if not isinstance(wr, dict):
        raise TaskError('metadata task result has unexpected type: {}; expected dict'.
                        format(type(wr)))

    # there can be details about multiple manifests in the metadata,
    # therefore we will collect dependency specifications from all of them
    # and exclude obvious duplicates along the way
    dependencies = list({dep for m in wr.get('details', []) if m.get('dependencies')
                         for dep in m.get('dependencies', [])})
    return dependencies
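# Minimal illustration of the de-duplicating set comprehension above, using a
# made-up mercator-style result:
#
#   wr = {'details': [{'dependencies': ['lodash ^4.0.0', 'express ^4.16.0']},
#                     {'dependencies': ['lodash ^4.0.0']},
#                     {'name': 'manifest-without-dependencies'}]}
#   # the comprehension yields {'lodash ^4.0.0', 'express ^4.16.0'}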
def extract_dependencies(github_repo, github_sha):
    """Extract the dependencies information.

    Currently assuming the repository is a maven repository.

    :param github_repo: repository url
    :param github_sha: commit hash
    :return: set of direct (and indirect) dependencies
    """
    with TemporaryDirectory() as workdir:
        repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
        repo.reset(revision=github_sha, hard=True)
        with cwd(repo.repo_path):
            output_file = Path.cwd() / "dependency-tree.txt"
            cmd = ["mvn",
                   "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                   "-DoutputType=dot",
                   "-DoutputFile={filename}".format(filename=output_file),
                   "-DappendOutput=true"]
            timed_cmd = TimedCommand(cmd)
            status, output, _ = timed_cmd.run(timeout=3600)
            if status != 0 or not output_file.is_file():
                # all errors are in stdout, not stderr
                raise TaskError(output)
            with output_file.open() as f:
                return GithubDependencyTreeTask.parse_maven_dependency_tree(f.readlines())
def get_maven_dependencies():
    """Get direct and indirect dependencies from pom.xml by using maven dependency tree plugin.

    :return: set of direct and indirect dependencies
    """
    output_file = Path.cwd() / "dependency-tree.txt"
    cmd = ["mvn",
           "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
           "-DoutputType=dot",
           "-DoutputFile={filename}".format(filename=output_file),
           "-DappendOutput=true"]
    timed_cmd = TimedCommand(cmd)
    status, output, _ = timed_cmd.run(timeout=3600)
    if status != 0 or not output_file.is_file():
        # all errors are in stdout, not stderr
        raise TaskError(output)
    with output_file.open() as f:
        return GithubDependencyTreeTask.parse_maven_dependency_tree(f.readlines())
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    tool_responses = {}
    result_summary = {
        'package_names': [],
        'registered_srpms': [],
        'all_rhn_channels': [],
        'all_rhsm_content_sets': [],
        'all_rhsm_product_names': []
    }
    result_data = {'status': 'error',
                   'summary': result_summary,
                   'details': tool_responses
                   }

    # bail out early; we need access to internal services or the package is
    # from Maven ecosystem, otherwise we can't comment on downstream usage
    is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
    if not self._is_inside_rh() and not is_maven:
        return result_data

    self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
    res = self._fetch_anitya_project(eco, pkg)
    anitya_rpm_names = []
    anitya_mvn_names = []
    if res is None:
        result_data['status'] = 'error'
    elif res.status_code == 200:
        self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
        anitya_response = res.json()
        tool_responses['redhat_anitya'] = anitya_response
        # For now, we assume all downstreams are ones we care about
        for entry in anitya_response['packages']:
            if entry['distro'] == RH_RPM_DISTRO_NAME:
                anitya_rpm_names.append(entry['package_name'])
            elif entry['distro'] == RH_MVN_DISTRO_NAME:
                anitya_mvn_names.append(entry['package_name'])
            else:
                self.log.warning(
                    'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                    format(d=entry['distro'], o=entry['package_name'], p=pkg)
                )
        self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
        self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
        # TODO: Report 'partial' here and switch to 'success' at the end
        result_data['status'] = 'success'
    else:
        msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
        self.log.error(msg.format(e=eco, p=pkg, r=res.text))
        result_data['status'] = 'error'

    if self._is_inside_rh():
        # we have candidate downstream name mappings, check them against Brew
        seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
        self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

        args = ['brew-utils-cli', '--version', arguments['version']]
        artifact_hash = self._get_artifact_hash(algorithm='sha256')
        if artifact_hash:
            args += ['--digest', artifact_hash]
        args += seed_names

        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._BREWUTILS_CLI_TIMEOUT, cmd=args))
        tc = TimedCommand(args)
        status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
        self.log.debug("status = %s, error = %s", status, error)
        output = ''.join(output)
        self.log.debug("output = %s", output)
        if not output:
            raise TaskError("Error running command %s" % args)
        brew = json.loads(output)

        result_summary['package_names'] = brew['packages']
        result_summary['registered_srpms'] = brew['response']['registered_srpms']
        tool_responses['brew'] = brew['response']['brew']

        # we have SRPM details, fetch details on where the RPMs are shipped
        tool_responses['pulp_cdn'] = pulp_responses = []
        rhn_channels = set()
        rhsm_content_sets = set()
        rhsm_product_names = set()
        for srpm_summary in result_summary['registered_srpms']:
            srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                         v=srpm_summary['version'],
                                                         r=srpm_summary['release'])
            cdn_metadata = self._get_cdn_metadata(srpm_filename)
            if cdn_metadata is None:
                msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                continue
            pulp_responses.append(cdn_metadata)
            srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
            rhn_channels.update(cdn_metadata['rhn_channels'])
            rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
            rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
        result_summary['all_rhn_channels'] = sorted(rhn_channels)
        result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
        result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

    self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

    return result_data