def extract_dependencies(github_repo, github_sha):
    """Extract the dependencies information.

    Currently assuming repository is maven repository.
    """
    with TemporaryDirectory() as workdir:
        repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
        repo.reset(revision=github_sha, hard=True)
        with cwd(repo.repo_path):
            output_file = Path.cwd() / "dependency-tree.txt"
            cmd = ["mvn",
                   "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                   "-DoutputType=dot",
                   "-DoutputFile={filename}".format(filename=output_file),
                   "-DappendOutput=true"]
            timed_cmd = TimedCommand(cmd)
            status, output, _ = timed_cmd.run(timeout=3600)
            if status != 0 or not output_file.is_file():
                # all errors are in stdout, not stderr
                raise TaskError(output)
            with output_file.open() as f:
                return GithubDependencyTreeTask.parse_maven_dependency_tree(f.readlines())

def archive(self, basename, sub_path=None):
    """Create an archive; simply calls `git archive`.

    :param basename: str, name of the resulting archive, without file extension (suffix)
    :param sub_path: str, only add files found under this path to the archive;
                     default: add all files from the repository (.git/ is always excluded)
    :return: str, filename
    """
    suffix = "tar.gz"
    filename = basename + "." + suffix
    with cwd(self.repo_path):
        cmd = [
            "git", "archive",
            "--format={}".format(suffix),
            "--output={}".format(filename),
            "HEAD"
        ]
        if sub_path:
            cmd.append(sub_path)
        TimedCommand.get_command_output(cmd)
    return filename

def rev_parse(self, args=None):
    """Run git rev-parse.

    :param args: arguments to pass to `git rev-parse`
    :return: [str], output from `git rev-parse`
    """
    cmd = ["git", "rev-parse"]
    if args:
        cmd.extend(args)
    with cwd(self.repo_path):
        return TimedCommand.get_command_output(cmd, graceful=False)

def get_maven_dependencies():
    """Get direct and indirect dependencies from pom.xml by using maven dependency tree plugin.

    :return: set of direct and indirect dependencies
    """
    output_file = Path.cwd() / "dependency-tree.txt"
    cmd = [
        "mvn",
        "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
        "-DoutputType=dot",
        "-DoutputFile={filename}".format(filename=output_file),
        "-DappendOutput=true"
    ]
    timed_cmd = TimedCommand(cmd)
    status, output, _ = timed_cmd.run(timeout=3600)
    if status != 0 or not output_file.is_file():
        # all errors are in stdout, not stderr
        raise TaskError(output)
    with output_file.open() as f:
        return GithubDependencyTreeTask.parse_maven_dependency_tree(
            f.readlines())

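# Illustrative sketch (not from the original sources): the dependency-tree.txt file
# produced by `-DoutputType=dot` above contains GraphViz digraph lines roughly like the
# ones below, and that list of lines is what parse_maven_dependency_tree() receives.
# The coordinates here are made up for illustration.
#
#   digraph "foo.bar:baz:jar:1.0.0" {
#       "foo.bar:baz:jar:1.0.0" -> "junit:junit:jar:4.12:test" ;
#       "foo.bar:baz:jar:1.0.0" -> "org.slf4j:slf4j-api:jar:1.7.25:compile" ;
#   }
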
def archive(self, basename, basedir=None, sub_path=None, format="tar.gz"):
    """Create an archive; simply calls `git archive`.

    :param basename: str, name of the resulting archive, without file extension (suffix)
    :param basedir: str, path to a directory where to store the resulting archive
    :param sub_path: str, only add files found under this path to the archive;
                     default: add all files from the repository (.git/ is always excluded)
    :param format: str, format of the resulting archive, default: 'tar.gz'
    :return: str, filename
    """
    filename = os.path.join(basedir or "", basename + "." + format)
    with cwd(self.repo_path):
        cmd = [
            "git", "archive",
            "--format={}".format(format),
            "--output={}".format(filename),
            "HEAD"
        ]
        if sub_path:
            cmd.append(sub_path)
        TimedCommand.get_command_output(cmd)
    return filename

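# Hedged usage sketch for archive() above (not part of the original sources).
# The import path is an assumption; adjust it to wherever the Git wrapper lives.
#
#   from f8a_worker.process import Git  # assumed location
#
#   repo = Git.clone(url="https://github.com/org/project.git", path="/tmp/project")
#   tarball = repo.archive(basename="project-sources", basedir="/tmp")
#   # tarball == "/tmp/project-sources.tar.gz"
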
def clone(cls, url, path, timeout=300, depth=None, branch=None, single_branch=False):
    """Clone repository provided as url to specific path.

    :param url: str
    :param path: str
    :param timeout: int
    :param depth: str
    :param branch: str
    :param single_branch: bool, fetch history of only the selected branch
    :return: instance of Git()
    """
    orig_url = url
    # git clone doesn't understand urls starting with: git+ssh, git+http, git+https
    url = url2git_repo(url)

    orig_path = path
    path = Path(path)
    mode = 0
    if path.is_dir():
        mode = path.stat().st_mode

    cmd = ["git", "clone", url, orig_path]
    if depth is not None:
        cmd.extend(["--depth", depth])
    if branch is not None:
        cmd.extend(["--branch", branch])
    if single_branch:
        cmd.extend(["--single-branch"])

    try:
        cls.config()
        TimedCommand.get_command_output(cmd, graceful=False, timeout=timeout)
    except TaskError as exc:
        if not path.is_dir() and mode:
            # 'git clone repo dir/' deletes (no way to turn this off) dir/ if cloning fails.
            # This might confuse caller of this method, so we recreate the dir on error here.
            try:
                path.mkdir(mode)
            except OSError:
                logger.error("Unable to re-create dir: %s", str(path))
        raise TaskError("Unable to clone: %s" % orig_url) from exc

    return cls(path=orig_path)

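# Hedged usage sketch for clone() above (not part of the original sources): a shallow,
# single-branch clone. The import path is an assumption; depth is passed as a string,
# as documented in the method's docstring.
#
#   from f8a_worker.process import Git  # assumed location
#
#   repo = Git.clone(url="git+https://github.com/org/project.git",
#                    path="/tmp/project",
#                    depth="1",
#                    branch="master",
#                    single_branch=True)
#   repo.rev_parse(["HEAD"])  # e.g. ['<commit sha>']
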
def execute(self, arguments):
    """Run binwalk signature scan on every file in the source tarball."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
    results = []
    for path in get_all_files_from(cache_path, path_filter=skip_git_files):
        self.log.debug("path = %s", path)

        bw = TimedCommand(['binwalk', '-B', path])
        status, output, error = bw.run(timeout=60)
        self.log.debug("status = %s, error = %s", status, error)
        self.log.debug("output = %s", output)

        parsed_binwalk = self.parse_binwalk(output)
        results.append({
            "path": os.path.relpath(path, cache_path),
            "output": parsed_binwalk,
        })
    return {'summary': [], 'status': 'success', 'details': results}

def update_depcheck_db_on_s3():
    """Update OWASP Dependency-check DB on S3."""
    s3 = StoragePool.get_connected_storage('S3VulnDB')
    depcheck = configuration.dependency_check_script_path
    with TemporaryDirectory() as temp_data_dir:
        s3.retrieve_depcheck_db_if_exists(temp_data_dir)
        old_java_opts = os.getenv('JAVA_OPTS', '')
        os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
        # give DependencyCheck 25 minutes to download the DB
        if TimedCommand.get_command_output([depcheck, '--updateonly', '--data', temp_data_dir],
                                           timeout=1500):
            s3.store_depcheck_db(temp_data_dir)
        os.environ['JAVA_OPTS'] = old_java_opts

def _generate_pom_xml(to_solve):
    """Create pom.xml with dependencies from to_solve.

    And run 'mvn versions:resolve-ranges',
    which resolves the version ranges (overwrites the pom.xml).

    :param to_solve: {"groupId:artifactId": "version-range"}
    """
    project = etree.Element('project')
    etree.SubElement(project, 'modelVersion').text = '4.0.0'
    etree.SubElement(project, 'groupId').text = 'foo.bar.baz'
    etree.SubElement(project, 'artifactId').text = 'testing'
    etree.SubElement(project, 'version').text = '1.0.0'
    dependencies = etree.SubElement(project, 'dependencies')
    for name, version_range in to_solve.items():
        group_id, artifact_id = name.rstrip(':').split(':')
        dependency = etree.SubElement(dependencies, 'dependency')
        etree.SubElement(dependency, 'groupId').text = group_id
        etree.SubElement(dependency, 'artifactId').text = artifact_id
        etree.SubElement(dependency, 'version').text = version_range
    with open('pom.xml', 'wb') as pom:
        pom.write(etree.tostring(project, xml_declaration=True, pretty_print=True))
    TimedCommand.get_command_output(['mvn', 'versions:resolve-ranges'], graceful=False)

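# Hedged usage sketch for _generate_pom_xml() above (not part of the original sources).
# It writes pom.xml into the current working directory, so it should run inside a
# scratch directory; the version-range syntax is standard Maven.
#
#   to_solve = {"org.apache.commons:commons-lang3": "[3.0,3.9)"}
#   with TemporaryDirectory() as workdir:
#       with cwd(workdir):
#           _generate_pom_xml(to_solve)
#           # pom.xml now holds concrete versions resolved by 'mvn versions:resolve-ranges'
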
def config():
    """Configure git."""
    user_name = configuration.GIT_USER_NAME
    user_email = configuration.GIT_USER_EMAIL
    if not TimedCommand.get_command_output(["git", "config", "--get", "user.name"]):
        TimedCommand.get_command_output(
            ["git", "config", "--global", "user.name", user_name])
    if not TimedCommand.get_command_output(["git", "config", "--get", "user.email"]):
        TimedCommand.get_command_output(
            ["git", "config", "--global", "user.email", user_email])
    # Use 'true' as external program to ask for credentials, i.e. don't ask
    # Better would be GIT_TERMINAL_PROMPT=0, but that requires git >= 2.3
    TimedCommand.get_command_output(
        ["git", "config", "--global", "core.askpass", "/usr/bin/true"])

def fetch_rubygems_artifact(name, version, target_dir):
    """Fetch rubygems artifact using the 'gem fetch' command and extract it."""
    git = Git.create_git(target_dir)
    logger.info("downloading rubygems package %s-%s", name, version)
    version_arg = []
    if version:
        version_arg = ['--version', version]
    gem_command = ['gem', 'fetch', name]
    gem_command.extend(version_arg)
    with cwd(target_dir):
        TimedCommand.get_command_output(gem_command, graceful=False)

    if not version:
        # if version is None we need to glob for the version that was downloaded
        artifact_path = os.path.abspath(
            glob.glob(os.path.join(target_dir, name + '*')).pop())
    else:
        artifact_path = os.path.join(
            target_dir, '{n}-{v}.gem'.format(n=name, v=version))

    digest = compute_digest(artifact_path)
    Archive.extract(artifact_path, target_dir)
    git.add_and_commit_everything()
    return digest, artifact_path

def run_gofedlib(self, topdir, name, version, timeout):
    """Run gofedlib-cli to extract dependencies from golang sources."""
    tc = TimedCommand([
        'gofedlib-cli', '--dependencies-main', '--dependencies-packages',
        '--dependencies-test', '--skip-errors', topdir
    ])
    status, data, err = tc.run(timeout=timeout)
    if status:
        raise FatalTaskError('gofedlib-cli failed: {err}'.format(err=err))

    result = json.loads(data[0])
    main_deps_count = len(result.get('deps-main', []))
    packages_count = len(result.get('deps-packages', []))
    self.log.debug('gofedlib found %i dependencies',
                   main_deps_count + packages_count)

    result['code_repository'] = {
        'type': 'git',
        'url': 'https://{name}'.format(name=name)
    }
    result['name'] = name
    result['version'] = version
    return [{'ecosystem': 'gofedlib', 'result': result}]

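# Illustrative sketch (not from the original sources) of the item returned by
# run_gofedlib() above, based only on the keys the method itself reads and sets;
# the dependency values are made up.
#
#   [{'ecosystem': 'gofedlib',
#     'result': {'deps-main': ['github.com/gorilla/mux'],
#                'deps-packages': [],
#                'code_repository': {'type': 'git',
#                                    'url': 'https://github.com/org/project'},
#                'name': 'github.com/org/project',
#                'version': 'v1.0.0'}}]
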
def ls_remote(repository, refs=None, args=None):
    """Get output of `git ls-remote <args> <repo> <refs>` command.

    :param repository: str, remote git repository
    :param refs: list, list of git references
    :param args: list, list of additional arguments for the command
    :return: command output
    """
    cmd = ["git", "ls-remote"]
    if args:
        cmd.extend(args)
    cmd.append(repository)
    if refs:
        cmd.extend(refs)
    return TimedCommand.get_command_output(cmd, graceful=False)

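# Hedged usage sketch for ls_remote() above (not part of the original sources),
# assuming it is exposed as a static method on the Git wrapper: list remote tags
# without cloning.
#
#   Git.ls_remote("https://github.com/org/project.git",
#                 args=["--tags"],
#                 refs=["v1.*"])
#   # -> list of '<sha>\trefs/tags/...' output lines from `git ls-remote`
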
def extract_gem(target, dest):
    """Extract target gem and gemspec.

    Gem into $dest/sources
    Gemspec (renamed to rubygems-metadata.yaml) into $dest/metadata/
    """
    sources = os.path.join(dest, 'sources')
    metadata = os.path.join(dest, 'metadata')
    TimedCommand.get_command_output(['mkdir', '-p', sources, metadata])
    TimedCommand.get_command_output(['gem', 'unpack', target, '--target', sources])
    with cwd(metadata):
        # --spec ignores --target, so we need to cwd
        TimedCommand.get_command_output(['gem', 'unpack', target, '--spec'])
        metadatayaml = glob.glob('*.gemspec').pop()
        os.rename(metadatayaml, 'rubygems-metadata.yaml')

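# Hedged usage sketch for extract_gem() above (not part of the original sources);
# the gem name and paths are made up.
#
#   extract_gem("/tmp/downloads/rails-6.0.0.gem", "/tmp/unpacked")
#   # /tmp/unpacked/sources/...                      <- unpacked gem contents
#   # /tmp/unpacked/metadata/rubygems-metadata.yaml  <- renamed gemspec
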
def execute(self, arguments):
    """Run oscryptocatcher on the extracted source tarball."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
    results = {'status': 'unknown', 'summary': {}, 'details': []}
    try:
        oscc = TimedCommand.get_command_output(
            ['oscryptocatcher', '--subdir-in-result', cache_path],
            graceful=False, is_json=True)
        self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
        results['details'] = oscc['details']
        results['summary'] = oscc['summary']
        results['status'] = 'success'
    except Exception:
        results['status'] = 'error'

    return results

def run_mercator(self, arguments, cache_path, keep_path=False, outermost_only=True,
                 timeout=300, resolve_poms=True):
    """Run mercator tool."""
    # TODO: reduce cyclomatic complexity
    result_data = {'status': 'unknown', 'summary': [], 'details': []}
    mercator_target = arguments.get('cache_sources_path', cache_path)

    tc = TimedCommand(['mercator', mercator_target])
    update_env = {'MERCATOR_JAVA_RESOLVE_POMS': 'true'} if resolve_poms else {}
    status, data, err = tc.run(timeout=timeout, is_json=True, update_env=update_env)
    if status != 0:
        self.log.error(err)
        raise FatalTaskError(err)

    ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
    if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
        # TODO: attempt static setup.py parsing with mercator
        items = [self._merge_python_items(mercator_target, data)]
        if items == [None]:
            raise NotABugFatalTaskError(
                'Found no usable PKG-INFO/metadata.json/requirements.txt')
    else:
        if outermost_only:
            # process only root level manifests (or the ones closest to the root level)
            items = self._data_normalizer.get_outermost_items(data.get('items') or [])
        else:
            items = data.get('items') or []
        self.log.debug('mercator found %i projects, outermost %i',
                       len(data), len(items))

        if ecosystem_object.is_backed_by(EcosystemBackend.maven):
            # for maven we download both Jar and POM, we consider POM to be *the*
            # source of information and don't want to duplicate info by including
            # data from pom included in artifact (assuming it's included)
            items = [d for d in items if d['ecosystem'].lower() == 'java-pom']
        elif ecosystem_object.is_backed_by(EcosystemBackend.npm):
            # ignore other metadata files, e.g. requirements.txt
            items = [d for d in items if d['ecosystem'].lower() == 'npm']
        elif arguments['ecosystem'] == 'go':
            items = [d for d in items if d['ecosystem'].lower() == 'go-glide']
            if not items:
                # Mercator found no Go Glide files, run gofedlib
                items = self.run_gofedlib(topdir=mercator_target,
                                          name=arguments.get('name'),
                                          version=arguments.get('version'),
                                          timeout=timeout)

    result_data['details'] = [self._data_normalizer.handle_data(d, keep_path=keep_path)
                              for d in items]
    result_data['status'] = 'success'
    return result_data

def fix_permissions(target):
    """Fix extracted folder permissions, so it will be readable for user."""
    TimedCommand.get_command_output(['chmod', "-R", "u+rwx", target])

def extract_tar(target, dest): """Extract target tarball into dest using system 'tar' command.""" TimedCommand.get_command_output( ['tar', "--delay-directory-restore", '-xf', target, '-C', dest])
def run_mercator(self, arguments, cache_path, keep_path=False, outermost_only=True,
                 timeout=300, resolve_poms=True):
    """Run mercator tool; Go sources are handled by gofedlib-cli instead."""
    result_data = {'status': 'unknown', 'summary': [], 'details': []}
    mercator_target = arguments.get('cache_sources_path', cache_path)

    if arguments['ecosystem'] == 'go':
        # no Go support in Mercator-go yet, we handle it separately here
        tc = TimedCommand([
            'gofedlib-cli', '--dependencies-main', '--dependencies-packages',
            '--dependencies-test', '--skip-errors', mercator_target
        ])
        status, data, err = tc.run(timeout=timeout)
    else:
        tc = TimedCommand(['mercator', mercator_target])
        update_env = {'MERCATOR_JAVA_RESOLVE_POMS': 'true'} if resolve_poms else {}
        status, data, err = tc.run(timeout=timeout, is_json=True, update_env=update_env)

    if status != 0:
        self.log.error(err)
        raise FatalTaskError(err)

    ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
    if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
        # TODO: attempt static setup.py parsing with mercator
        items = [self._merge_python_items(mercator_target, data)]
    elif arguments['ecosystem'] == 'go':
        result = {'result': json.loads(data[0])}  # data normalizer expects this
        result['ecosystem'] = 'gofedlib'
        # we only support git now
        result['result']['code_repository'] = {
            'type': 'git',
            'url': 'https://{name}'.format(name=arguments.get('name'))
        }
        result['result']['name'] = arguments.get('name')
        result['result']['version'] = arguments.get('version')
        items = [result]
        main_deps_count = len(result['result'].get('deps-main', []))
        packages_count = len(result['result'].get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies',
                       main_deps_count + packages_count)
    else:
        if outermost_only:
            # process only root level manifests (or the ones closest to the root level)
            items = self._data_normalizer.get_outermost_items(data.get('items') or [])
        else:
            items = data.get('items') or []
        self.log.debug('mercator found %i projects, outermost %i',
                       len(data), len(items))

        if ecosystem_object.is_backed_by(EcosystemBackend.maven):
            # for maven we download both Jar and POM, we consider POM to be *the*
            # source of information and don't want to duplicate info by including
            # data from pom included in artifact (assuming it's included)
            items = [d for d in items if d['ecosystem'].lower() == 'java-pom']

    result_data['details'] = [self._data_normalizer.handle_data(d, keep_path=keep_path)
                              for d in items]
    result_data['status'] = 'success'
    return result_data

def _run_owasp_dep_check(self, scan_path, experimental=False):
    """Run OWASP Dependency-Check."""
    def _clean_dep_check_tmp():
        for dcdir in glob(os.path.join(gettempdir(), 'dctemp*')):
            rmtree(dcdir)

    s3 = StoragePool.get_connected_storage('S3VulnDB')
    depcheck = configuration.dependency_check_script_path
    with TemporaryDirectory() as temp_data_dir:
        if not s3.retrieve_depcheck_db_if_exists(temp_data_dir):
            self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
            self.update_depcheck_db_on_s3()
            s3.retrieve_depcheck_db_if_exists(temp_data_dir)

        report_path = os.path.join(temp_data_dir, 'report.xml')
        command = [depcheck,
                   '--noupdate',
                   '--format', 'XML',
                   '--project', 'CVEcheckerTask',
                   '--data', temp_data_dir,
                   '--scan', scan_path,
                   '--out', report_path]
        if experimental:
            command.extend(['--enableExperimental'])
        for suppress_xml in glob(os.path.join(os.environ['OWASP_DEP_CHECK_SUPPRESS_PATH'],
                                              '*.xml')):
            command.extend(['--suppress', suppress_xml])

        output = []
        old_java_opts = os.getenv('JAVA_OPTS', '')
        try:
            self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities'
                           % scan_path)
            os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
            output = TimedCommand.get_command_output(command,
                                                     graceful=False,
                                                     timeout=600)  # 10 minutes
            with open(report_path) as r:
                report_dict = anymarkup.parse(r.read())
        except (TaskError, FileNotFoundError) as e:
            _clean_dep_check_tmp()
            for line in output:
                self.log.warning(line)
            self.log.exception(str(e))
            raise FatalTaskError('OWASP Dependency-Check scan failed') from e
        finally:
            os.environ['JAVA_OPTS'] = old_java_opts
        _clean_dep_check_tmp()

    results = []
    dependencies = report_dict.get('analysis', {}).get('dependencies')  # value can be None
    dependencies = dependencies.get('dependency', []) if dependencies else []
    if not isinstance(dependencies, list):
        dependencies = [dependencies]
    for dependency in dependencies:
        vulnerabilities = dependency.get('vulnerabilities')  # value can be None
        vulnerabilities = vulnerabilities.get('vulnerability', []) if vulnerabilities else []
        if not isinstance(vulnerabilities, list):
            vulnerabilities = [vulnerabilities]
        for vulnerability in vulnerabilities:
            av = vulnerability.get('cvssAccessVector')
            av = av[0] if av else '?'
            ac = vulnerability.get('cvssAccessComplexity')
            ac = ac[0] if ac else '?'
            au = vulnerability.get('cvssAuthenticationr')
            au = au[0] if au else '?'
            c = vulnerability.get('cvssConfidentialImpact')
            c = c[0] if c else '?'
            i = vulnerability.get('cvssIntegrityImpact')
            i = i[0] if i else '?'
            a = vulnerability.get('cvssAvailabilityImpact')
            a = a[0] if a else '?'
            vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".\
                format(AV=av, AC=ac, Au=au, C=c, Integrity=i, A=a)
            result = {
                'cvss': {
                    'score': vulnerability.get('cvssScore'),
                    'vector': vector
                }
            }
            references = vulnerability.get('references', {}).get('reference', [])
            if not isinstance(references, list):
                references = [references]
            result['references'] = [r.get('url') for r in references]
            for field in ['severity', 'description']:
                result[field] = vulnerability.get(field)
            result['id'] = vulnerability.get('name')
            results.append(result)

    return {'summary': [r['id'] for r in results],
            'status': 'success',
            'details': results}

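# Worked example (not from the original sources) of the CVSS v2 vector string built above:
# a vulnerability entry with cvssAccessVector='NETWORK', cvssAccessComplexity='LOW',
# cvssConfidentialImpact='PARTIAL', cvssIntegrityImpact='PARTIAL',
# cvssAvailabilityImpact='PARTIAL' and a missing authentication value yields
#
#   "AV:N/AC:L/Au:?/C:P/I:P/A:P"
#
# because only the first letter of each reported value is used and '?' stands in for
# anything that is missing from the report.
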
def _use_maven_index_checker(self):
    """Run maven-index-checker and schedule analyses of not-yet-analysed releases."""
    maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
    target_dir = os.path.join(maven_index_checker_dir, 'target')
    central_index_dir = os.path.join(target_dir, 'central-index')
    timestamp_path = os.path.join(central_index_dir, 'timestamp')

    s3 = StoragePool.get_connected_storage('S3MavenIndex')
    self.log.info('Fetching pre-built maven index from S3, if available.')
    s3.retrieve_index_if_exists(target_dir)

    old_timestamp = 0
    try:
        old_timestamp = int(os.stat(timestamp_path).st_mtime)
    except OSError:
        self.log.info(
            'Timestamp is missing, we will probably need to build the index from scratch.')

    java_temp_dir = tempfile.mkdtemp()

    index_range = '{}-{}'.format(self.count.min, self.count.max)
    command = ['java', '-Xmx768m',
               '-Djava.io.tmpdir={}'.format(java_temp_dir),
               '-jar', 'maven-index-checker.jar',
               '-r', index_range]
    if self.latest_version_only:
        command.append('-l')

    output = []  # stays empty if the index check below fails
    with cwd(maven_index_checker_dir):
        try:
            output = TimedCommand.get_command_output(command,
                                                     is_json=True,
                                                     graceful=False,
                                                     timeout=1200)
            new_timestamp = int(os.stat(timestamp_path).st_mtime)
            if old_timestamp != new_timestamp:
                self.log.info('Storing pre-built maven index to S3...')
                s3.store_index(target_dir)
                self.log.debug('Stored. Index in S3 is up-to-date.')
            else:
                self.log.info('Index in S3 is up-to-date.')
        except TaskError as e:
            self.log.exception(e)
        finally:
            rmtree(central_index_dir)
            self.log.debug('central-index/ deleted')
            rmtree(java_temp_dir)

    s3data = StoragePool.get_connected_storage('S3Data')
    bucket = s3data._s3.Bucket(s3data.bucket_name)
    for idx, release in enumerate(output):
        name = '{}:{}'.format(release['groupId'], release['artifactId'])
        version = release['version']
        # For now (can change in future) we want to analyze only ONE version of each package
        try:
            next(iter(bucket.objects.filter(
                Prefix='{e}/{p}/'.format(e=self.ecosystem, p=name)).limit(1)))
            self.log.info("Analysis of some version of %s has already been scheduled, "
                          "skipping version %s", name, version)
            continue
        except StopIteration:
            self.log.info("Scheduling #%d.", self.count.min + idx)
        self.analyses_selinon_flow(name, version)

def extract_tar(target, dest):
    """Extract target tarball into dest using system 'tar' command."""
    TimedCommand.get_command_output(['tar', 'xf', target, '-C', dest])

def get_revision(target_directory):
    """Get digest of last commit."""
    with cwd(target_directory):
        return TimedCommand.get_command_output(
            ['git', 'rev-parse', 'HEAD'], graceful=False).pop()

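# Hedged usage sketch for get_revision() above (not part of the original sources),
# assuming it is exposed as a static method on the Git wrapper.
#
#   sha = Git.get_revision("/tmp/project")
#   # sha == '<40-char commit sha of HEAD>'
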
def extract_tar(target, dest): """Extract target tarball into dest using system 'tar' command.""" TimedCommand.get_command_output(['tar', 'xf', target, '-C', dest])
def execute(self, arguments):
    """Check Red Hat downstream usage of the given package (Anitya, Brew, Pulp/CDN)."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    tool_responses = {}
    result_summary = {
        'package_names': [],
        'registered_srpms': [],
        'all_rhn_channels': [],
        'all_rhsm_content_sets': [],
        'all_rhsm_product_names': []
    }
    result_data = {'status': 'error',
                   'summary': result_summary,
                   'details': tool_responses}

    # bail out early; we need access to internal services or the package is
    # from Maven ecosystem, otherwise we can't comment on downstream usage
    is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
    if not self._is_inside_rh() and not is_maven:
        return result_data

    self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
    res = self._fetch_anitya_project(eco, pkg)
    anitya_rpm_names = []
    anitya_mvn_names = []
    if res is None:
        result_data['status'] = 'error'
    elif res.status_code == 200:
        self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
        anitya_response = res.json()
        tool_responses['redhat_anitya'] = anitya_response
        # For now, we assume all downstreams are ones we care about
        for entry in anitya_response['packages']:
            if entry['distro'] == RH_RPM_DISTRO_NAME:
                anitya_rpm_names.append(entry['package_name'])
            elif entry['distro'] == RH_MVN_DISTRO_NAME:
                anitya_mvn_names.append(entry['package_name'])
            else:
                self.log.warning(
                    'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                    format(d=entry['distro'], o=entry['package_name'], p=pkg)
                )
        self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
        self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
        # TODO: Report 'partial' here and switch to 'success' at the end
        result_data['status'] = 'success'
    else:
        msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
        self.log.error(msg.format(e=eco, p=pkg, r=res.text))
        result_data['status'] = 'error'

    if self._is_inside_rh():
        # we have candidate downstream name mappings, check them against Brew
        seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
        self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

        args = ['brew-utils-cli', '--version', arguments['version']]
        artifact_hash = self._get_artifact_hash(algorithm='sha256')
        if artifact_hash:
            args += ['--digest', artifact_hash]
        args += seed_names

        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._BREWUTILS_CLI_TIMEOUT, cmd=args))
        tc = TimedCommand(args)
        status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
        self.log.debug("status = %s, error = %s", status, error)
        output = ''.join(output)
        self.log.debug("output = %s", output)
        if not output:
            raise TaskError("Error running command %s" % args)
        brew = json.loads(output)

        result_summary['package_names'] = brew['packages']
        result_summary['registered_srpms'] = brew['response']['registered_srpms']
        tool_responses['brew'] = brew['response']['brew']

        # we have SRPM details, fetch details on where the RPMs are shipped
        tool_responses['pulp_cdn'] = pulp_responses = []
        rhn_channels = set()
        rhsm_content_sets = set()
        rhsm_product_names = set()
        for srpm_summary in result_summary['registered_srpms']:
            srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                         v=srpm_summary['version'],
                                                         r=srpm_summary['release'])
            cdn_metadata = self._get_cdn_metadata(srpm_filename)
            if cdn_metadata is None:
                msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                continue
            pulp_responses.append(cdn_metadata)
            srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
            rhn_channels.update(cdn_metadata['rhn_channels'])
            rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
            rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
        result_summary['all_rhn_channels'] = sorted(rhn_channels)
        result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
        result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

    self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

    return result_data

def fetch_npm_artifact(ecosystem, name, version, target_dir):
    """Fetch npm artifact using system 'npm' tool."""
    git = Git.create_git(target_dir)
    npm_cmd = ['npm', '--registry', ecosystem.fetch_url]

    # $ npm config get cache
    # /root/.npm
    cache_path = TimedCommand.get_command_output(
        npm_cmd + ['config', 'get', 'cache'], graceful=False).pop()

    # add package to cache:
    # /root/.npm/express/
    # └── 4.13.4
    #     ├── package
    #     │   ├── History.md
    #     │   ├── index.js
    #     │   ├── lib
    #     │   ├── LICENSE
    #     │   ├── package.json
    #     │   └── Readme.md
    #     └── package.tgz
    # 3 directories, 6 files
    name_ver = name

    try:
        # importing here to avoid circular dependency
        from f8a_worker.solver import NpmReleasesFetcher
        version_list = NpmReleasesFetcher(ecosystem).fetch_releases(name_ver)[1]
        if version not in version_list:
            raise NotABugTaskError("Provided version is not supported '%s'" % name)
        else:
            name_ver = "{}@{}".format(name, version)
    except ValueError as e:
        raise NotABugTaskError(
            'No versions for NPM package {p} ({e})'.format(p=name, e=str(e)))

    # make sure the artifact is not in the cache yet
    TimedCommand.get_command_output(npm_cmd + ['cache', 'clean', name], graceful=False)
    logger.info("downloading npm module %s", name_ver)
    cmd = npm_cmd + ['cache', 'add', name_ver]
    TimedCommand.get_command_output(cmd, graceful=False)

    # copy tarball to workpath
    tarball_name = "package.tgz"
    glob_path = os.path.join(cache_path, name, "*")
    cache_abs_path = os.path.abspath(glob.glob(glob_path).pop())
    artifact_path = os.path.join(cache_abs_path, tarball_name)
    logger.debug("[cache] tarball path = %s", artifact_path)
    artifact_path = shutil.copy(artifact_path, target_dir)
    logger.debug("[workdir] tarball path = %s", artifact_path)

    # Prior to npm-2.x.x (Fedora 24)
    # npm client was repackaging modules on download. It modified file permissions inside
    # package.tgz so they matched UID/GID of a user running npm command. Therefore its
    # digest was different than that of a tarball downloaded directly from registry.npmjs.org.
    digest = compute_digest(artifact_path)
    Archive.extract(artifact_path, target_dir)
    Archive.fix_permissions(os.path.join(cache_abs_path, 'package'))

    # copy package/package.json over the extracted one,
    # because it contains (since npm >= 2.x.x) more information.
    npm_package_json = os.path.join(cache_abs_path, 'package', 'package.json')
    shutil.copy(npm_package_json, target_dir)
    # copy package/npm-shrinkwrap.json to target_dir
    npm_shrinkwrap_json = os.path.join(target_dir, 'package', 'npm-shrinkwrap.json')
    if os.path.isfile(npm_shrinkwrap_json):
        shutil.copy(npm_shrinkwrap_json, target_dir)

    git.add_and_commit_everything()
    return digest, artifact_path

def reset(self, revision, hard=False):
    """Run `git reset` (optionally --hard) to the given revision."""
    cmd = ["git", "reset", revision]
    if hard:
        cmd.extend(["--hard"])
    with cwd(self.repo_path):
        TimedCommand.get_command_output(cmd, graceful=False)

def execute(self):
    """Check the maven index for new releases and schedule their analyses."""
    self.log.info("Checking maven index for new releases")
    maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
    target_dir = os.path.join(maven_index_checker_dir, 'target')
    central_index_dir = os.path.join(target_dir, 'central-index')
    timestamp_path = os.path.join(central_index_dir, 'timestamp')

    s3 = StoragePool.get_connected_storage('S3MavenIndex')
    self.log.info('Fetching pre-built maven index from S3, if available.')
    s3.retrieve_index_if_exists(target_dir)

    old_timestamp = 0
    try:
        old_timestamp = int(os.stat(timestamp_path).st_mtime)
    except OSError:
        self.log.info('Timestamp is missing, we need to build the index from scratch.')

    last_offset = s3.get_last_offset()
    with tempdir() as java_temp_dir:
        cmd = ['java', '-Xmx768m',
               '-Djava.io.tmpdir={}'.format(java_temp_dir),
               '-jar', 'maven-index-checker.jar', '-c']
        with cwd(maven_index_checker_dir):
            output = TimedCommand.get_command_output(cmd,
                                                     is_json=True,
                                                     graceful=False,
                                                     timeout=1200)
            current_count = output['count']
            new_timestamp = int(os.stat(timestamp_path).st_mtime)
            if old_timestamp != new_timestamp:
                self.log.info('Storing pre-built maven index to S3...')
                s3.store_index(target_dir)
                self.log.debug('Stored. Index in S3 is up-to-date.')
                if old_timestamp == 0:
                    s3.set_last_offset(current_count)
                    self.log.info(
                        'This is first run, i.e. all packages are considered new. '
                        'Skipping scheduling to not analyze all packages in index.')
                    return
            else:
                self.log.info('Index in S3 is up-to-date.')

            self.log.debug("Number of entries in maven indexer: %d, last offset used: %d",
                           current_count, last_offset)
            to_schedule_count = current_count - last_offset
            if to_schedule_count == 0:
                self.log.info("No new packages to schedule, exiting...")
                return

            cmd = ['java', '-Xmx768m',
                   '-Djava.io.tmpdir={}'.format(java_temp_dir),
                   '-jar', 'maven-index-checker.jar',
                   '-r', '0-{}'.format(to_schedule_count)]
            output = TimedCommand.get_command_output(cmd,
                                                     is_json=True,
                                                     graceful=False,
                                                     timeout=1200)
            self.log.info("Found %d new packages to analyse, scheduling analyses...",
                          len(output))
            for entry in output:
                self.run_selinon_flow('bayesianFlow', {
                    'ecosystem': 'maven',
                    'name': '{groupId}:{artifactId}'.format(**entry),
                    'version': entry['version'],
                    'recursive_limit': 0
                })

    s3.set_last_offset(current_count)
    self.log.info("All new maven releases scheduled for analysis, exiting..")

def execute(self): """Start the analysis.""" self.log.info("Checking maven index for new releases") maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH') maven_index_checker_data_dir = os.environ.get( 'MAVEN_INDEX_CHECKER_DATA_PATH', '/tmp/index-checker') os.makedirs(maven_index_checker_data_dir, exist_ok=True) central_index_dir = os.path.join(maven_index_checker_data_dir, 'central-index') timestamp_path = os.path.join(central_index_dir, 'timestamp') self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "maven_index_checker_dir", maven_index_checker_dir)) self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "maven_index_checker_data_dir", maven_index_checker_data_dir)) self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "central_index_dir", central_index_dir)) self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "timestamp_path", timestamp_path)) s3 = StoragePool.get_connected_storage('S3MavenIndex') self.log.info('Fetching pre-built maven index from S3, if available.') s3.retrieve_index_if_exists(maven_index_checker_data_dir) old_timestamp = 0 try: old_timestamp = int(os.stat(timestamp_path).st_mtime) except OSError: self.log.info( 'Timestamp is missing, we need to build the index from scratch.' ) pass last_offset = s3.get_last_offset() self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "last_offset", last_offset)) java_temp_dir = tempfile.mkdtemp(prefix='tmp-', dir=os.environ.get('PV_DIR', '/tmp')) cmd = [ 'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir), '-DcentralIndexDir={}'.format(central_index_dir), '-jar', 'maven-index-checker.jar', '-c' ] self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "cmd1", cmd)) with cwd(maven_index_checker_dir): try: output = TimedCommand.get_command_output(cmd, is_json=True, graceful=False, timeout=10800) self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "output", output)) current_count = output['count'] new_timestamp = int(os.stat(timestamp_path).st_mtime) if old_timestamp != new_timestamp: self.log.info('Storing pre-built maven index to S3...') s3.store_index(maven_index_checker_data_dir) self.log.debug('Stored. Index in S3 is up-to-date.') if old_timestamp == 0: s3.set_last_offset(current_count) self.log.info( 'This is first run, i.e. all packages are considered new. ' 'Skipping scheduling to not analyze all packages in index.' 
) return else: self.log.info('Index in S3 is up-to-date.') self.log.debug( "Number of entries in maven indexer: %d, " "last offset used: %d", current_count, last_offset) to_schedule_count = current_count - last_offset if to_schedule_count == 0: self.log.info("No new packages to schedule, exiting...") return cmd = [ 'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir), '-DcentralIndexDir={}'.format(central_index_dir), '-jar', 'maven-index-checker.jar', '-r', '0-{}'.format(to_schedule_count) ] self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "cmd2", cmd)) output = TimedCommand.get_command_output(cmd, is_json=True, graceful=False, timeout=10800) self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "output", output)) except TaskError as e: self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "TaskError", e)) self.log.exception(e) raise finally: rmtree(central_index_dir) self.log.debug('central-index/ deleted') rmtree(java_temp_dir) self.log.info( "Found %d new packages to analyse, scheduling analyses...", len(output)) for entry in output: self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "Running ingestion for", entry)) self.run_selinon_flow( 'bayesianFlow', { 'ecosystem': 'maven', 'name': '{groupId}:{artifactId}'.format(**entry), 'version': entry['version'], 'recursive_limit': 0 }) self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "current_count", current_count)) s3.set_last_offset(current_count) self.log.info( "All new maven releases scheduled for analysis, exiting..")