    def extract_dependencies(github_repo, github_sha):
        """Extract dependency information.

        Currently assumes the repository is a Maven repository.
        """
        with TemporaryDirectory() as workdir:
            repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
            repo.reset(revision=github_sha, hard=True)
            with cwd(repo.repo_path):
                output_file = Path.cwd() / "dependency-tree.txt"
                cmd = ["mvn", "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                       "-DoutputType=dot",
                       "-DoutputFile={filename}".format(filename=output_file),
                       "-DappendOutput=true"]
                timed_cmd = TimedCommand(cmd)
                status, output, _ = timed_cmd.run(timeout=3600)
                if status != 0 or not output_file.is_file():
                    # all errors are in stdout, not stderr
                    raise TaskError(output)
                with output_file.open() as f:
                    return GithubDependencyTreeTask.parse_maven_dependency_tree(f.readlines())
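A note on the helper used above: parse_maven_dependency_tree itself does not appear on this page. Since the plugin call writes the tree as a DOT graph, a minimal sketch of such a parser, assuming it simply collects the quoted coordinates that appear as edge targets, might look like this (the helper name and return shape here are assumptions):

import re

def parse_dot_dependency_tree(lines):
    """Illustrative stand-in for parse_maven_dependency_tree(): collect the
    'groupId:artifactId:packaging:version[:scope]' coordinates that appear as
    edge targets in `mvn dependency:tree -DoutputType=dot` output."""
    dependencies = set()
    for line in lines:
        # edges look like: "g:a:jar:1.0:compile" -> "g:b:jar:2.0:compile" ;
        match = re.search(r'->\s*"([^"]+)"', line)
        if match:
            dependencies.add(match.group(1))
    return dependencies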
    def archive(self, basename, sub_path=None):
        """
        Create an archive; simply calls `git archive`.

        :param basename: str, name of the resulting archive, without file extension (suffix)
        :param sub_path: str, only add files found under this path to the archive;
                          default: add all files from the repository (.git/ is always excluded)
        :return: str, filename
        """
        suffix = "tar.gz"
        filename = basename + "." + suffix
        with cwd(self.repo_path):
            cmd = [
                "git", "archive", "--format={}".format(suffix),
                "--output={}".format(filename), "HEAD"
            ]
            if sub_path:
                cmd.append(sub_path)
            TimedCommand.get_command_output(cmd)

        return filename
    def rev_parse(self, args=None):
        """Run git rev-parse.

        :param args: arguments to pass to `git rev-parse`
        :return: [str], output from `git rev-parse`
        """
        cmd = ["git", "rev-parse"]
        if args:
            cmd.extend(args)

        with cwd(self.repo_path):
            return TimedCommand.get_command_output(cmd, graceful=False)
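A minimal usage sketch combining the clone() and rev_parse() helpers shown on this page (the repository URL and checkout path below are placeholders):

# Placeholders only: clone a repository and read the SHA of HEAD.
repo = Git.clone(url="https://github.com/some/repo.git", path="/tmp/checkout")
head_sha = repo.rev_parse(["HEAD"])[0]  # first line of `git rev-parse HEAD` output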
Example #4
    def get_maven_dependencies():
        """Get direct and indirect dependencies from pom.xml by using maven dependency tree plugin.

        :return: set of direct and indirect dependencies
        """
        output_file = Path.cwd() / "dependency-tree.txt"
        cmd = [
            "mvn",
            "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
            "-DoutputType=dot",
            "-DoutputFile={filename}".format(filename=output_file),
            "-DappendOutput=true"
        ]
        timed_cmd = TimedCommand(cmd)
        status, output, _ = timed_cmd.run(timeout=3600)
        if status != 0 or not output_file.is_file():
            # all errors are in stdout, not stderr
            raise TaskError(output)
        with output_file.open() as f:
            return GithubDependencyTreeTask.parse_maven_dependency_tree(
                f.readlines())
    def archive(self, basename, basedir=None, sub_path=None, format="tar.gz"):
        """Create an archive; simply calls `git archive`.

        :param basename: str, name of the resulting archive, without file extension (suffix)
        :param basedir: str, path to a directory where to store the resulting archive
        :param sub_path: str, only add files found under this path to the archive;
                          default: add all files from the repository (.git/ is always excluded)
        :param format: str, format of the resulting archive, default: 'tar.gz'
        :return: str, filename
        """
        filename = os.path.join(basedir or "", basename + "." + format)
        with cwd(self.repo_path):
            cmd = [
                "git", "archive", "--format={}".format(format),
                "--output={}".format(filename), "HEAD"
            ]
            if sub_path:
                cmd.append(sub_path)
            TimedCommand.get_command_output(cmd)

        return filename
Example #6
    def clone(cls, url, path, timeout=300, depth=None, branch=None, single_branch=False):
        """Clone repository provided as url to specific path.

        :param url: str
        :param path: str
        :param timeout: int
        :param depth: str
        :param branch: str
        :param single_branch: bool, clone only the history leading to the tip of the branch
        :return: instance of Git()
        """
        orig_url = url
        # git clone doesn't understand urls starting with: git+ssh, git+http, git+https
        url = url2git_repo(url)

        orig_path = path
        path = Path(path)
        mode = 0
        if path.is_dir():
            mode = path.stat().st_mode

        cmd = ["git", "clone", url, orig_path]
        if depth is not None:
            cmd.extend(["--depth", depth])
        if branch is not None:
            cmd.extend(["--branch", branch])
        if single_branch:
            cmd.extend(["--single-branch"])
        try:
            cls.config()
            TimedCommand.get_command_output(cmd, graceful=False, timeout=timeout)
        except TaskError as exc:
            if not path.is_dir() and mode:
                # 'git clone repo dir/' deletes (no way to turn this off) dir/ if cloning fails.
                # This might confuse caller of this method, so we recreate the dir on error here.
                try:
                    path.mkdir(mode)
                except OSError:
                    logger.error("Unable to re-create dir: %s", str(path))
            raise TaskError("Unable to clone: %s" % orig_url) from exc
        return cls(path=orig_path)
Example #7
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

        results = []
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            self.log.debug("path = %s", path)

            bw = TimedCommand(['binwalk', '-B', path])
            status, output, error = bw.run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            parsed_binwalk = self.parse_binwalk(output)
            results.append({
                "path": os.path.relpath(path, cache_path),
                "output": parsed_binwalk,
            })
        return {'summary': [], 'status': 'success', 'details': results}
 def update_depcheck_db_on_s3():
     """Update OWASP Dependency-check DB on S3."""
     s3 = StoragePool.get_connected_storage('S3VulnDB')
     depcheck = configuration.dependency_check_script_path
     with TemporaryDirectory() as temp_data_dir:
         s3.retrieve_depcheck_db_if_exists(temp_data_dir)
         old_java_opts = os.getenv('JAVA_OPTS', '')
         os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
         # give DependencyCheck 25 minutes to download the DB
         if TimedCommand.get_command_output([depcheck, '--updateonly', '--data', temp_data_dir],
                                            timeout=1500):
             s3.store_depcheck_db(temp_data_dir)
         os.environ['JAVA_OPTS'] = old_java_opts
    def _generate_pom_xml(to_solve):
        """Create pom.xml with dependencies from to_solve.

        And run 'mvn versions:resolve-ranges',
        which resolves the version ranges (overwrites the pom.xml).

        :param to_solve: {"groupId:artifactId": "version-range"}
        """
        project = etree.Element('project')
        etree.SubElement(project, 'modelVersion').text = '4.0.0'
        etree.SubElement(project, 'groupId').text = 'foo.bar.baz'
        etree.SubElement(project, 'artifactId').text = 'testing'
        etree.SubElement(project, 'version').text = '1.0.0'
        dependencies = etree.SubElement(project, 'dependencies')
        for name, version_range in to_solve.items():
            group_id, artifact_id = name.rstrip(':').split(':')
            dependency = etree.SubElement(dependencies, 'dependency')
            etree.SubElement(dependency, 'groupId').text = group_id
            etree.SubElement(dependency, 'artifactId').text = artifact_id
            etree.SubElement(dependency, 'version').text = version_range
        with open('pom.xml', 'wb') as pom:
            pom.write(etree.tostring(project, xml_declaration=True, pretty_print=True))
        TimedCommand.get_command_output(['mvn', 'versions:resolve-ranges'], graceful=False)
 def config():
     """Configure git."""
     user_name = configuration.GIT_USER_NAME
     user_email = configuration.GIT_USER_EMAIL
     if not TimedCommand.get_command_output(
         ["git", "config", "--get", "user.name"]):
         TimedCommand.get_command_output(
             ["git", "config", "--global", "user.name", user_name])
     if not TimedCommand.get_command_output(
         ["git", "config", "--get", "user.email"]):
         TimedCommand.get_command_output(
             ["git", "config", "--global", "user.email", user_email])
     # Use 'true' as external program to ask for credentials, i.e. don't ask
     # Better would be GIT_TERMINAL_PROMPT=0, but that requires git >= 2.3
     TimedCommand.get_command_output(
         ["git", "config", "--global", "core.askpass", "/usr/bin/true"])
    def fetch_rubygems_artifact(name, version, target_dir):
        """Fetch Rubygems artifact using the 'gem fetch' command."""
        git = Git.create_git(target_dir)
        logger.info("downloading rubygems package %s-%s", name, version)
        version_arg = []
        if version:
            version_arg = ['--version', version]
        gem_command = ['gem', 'fetch', name]
        gem_command.extend(version_arg)
        with cwd(target_dir):
            TimedCommand.get_command_output(gem_command, graceful=False)

        if not version:
            # if version is None we need to glob for the version that was downloaded
            artifact_path = os.path.abspath(
                glob.glob(os.path.join(target_dir, name + '*')).pop())
        else:
            artifact_path = os.path.join(
                target_dir, '{n}-{v}.gem'.format(n=name, v=version))

        digest = compute_digest(artifact_path)
        Archive.extract(artifact_path, target_dir)
        git.add_and_commit_everything()
        return digest, artifact_path
Example #12
    def run_gofedlib(self, topdir, name, version, timeout):
        """Run gofedlib-cli to extract dependencies from golang sources."""
        tc = TimedCommand([
            'gofedlib-cli', '--dependencies-main', '--dependencies-packages',
            '--dependencies-test', '--skip-errors', topdir
        ])
        status, data, err = tc.run(timeout=timeout)

        if status:
            raise FatalTaskError('gofedlib-cli failed: {err}'.format(err=err))

        result = json.loads(data[0])
        main_deps_count = len(result.get('deps-main', []))
        packages_count = len(result.get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies',
                       main_deps_count + packages_count)

        result['code_repository'] = {
            'type': 'git',
            'url': 'https://{name}'.format(name=name)
        }
        result['name'] = name
        result['version'] = version
        return [{'ecosystem': 'gofedlib', 'result': result}]
    def ls_remote(repository, refs=None, args=None):
        """Get output of `git ls-remote <args> <repo> <refs>` command.

        :param repository: str, remote git repository
        :param refs: list, list of git references
        :param args: list, list of additional arguments for the command
        :return: command output
        """
        cmd = ["git", "ls-remote"]
        if args:
            cmd.extend(args)

        cmd.append(repository)

        if refs:
            cmd.extend(refs)

        return TimedCommand.get_command_output(cmd, graceful=False)
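A hedged usage sketch of ls_remote(), e.g. to resolve the tip of a remote branch (the repository URL and ref are placeholders):

# Placeholders only: `git ls-remote --heads <repo> master` prints lines like
# "<sha>\trefs/heads/master"; take the SHA from the first line, if any.
lines = Git.ls_remote("https://github.com/some/repo.git",
                      refs=["master"], args=["--heads"])
sha = lines[0].split()[0] if lines else None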
Example #14
    def extract_gem(target, dest):
        """Extract target gem and gemspec.

        Gem into $dest/sources
        Gemspec (renamed to rubygems-metadata.yaml) into $dest/metadata/
        """
        sources = os.path.join(dest, 'sources')
        metadata = os.path.join(dest, 'metadata')
        TimedCommand.get_command_output(['mkdir', '-p', sources, metadata])
        TimedCommand.get_command_output(['gem', 'unpack', target, '--target', sources])
        with cwd(metadata):
            # --spec ignores --target, so we need to cwd
            TimedCommand.get_command_output(['gem', 'unpack', target, '--spec'])
            metadatayaml = glob.glob('*.gemspec').pop()
            os.rename(metadatayaml, 'rubygems-metadata.yaml')
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(
            arguments).get_extracted_source_tarball()

        results = {'status': 'unknown', 'summary': {}, 'details': []}

        try:
            oscc = TimedCommand.get_command_output(
                ['oscryptocatcher', '--subdir-in-result', cache_path],
                graceful=False,
                is_json=True)

            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception:
            results['status'] = 'error'

        return results
    def run_mercator(self,
                     arguments,
                     cache_path,
                     keep_path=False,
                     outermost_only=True,
                     timeout=300,
                     resolve_poms=True):
        """Run mercator tool."""
        # TODO: reduce cyclomatic complexity
        result_data = {'status': 'unknown', 'summary': [], 'details': []}
        mercator_target = arguments.get('cache_sources_path', cache_path)

        tc = TimedCommand(['mercator', mercator_target])
        update_env = {
            'MERCATOR_JAVA_RESOLVE_POMS': 'true'
        } if resolve_poms else {}
        status, data, err = tc.run(timeout=timeout,
                                   is_json=True,
                                   update_env=update_env)
        if status != 0:
            self.log.error(err)
            raise FatalTaskError(err)

        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            items = [self._merge_python_items(mercator_target, data)]
            if items == [None]:
                raise NotABugFatalTaskError(
                    'Found no usable PKG-INFO/metadata.json/requirements.txt')
        else:
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(
                    data.get('items') or [])
            else:
                items = data.get('items') or []
            self.log.debug('mercator found %i projects, outermost %i',
                           len(data), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM, we consider POM to be *the*
                #  source of information and don't want to duplicate info by including
                #  data from pom included in artifact (assuming it's included)
                items = [
                    d for d in items if d['ecosystem'].lower() == 'java-pom'
                ]
            elif ecosystem_object.is_backed_by(EcosystemBackend.npm):
                # ignore other metadata files, e.g. requirements.txt
                items = [d for d in items if d['ecosystem'].lower() == 'npm']
            elif arguments['ecosystem'] == 'go':
                items = [
                    d for d in items if d['ecosystem'].lower() == 'go-glide'
                ]
                if not items:
                    # Mercator found no Go Glide files, run gofedlib
                    items = self.run_gofedlib(topdir=mercator_target,
                                              name=arguments.get('name'),
                                              version=arguments.get('version'),
                                              timeout=timeout)

        result_data['details'] = [
            self._data_normalizer.handle_data(d, keep_path=keep_path)
            for d in items
        ]
        result_data['status'] = 'success'
        return result_data
 def fix_permissions(target):
     """Fix extracted folder permissions, so it will be readable for user."""
     TimedCommand.get_command_output(['chmod', "-R", "u+rwx", target])
 def extract_tar(target, dest):
     """Extract target tarball into dest using system 'tar' command."""
     TimedCommand.get_command_output(
         ['tar', "--delay-directory-restore", '-xf', target, '-C', dest])
Example #19
    def run_mercator(self,
                     arguments,
                     cache_path,
                     keep_path=False,
                     outermost_only=True,
                     timeout=300,
                     resolve_poms=True):
        """Run mercator tool."""
        result_data = {'status': 'unknown', 'summary': [], 'details': []}
        mercator_target = arguments.get('cache_sources_path', cache_path)

        if arguments['ecosystem'] == 'go':
            # no Go support in Mercator-go yet, we handle it separately here
            tc = TimedCommand([
                'gofedlib-cli', '--dependencies-main',
                '--dependencies-packages', '--dependencies-test',
                '--skip-errors', mercator_target
            ])
            status, data, err = tc.run(timeout=timeout)
        else:
            tc = TimedCommand(['mercator', mercator_target])
            update_env = {
                'MERCATOR_JAVA_RESOLVE_POMS': 'true'
            } if resolve_poms else {}
            status, data, err = tc.run(timeout=timeout,
                                       is_json=True,
                                       update_env=update_env)
        if status != 0:
            self.log.error(err)
            raise FatalTaskError(err)

        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            items = [self._merge_python_items(mercator_target, data)]
        elif arguments['ecosystem'] == 'go':
            result = {'result': json.loads(data[0])}
            # the data normalizer expects this
            result['ecosystem'] = 'gofedlib'
            # we only support git now
            result['result']['code_repository'] = {
                'type': 'git',
                'url': 'https://{name}'.format(name=arguments.get('name'))
            }

            result['result']['name'] = arguments.get('name')
            result['result']['version'] = arguments.get('version')
            items = [result]
            main_deps_count = len(result['result'].get('deps-main', []))
            packages_count = len(result['result'].get('deps-packages', []))
            self.log.debug('gofedlib found %i dependencies',
                           main_deps_count + packages_count)
        else:
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(
                    data.get('items') or [])
            else:
                items = data.get('items') or []
            self.log.debug('mercator found %i projects, outermost %i',
                           len(data), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM, we consider POM to be *the*
                #  source of information and don't want to duplicate info by including
                #  data from pom included in artifact (assuming it's included)
                items = [
                    d for d in items if d['ecosystem'].lower() == 'java-pom'
                ]

        result_data['details'] = [
            self._data_normalizer.handle_data(d, keep_path=keep_path)
            for d in items
        ]
        result_data['status'] = 'success'
        return result_data
    def _run_owasp_dep_check(self, scan_path, experimental=False):
        """Run OWASP Dependency-Check."""
        def _clean_dep_check_tmp():
            for dcdir in glob(os.path.join(gettempdir(), 'dctemp*')):
                rmtree(dcdir)

        s3 = StoragePool.get_connected_storage('S3VulnDB')
        depcheck = configuration.dependency_check_script_path
        with TemporaryDirectory() as temp_data_dir:
            if not s3.retrieve_depcheck_db_if_exists(temp_data_dir):
                self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
                self.update_depcheck_db_on_s3()
                s3.retrieve_depcheck_db_if_exists(temp_data_dir)

            report_path = os.path.join(temp_data_dir, 'report.xml')
            command = [depcheck,
                       '--noupdate',
                       '--format', 'XML',
                       '--project', 'CVEcheckerTask',
                       '--data', temp_data_dir,
                       '--scan', scan_path,
                       '--out', report_path]
            if experimental:
                command.extend(['--enableExperimental'])
            for suppress_xml in glob(os.path.join(os.environ['OWASP_DEP_CHECK_SUPPRESS_PATH'],
                                                  '*.xml')):
                command.extend(['--suppress', suppress_xml])

            output = []
            old_java_opts = os.getenv('JAVA_OPTS', '')
            try:
                self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities' %
                               scan_path)
                os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         timeout=600)  # 10 minutes
                with open(report_path) as r:
                    report_dict = anymarkup.parse(r.read())
            except (TaskError, FileNotFoundError) as e:
                _clean_dep_check_tmp()
                for line in output:
                    self.log.warning(line)
                self.log.exception(str(e))
                raise FatalTaskError('OWASP Dependency-Check scan failed') from e
            finally:
                os.environ['JAVA_OPTS'] = old_java_opts
            _clean_dep_check_tmp()

        results = []
        dependencies = report_dict.get('analysis', {}).get('dependencies')  # value can be None
        dependencies = dependencies.get('dependency', []) if dependencies else []
        if not isinstance(dependencies, list):
            dependencies = [dependencies]
        for dependency in dependencies:
            vulnerabilities = dependency.get('vulnerabilities')  # value can be None
            vulnerabilities = vulnerabilities.get('vulnerability', []) if vulnerabilities else []
            if not isinstance(vulnerabilities, list):
                vulnerabilities = [vulnerabilities]
            for vulnerability in vulnerabilities:
                av = vulnerability.get('cvssAccessVector')
                av = av[0] if av else '?'
                ac = vulnerability.get('cvssAccessComplexity')
                ac = ac[0] if ac else '?'
                au = vulnerability.get('cvssAuthenticationr')
                au = au[0] if au else '?'
                c = vulnerability.get('cvssConfidentialImpact')
                c = c[0] if c else '?'
                i = vulnerability.get('cvssIntegrityImpact')
                i = i[0] if i else '?'
                a = vulnerability.get('cvssAvailabilityImpact')
                a = a[0] if a else '?'
                vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".\
                    format(AV=av, AC=ac, Au=au, C=c, Integrity=i, A=a)
                result = {
                    'cvss': {
                        'score': vulnerability.get('cvssScore'),
                        'vector': vector
                    }
                }
                references = vulnerability.get('references', {}).get('reference', [])
                if not isinstance(references, list):
                    references = [references]
                result['references'] = [r.get('url') for r in references]
                for field in ['severity', 'description']:
                    result[field] = vulnerability.get(field)
                result['id'] = vulnerability.get('name')
                results.append(result)

        return {'summary': [r['id'] for r in results],
                'status': 'success',
                'details': results}
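The loop above abbreviates each CVSS v2 metric to its first letter when assembling the vector; a standalone sketch with hypothetical report values shows the resulting string:

# Hypothetical values as they might appear in a Dependency-Check XML report;
# only the first letter of each metric ends up in the vector string.
vulnerability = {
    'cvssAccessVector': 'NETWORK',
    'cvssAccessComplexity': 'LOW',
    'cvssAuthenticationr': 'NONE',          # key spelled as the task above reads it
    'cvssConfidentialImpact': 'PARTIAL',
    'cvssIntegrityImpact': 'PARTIAL',
    'cvssAvailabilityImpact': 'PARTIAL',
}
vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".format(
    AV=vulnerability['cvssAccessVector'][0],
    AC=vulnerability['cvssAccessComplexity'][0],
    Au=vulnerability['cvssAuthenticationr'][0],
    C=vulnerability['cvssConfidentialImpact'][0],
    Integrity=vulnerability['cvssIntegrityImpact'][0],
    A=vulnerability['cvssAvailabilityImpact'][0],
)
print(vector)  # AV:N/AC:L/Au:N/C:P/I:P/A:P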
    def _use_maven_index_checker(self):
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        target_dir = os.path.join(maven_index_checker_dir, 'target')
        central_index_dir = os.path.join(target_dir, 'central-index')
        timestamp_path = os.path.join(central_index_dir, 'timestamp')

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(target_dir)

        old_timestamp = 0
        try:
            old_timestamp = int(os.stat(timestamp_path).st_mtime)
        except OSError:
            self.log.info(
                'Timestamp is missing, we will probably need to build the index from scratch.'
            )
            pass

        java_temp_dir = tempfile.mkdtemp()

        index_range = '{}-{}'.format(self.count.min, self.count.max)
        command = [
            'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir),
            '-jar', 'maven-index-checker.jar', '-r', index_range
        ]
        if self.latest_version_only:
            command.append('-l')
        with cwd(maven_index_checker_dir):
            try:
                output = TimedCommand.get_command_output(command,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=1200)

                new_timestamp = int(os.stat(timestamp_path).st_mtime)
                if old_timestamp != new_timestamp:
                    self.log.info('Storing pre-built maven index to S3...')
                    s3.store_index(target_dir)
                    self.log.debug('Stored. Index in S3 is up-to-date.')
                else:
                    self.log.info('Index in S3 is up-to-date.')
            except TaskError as e:
                self.log.exception(e)
            finally:
                rmtree(central_index_dir)
                self.log.debug('central-index/ deleted')
                rmtree(java_temp_dir)

            s3data = StoragePool.get_connected_storage('S3Data')
            bucket = s3data._s3.Bucket(s3data.bucket_name)
            for idx, release in enumerate(output):
                name = '{}:{}'.format(release['groupId'],
                                      release['artifactId'])
                version = release['version']
                # For now (can change in future) we want to analyze only ONE version of each package
                try:
                    next(
                        iter(
                            bucket.objects.filter(Prefix='{e}/{p}/'.format(
                                e=self.ecosystem, p=name)).limit(1)))
                    self.log.info(
                        "Analysis of some version of %s has already been scheduled, "
                        "skipping version %s", name, version)
                    continue
                except StopIteration:
                    self.log.info("Scheduling #%d.", self.count.min + idx)
                    self.analyses_selinon_flow(name, version)
 def extract_tar(target, dest):
     """Extract target tarball into dest using system 'tar' command."""
     TimedCommand.get_command_output(['tar', 'xf', target, '-C', dest])
 def get_revision(target_directory):
     """Get digest of last commit."""
     with cwd(target_directory):
         return TimedCommand.get_command_output(
             ['git', 'rev-parse', 'HEAD'], graceful=False).pop()
 def extract_tar(target, dest):
     """Extract target tarball into dest using system 'tar' command."""
     TimedCommand.get_command_output(['tar', 'xf', target, '-C', dest])
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        tool_responses = {}
        result_summary = {
            'package_names': [],
            'registered_srpms': [],
            'all_rhn_channels': [],
            'all_rhsm_content_sets': [],
            'all_rhsm_product_names': []
        }
        result_data = {'status': 'error',
                       'summary': result_summary,
                       'details': tool_responses
                       }

        # bail out early; we need access to internal services or the package is
        # from Maven ecosystem, otherwise we can't comment on downstream usage
        is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
        if not self._is_inside_rh() and not is_maven:
            return result_data

        self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
        res = self._fetch_anitya_project(eco, pkg)
        anitya_rpm_names = []
        anitya_mvn_names = []
        if res is None:
            result_data['status'] = 'error'
        elif res.status_code == 200:
            self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
            anitya_response = res.json()
            tool_responses['redhat_anitya'] = anitya_response
            # For now, we assume all downstreams are ones we care about
            for entry in anitya_response['packages']:
                if entry['distro'] == RH_RPM_DISTRO_NAME:
                    anitya_rpm_names.append(entry['package_name'])
                elif entry['distro'] == RH_MVN_DISTRO_NAME:
                    anitya_mvn_names.append(entry['package_name'])
                else:
                    self.log.warning(
                        'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                        format(d=entry['distro'], o=entry['package_name'], p=pkg)
                    )
            self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
            self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
            # TODO: Report 'partial' here and switch to 'success' at the end
            result_data['status'] = 'success'
        else:
            msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
            self.log.error(msg.format(e=eco, p=pkg, r=res.text))
            result_data['status'] = 'error'

        if self._is_inside_rh():
            # we have candidate downstream name mappings, check them against Brew
            seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
            self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

            args = ['brew-utils-cli', '--version', arguments['version']]
            artifact_hash = self._get_artifact_hash(algorithm='sha256')
            if artifact_hash:
                args += ['--digest', artifact_hash]
            args += seed_names

            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
                timeout=self._BREWUTILS_CLI_TIMEOUT,
                cmd=args))
            tc = TimedCommand(args)
            status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
            self.log.debug("status = %s, error = %s", status, error)
            output = ''.join(output)
            self.log.debug("output = %s", output)
            if not output:
                raise TaskError("Error running command %s" % args)
            brew = json.loads(output)

            result_summary['package_names'] = brew['packages']
            result_summary['registered_srpms'] = brew['response']['registered_srpms']
            tool_responses['brew'] = brew['response']['brew']

            # we have SRPM details, fetch details on where the RPMs are shipped
            tool_responses['pulp_cdn'] = pulp_responses = []
            rhn_channels = set()
            rhsm_content_sets = set()
            rhsm_product_names = set()
            for srpm_summary in result_summary['registered_srpms']:
                srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                             v=srpm_summary['version'],
                                                             r=srpm_summary['release'])
                cdn_metadata = self._get_cdn_metadata(srpm_filename)
                if cdn_metadata is None:
                    msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                    self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                    continue
                pulp_responses.append(cdn_metadata)
                srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
                rhn_channels.update(cdn_metadata['rhn_channels'])
                rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
                rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
            result_summary['all_rhn_channels'] = sorted(rhn_channels)
            result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
            result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

        self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

        return result_data
    def fetch_npm_artifact(ecosystem, name, version, target_dir):
        """Fetch npm artifact using system 'npm' tool."""
        git = Git.create_git(target_dir)

        npm_cmd = ['npm', '--registry', ecosystem.fetch_url]

        # $ npm config get cache
        # /root/.npm
        cache_path = TimedCommand.get_command_output(
            npm_cmd + ['config', 'get', 'cache'], graceful=False).pop()

        # add package to cache:
        # /root/.npm/express/
        # └── 4.13.4
        #      ├── package
        #      │   ├── History.md
        #      │   ├── index.js
        #      │   ├── lib
        #      │   ├── LICENSE
        #      │   ├── package.json
        #      │   └── Readme.md
        #      └── package.tgz
        # 3 directories, 6 files
        name_ver = name

        try:
            # importing here to avoid circular dependency
            from f8a_worker.solver import NpmReleasesFetcher

            version_list = NpmReleasesFetcher(ecosystem).fetch_releases(
                name_ver)[1]
            if version not in version_list:
                raise NotABugTaskError(
                    "Provided version is not supported for package '%s'" % name)
            else:
                name_ver = "{}@{}".format(name, version)
        except ValueError as e:
            raise NotABugTaskError(
                'No versions for NPM package {p} ({e})'.format(
                    p=name, e=str(e)))

        # make sure the artifact is not in the cache yet
        TimedCommand.get_command_output(npm_cmd + ['cache', 'clean', name],
                                        graceful=False)
        logger.info("downloading npm module %s", name_ver)
        cmd = npm_cmd + ['cache', 'add', name_ver]
        TimedCommand.get_command_output(cmd, graceful=False)

        # copy tarball to workpath
        tarball_name = "package.tgz"
        glob_path = os.path.join(cache_path, name, "*")
        cache_abs_path = os.path.abspath(glob.glob(glob_path).pop())
        artifact_path = os.path.join(cache_abs_path, tarball_name)
        logger.debug("[cache] tarball path = %s", artifact_path)
        artifact_path = shutil.copy(artifact_path, target_dir)

        logger.debug("[workdir] tarball path = %s", artifact_path)
        # Prior to npm-2.x.x (Fedora 24)
        # npm client was repackaging modules on download. It modified file permissions inside
        # package.tgz so they matched UID/GID of the user running the npm command. Therefore its
        # digest was different from that of a tarball downloaded directly from registry.npmjs.org.
        digest = compute_digest(artifact_path)
        Archive.extract(artifact_path, target_dir)
        Archive.fix_permissions(os.path.join(cache_abs_path, 'package'))

        # copy package/package.json over the extracted one,
        # because it contains (since npm >= 2.x.x) more information.
        npm_package_json = os.path.join(cache_abs_path, 'package',
                                        'package.json')
        shutil.copy(npm_package_json, target_dir)
        # copy package/npm-shrinkwrap.json to target_dir
        npm_shrinkwrap_json = os.path.join(target_dir, 'package',
                                           'npm-shrinkwrap.json')
        if os.path.isfile(npm_shrinkwrap_json):
            shutil.copy(npm_shrinkwrap_json, target_dir)
        git.add_and_commit_everything()
        return digest, artifact_path
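compute_digest() is used by both fetch helpers above but is not shown on this page; a minimal sketch of such a helper, assuming it returns a hex digest of the artifact file (the sha256 choice is an assumption):

import hashlib

def compute_digest_sketch(path, chunk_size=65536):
    """Illustrative stand-in for compute_digest(): hash the file in chunks and
    return its hex digest (sha256 is an assumption, not taken from this page)."""
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()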
 def reset(self, revision, hard=False):
     """Run `git reset` to the given revision, optionally with --hard."""
     cmd = ["git", "reset", revision]
     if hard:
         cmd.extend(["--hard"])
     with cwd(self.repo_path):
         TimedCommand.get_command_output(cmd, graceful=False)
    def execute(self):
        self.log.info("Checking maven index for new releases")
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        target_dir = os.path.join(maven_index_checker_dir, 'target')
        central_index_dir = os.path.join(target_dir, 'central-index')
        timestamp_path = os.path.join(central_index_dir, 'timestamp')

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(target_dir)

        old_timestamp = 0
        try:
            old_timestamp = int(os.stat(timestamp_path).st_mtime)
        except OSError:
            self.log.info(
                'Timestamp is missing, we need to build the index from scratch.'
            )
            pass

        last_offset = s3.get_last_offset()
        with tempdir() as java_temp_dir:
            cmd = [
                'java', '-Xmx768m',
                '-Djava.io.tmpdir={}'.format(java_temp_dir), '-jar',
                'maven-index-checker.jar', '-c'
            ]

            with cwd(maven_index_checker_dir):
                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=1200)

                current_count = output['count']
                new_timestamp = int(os.stat(timestamp_path).st_mtime)
                if old_timestamp != new_timestamp:
                    self.log.info('Storing pre-built maven index to S3...')
                    s3.store_index(target_dir)
                    self.log.debug('Stored. Index in S3 is up-to-date.')
                    if old_timestamp == 0:
                        s3.set_last_offset(current_count)
                        self.log.info(
                            'This is the first run, i.e. all packages are considered new. '
                            'Skipping scheduling to avoid analyzing every package in the index.'
                        )
                        return
                else:
                    self.log.info('Index in S3 is up-to-date.')

                self.log.debug(
                    "Number of entries in maven indexer: %d, "
                    "last offset used: %d", current_count, last_offset)
                to_schedule_count = current_count - last_offset
                if to_schedule_count == 0:
                    self.log.info("No new packages to schedule, exiting...")
                    return

                cmd = [
                    'java', '-Xmx768m',
                    '-Djava.io.tmpdir={}'.format(java_temp_dir), '-jar',
                    'maven-index-checker.jar', '-r',
                    '0-{}'.format(to_schedule_count)
                ]
                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=1200)

                self.log.info(
                    "Found %d new packages to analyse, scheduling analyses...",
                    len(output))
                for entry in output:
                    self.run_selinon_flow(
                        'bayesianFlow', {
                            'ecosystem': 'maven',
                            'name': '{groupId}:{artifactId}'.format(**entry),
                            'version': entry['version'],
                            'recursive_limit': 0
                        })

        s3.set_last_offset(current_count)
        self.log.info(
            "All new maven releases scheduled for analysis, exiting..")
Example #29
    def execute(self):
        """Start the analysis."""
        self.log.info("Checking maven index for new releases")
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        maven_index_checker_data_dir = os.environ.get(
            'MAVEN_INDEX_CHECKER_DATA_PATH', '/tmp/index-checker')
        os.makedirs(maven_index_checker_data_dir, exist_ok=True)
        central_index_dir = os.path.join(maven_index_checker_data_dir,
                                         'central-index')
        timestamp_path = os.path.join(central_index_dir, 'timestamp')

        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "maven_index_checker_dir",
                                                maven_index_checker_dir))
        self.log.info("{}__:__{}__:__{}".format(
            "MavenReleasesAnalyses", "maven_index_checker_data_dir",
            maven_index_checker_data_dir))
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "central_index_dir",
                                                central_index_dir))
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "timestamp_path",
                                                timestamp_path))

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(maven_index_checker_data_dir)

        old_timestamp = 0
        try:
            old_timestamp = int(os.stat(timestamp_path).st_mtime)
        except OSError:
            self.log.info(
                'Timestamp is missing, we need to build the index from scratch.'
            )
            pass

        last_offset = s3.get_last_offset()
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "last_offset", last_offset))

        java_temp_dir = tempfile.mkdtemp(prefix='tmp-',
                                         dir=os.environ.get('PV_DIR', '/tmp'))

        cmd = [
            'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir),
            '-DcentralIndexDir={}'.format(central_index_dir), '-jar',
            'maven-index-checker.jar', '-c'
        ]
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "cmd1", cmd))

        with cwd(maven_index_checker_dir):
            try:
                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=10800)
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "output", output))

                current_count = output['count']
                new_timestamp = int(os.stat(timestamp_path).st_mtime)
                if old_timestamp != new_timestamp:
                    self.log.info('Storing pre-built maven index to S3...')
                    s3.store_index(maven_index_checker_data_dir)
                    self.log.debug('Stored. Index in S3 is up-to-date.')
                    if old_timestamp == 0:
                        s3.set_last_offset(current_count)
                        self.log.info(
                            'This is the first run, i.e. all packages are considered new. '
                            'Skipping scheduling to avoid analyzing every package in the index.'
                        )
                        return
                else:
                    self.log.info('Index in S3 is up-to-date.')

                self.log.debug(
                    "Number of entries in maven indexer: %d, "
                    "last offset used: %d", current_count, last_offset)
                to_schedule_count = current_count - last_offset
                if to_schedule_count == 0:
                    self.log.info("No new packages to schedule, exiting...")
                    return

                cmd = [
                    'java', '-Xmx768m',
                    '-Djava.io.tmpdir={}'.format(java_temp_dir),
                    '-DcentralIndexDir={}'.format(central_index_dir), '-jar',
                    'maven-index-checker.jar', '-r',
                    '0-{}'.format(to_schedule_count)
                ]
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "cmd2", cmd))

                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=10800)
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "output", output))

            except TaskError as e:
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "TaskError", e))
                self.log.exception(e)
                raise
            finally:
                rmtree(central_index_dir)
                self.log.debug('central-index/ deleted')
                rmtree(java_temp_dir)

            self.log.info(
                "Found %d new packages to analyse, scheduling analyses...",
                len(output))
            for entry in output:
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "Running ingestion for", entry))
                self.run_selinon_flow(
                    'bayesianFlow', {
                        'ecosystem': 'maven',
                        'name': '{groupId}:{artifactId}'.format(**entry),
                        'version': entry['version'],
                        'recursive_limit': 0
                    })

        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "current_count",
                                                current_count))
        s3.set_last_offset(current_count)
        self.log.info(
            "All new maven releases scheduled for analysis, exiting..")