class BlackDuckRelease(object):
    """
    One release of a project: its version string, its unique identifier and
    the `datetime.datetime` at which that version was released. The project
    object passed to the constructor is kept alongside and exposed as-is.
    """
    @schema.input(SchemaRef("blackduck-release", "1-0-0"))
    def __init__(self, json_data, project):
        # Keys are read in the order version -> versionId -> releasedOn.
        version = json_data['version']
        identifier = json_data['versionId']
        # 'releasedOn' is parsed as "YYYY-MM-DDTHH:MM:SS.ffffffZ"; the result
        # is a naive datetime (the trailing "Z" is matched literally).
        released = datetime.strptime(json_data['releasedOn'],
                                     "%Y-%m-%dT%H:%M:%S.%fZ")

        self._version = version
        self._id = identifier
        self._released_at = released
        self._project = project

    @property
    def version(self):
        """ Version string of this release """
        return self._version

    @property
    def id(self):
        """ Identifier of this release (the 'versionId' field) """
        return self._id

    @property
    def released_at(self):
        """ Parsed release timestamp """
        return self._released_at

    @property
    def project(self):
        """ Project object this release was created with """
        return self._project
Example #2
0
class OSCryptoCatcherTask(BaseTask):
    """Task running the `oscryptocatcher` command on the extracted source tarball."""
    _analysis_name = 'crypto_algorithms'
    description = "Runs oscryptocatcher tool for matching crypto algorithms"
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def execute(self, arguments):
        """
        Run oscryptocatcher and wrap its JSON output into the task result.

        :param arguments: dictionary with 'ecosystem', 'name' and 'version'
        :return: dict with keys 'status' ('success' or 'error'), 'summary'
                 and 'details'; the latter two are copied from the tool output
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        results = {'status': 'unknown',
                   'summary': {},
                   'details': []}

        try:
            oscc = TimedCommand.get_command_output(['oscryptocatcher', '--subdir-in-result', cache_path],
                                                   graceful=False, is_json=True)

            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception:
            # Still best-effort (the task reports 'error' instead of failing),
            # but the cause is logged and KeyboardInterrupt/SystemExit are no
            # longer swallowed as they were by the former bare `except:`.
            self.log.exception("oscryptocatcher failed on %s", cache_path)
            results['status'] = 'error'

        return results
Example #3
0
class ComponentAnalyses(ResourceWithSchema):
    """Endpoint returning graph analyses data for one ecosystem/package/version."""
    method_decorators = [login_required]

    schema_ref = SchemaRef('analyses_graphdb', '1-2-0')

    @staticmethod
    def get(ecosystem, package, version):
        """
        Return the analyses stored in the graph for the given component.

        When nothing is found an analysis is scheduled and an HTTPError is
        raised: 202 if the INVOKE_API_WORKERS environment variable equals "1",
        404 otherwise.

        :param ecosystem: ecosystem name
        :param package: package name (normalized before the lookup)
        :param version: package version
        :return: whatever get_analyses_from_graph() returned, if not None
        :raises HTTPError: 202 or 404 when the component is not known
        """
        if ecosystem == 'maven':
            package = MavenCoordinates.normalize_str(package)
        package = case_sensitivity_transform(ecosystem, package)
        result = get_analyses_from_graph(ecosystem, package, version)
        # Logger.warn() is a deprecated alias of warning(); passing the value
        # as a logging argument also avoids the "%r" % tuple formatting pitfall.
        current_app.logger.warning("%r", result)

        if result is not None:
            # Known component for Bayesian
            return result

        # Enter the unknown path: the analysis is scheduled in both cases, only
        # the api_flow flag and the reported HTTP status differ.
        api_flow = os.environ.get("INVOKE_API_WORKERS", "") == "1"
        server_create_analysis(ecosystem, package, version, api_flow=api_flow, force=False, force_graph_sync=True)

        if api_flow:
            msg = "Package {ecosystem}/{package}/{version} is unavailable. The package will be available shortly,"\
                  " please retry after some time.".format(ecosystem=ecosystem, package=package, version=version)
            raise HTTPError(202, msg)

        msg = "No data found for {ecosystem} Package {package}/{version}".format(ecosystem=ecosystem,
                                                                                 package=package, version=version)
        raise HTTPError(404, msg)
Example #4
0
class ComponentsInRange(ResourceWithSchema):
    """Endpoint resolving a version range query against our data and upstream."""
    schema_ref = SchemaRef('version_range_resolver', '1-0-0')

    def get(self, ecosystem):
        """
        Resolve the version specification given in the `q` query argument.

        :param ecosystem: name of the ecosystem to resolve in
        :return: dict with the query, the versions solved by both solvers,
                 the versions only the upstream solver returned and the
                 timestamp taken before solving
        """
        query = request.args.get('q')
        eco = Ecosystem.by_name(rdb.session, ecosystem)
        fetcher = CucosReleasesFetcher(eco, rdb.session)
        now = datetime.datetime.now()

        # Two solvers are used on purpose:
        #   * one with a custom fetcher that fetches matching releases from
        #     Bayesian DB, i.e. packages we have already analysed,
        #   * one fetching from upstream repositories, i.e. everything that
        #     matches the version specification.
        analysed_solver = get_ecosystem_solver(eco, with_fetcher=fetcher)
        upstream_solver = get_ecosystem_solver(eco)

        ours = analysed_solver.solve([query], all_versions=True)
        upstream = upstream_solver.solve([query], all_versions=True)

        # Only the first value of each solver result is compared.
        if ours:
            ours_nums = set(next(iter(ours.values())))
        else:
            ours_nums = set()

        if upstream:
            upstreams_nums = set(next(iter(upstream.values())))
        else:
            upstreams_nums = set()

        detail = {
            'analysed': ours,
            'upstream': upstream,
            'difference': list(upstreams_nums - ours_nums)
        }
        return {
            'query': query,
            'detail': detail,
            'resolved_at': str(now)
        }
Example #5
0
class AnalysisBase(ResourceWithSchema):
    """Common base of the endpoints that return analyses."""
    schema_ref = SchemaRef('component_analyses', '1-1-3')

    def add_schema(self, response, status_code, method):
        """Extend the parent add_schema by filling in per-analysis schema URLs.

        Only successful (200) GET responses are touched; every analysis that
        carries a 'schema' entry gets its 'url' set from the schema name and
        version.
        """
        super().add_schema(response, status_code, method)
        if status_code != 200 or method != 'GET':
            return response

        for analysis in response.get('analyses', {}).values():
            if analysis is None or 'schema' not in analysis:
                continue
            schema_info = analysis['schema']
            schema_info['url'] = PublishedSchemas.get_component_analysis_schema_url(
                name=schema_info['name'],
                version=schema_info['version'])
        return response

    def _parse_args(self):
        """Parse the 'fields' and 'debuginfo' request arguments.

        Both default to an empty string; 'debuginfo' is converted to a bool
        that is True only for the (case-insensitive) value 'true'.
        """
        parser = reqparse.RequestParser()
        for name in ('fields', 'debuginfo'):
            parser.add_argument(name, default='')
        parsed = parser.parse_args()
        return {
            'fields': parsed['fields'],
            'debuginfo': parsed['debuginfo'].lower() == 'true',
        }

    def _get_projection(self, fields):
        """Turn a comma separated list of fields into a {field: 1} dict, or None."""
        if not fields:
            return None
        return {name: 1 for name in fields.split(',')}

    def _do_analysis_projection(self, analysis, fields):
        """Placeholder; does nothing."""
        pass

    def _inc_access_counter(self, analysis):
        """Bump the access counter of the analysis and commit the session."""
        analysis.access_count = analysis.access_count + 1
        rdb.session.commit()

    def _sanitize_result(self, result, debuginfo=False):
        """Rename/remove internal keys of the result dict in place and return it.

        'release' is always renamed to '_release'. With debuginfo the 'audit'
        key is renamed to '_audit'; without it 'id', 'audit', 'subtasks', the
        'InitAnalysisFlow' analysis and every analysis' '_audit' are dropped.
        """
        result['_release'] = result.pop('release', None)

        if debuginfo:
            result['_audit'] = result.pop('audit', None)
            return result

        for internal_key in ('id', 'audit', 'subtasks'):
            result.pop(internal_key, None)

        analyses = result.get('analyses', {})
        # Do not show init task
        analyses.pop('InitAnalysisFlow', None)
        for payload in analyses.values():
            if payload:
                payload.pop('_audit', None)

        return result
class LinguistTask(BaseTask):
    """Task detecting file types and languages of all files in the source tarball."""
    _analysis_name = 'languages'
    description = "GitHub's tool to figure out what language is used in code"
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    # Matches e.g. "12 lines (10 sloc)"; compiled once instead of on every call.
    _LINES_MATCHER = re.compile(r'(\d+) lines \((\d+) sloc\)')

    def _parse_linguist(self, output):
        """
        Parse the output lines of the `linguist` command.

        :param output: sequence of output lines; the first one is searched for
                       "<N> lines (<M> sloc)", the following (up to) three are
                       `key: value` lines for type, mime and language
        :return: None for empty output, otherwise a dict with 'lines', 'sloc'
                 and the 'type'/'mime'/'language' values that were present
        """
        if not output:
            return None

        def extract_value(line):
            """ `language:   Python` -> `Python` """
            return line.split(':', 1)[1].strip()

        lines, sloc = 0, 0
        m = self._LINES_MATCHER.search(output[0])
        if m:
            # group(2) is the sloc count; the former `m.groups(2)[0]` returned
            # group 1 again, so sloc was always reported equal to lines.
            lines, sloc = int(m.group(1)), int(m.group(2))
        tml = zip(['type', 'mime', 'language'],
                  [extract_value(line) for line in output[1:4]])
        return dict(tml, lines=lines, sloc=sloc)

    def execute(self, arguments):
        """
        Run `file` (and `linguist` for non-binary files) on every file.

        :param arguments: dictionary with 'ecosystem', 'name' and 'version'
        :return: dict with 'status', an empty 'summary' and one 'details'
                 entry per file (type, linguist output, relative path)
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        results = []
        cache_path = ObjectCache.get_from_dict(
            arguments).get_extracted_source_tarball()

        def worker(path):
            # `file -b -i` reports the mime type; its last output line is used
            mime = TimedCommand.get_command_output(['file', path, '-b',
                                                    '-i']).pop()
            self.log.debug("%s mime = %s", path, mime)
            typ = TimedCommand.get_command_output(['file', path, '-b'])
            self.log.debug("%s filetype = %s", path, typ)

            # linguist is only run on files `file` does not flag as binary
            linguist = None
            if 'charset=binary' not in mime:
                linguist = self._parse_linguist(
                    TimedCommand.get_command_output(['linguist', path]))
                self.log.debug("%s linguist output = %s", path, linguist)

            results.append({
                "type": typ,
                "output": linguist,
                "path": os.path.relpath(path, cache_path),
            })

        with ThreadPool(target=worker) as tp:
            for path in get_all_files_from(cache_path,
                                           path_filter=skip_git_files):
                tp.add_task(path)

        return {'summary': [], 'status': 'success', 'details': results}
class DigesterTask(BaseTask):
    """Task computing digests of the source tarball and of every file in it."""
    _analysis_name = 'digests'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    description = 'Computes various digests of all files found in target cache path'

    def compute_ssdeep(self, target):
        """ Return the ssdeep hash of target as printed by the `ssdeep` command

        :raises RuntimeError: when the command output has no second line
        """
        output = TimedCommand.get_command_output(['ssdeep', '-c', '-s', target])
        # line 0 is the ssdeep header, line 1 is "hash,filename"
        try:
            hash_and_name = output[1]
        except IndexError:
            self.log.error("unable to compute ssdeep of %r", target)
            raise RuntimeError("can't compute digest of %r" % target)
        return hash_and_name.split(',')[0].strip()

    def compute_digests(self, cache_path, f, artifact=False):
        """ Compute sha256, sha1, md5 and ssdeep digests of file f

        :param cache_path: directory the reported path is made relative to
        :param f: path of the file to digest
        :param artifact: if True, mark the entry as artifact and report only
                         the base name of f as its path
        :return: dict with the digests and the 'path' (plus 'artifact')
        """
        f_digests = {algorithm: compute_digest(f, algorithm)
                     for algorithm in ('sha256', 'sha1', 'md5')}
        f_digests['ssdeep'] = self.compute_ssdeep(f)

        if not artifact:
            f_digests['path'] = os.path.relpath(f, cache_path)
            return f_digests

        f_digests['artifact'] = True
        f_digests['path'] = os.path.basename(f)
        return f_digests

    def execute(self, arguments):
        """ Digest all extracted files and the source tarball itself

        :param arguments: dictionary with 'ecosystem', 'name' and 'version'
        :return: dict with 'status', an empty 'summary' and the digests in 'details'
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        epv_cache = ObjectCache.get_from_dict(arguments)
        cache_path = epv_cache.get_extracted_source_tarball()

        results = [self.compute_digests(cache_path, path)
                   for path in get_all_files_from(cache_path, path_filter=skip_git_files)]

        # The tarball itself is digested as well. For nodejs, npm clients
        # prior to npm-2.x.x (Fedora 24) repackaged modules on download and
        # changed file permissions inside package.tgz to match the UID/GID of
        # the user running npm, so that digest differed from the one of the
        # tarball downloaded directly from registry.npmjs.org.
        source_tarball_path = epv_cache.get_source_tarball()
        tarball_digests = self.compute_digests(source_tarball_path,
                                               source_tarball_path,
                                               artifact=True)
        results.append(tarball_digests)

        return {'summary': [], 'status': 'success', 'details': results}
 def test_schema_lookup(self, tmpdir):
     """Lookup fails while the schema file is missing and succeeds once it is written."""
     lib = SchemaLibrary(str(tmpdir))
     ref = SchemaRef("example", "1-0-0")

     # Nothing has been written to the directory yet.
     with pytest.raises(SchemaLookupError):
         lib.load_schema(ref)

     schema_file = tmpdir.join("example-v1-0-0.schema.json")
     expected = {"dummy-schema": "example"}
     raw = json.dumps(expected).encode('utf-8')
     schema_file.write_binary(raw)

     # Both the raw bytes and the parsed document must round-trip.
     assert lib.read_binary_schema(ref) == raw
     assert lib.load_schema(ref) == expected
Example #9
0
class StackAnalysesByGraphGET(ResourceWithSchema):
    """Endpoint assembling the stack analysis response from stored worker results."""
    method_decorators = [login_required]

    schema_ref = SchemaRef('stack_analyses', '2-1-4')

    @staticmethod
    def get(external_request_id):
        """
        Build the response from the 'stack_aggregator' and 'recommendation'
        worker results stored for the given request.

        :param external_request_id: ID of the stack analysis request
        :return: dict with 'started_at', 'finished_at', 'request_id', 'result'
                 and 'recommendation'
        :raises HTTPError: 202 when no worker result is stored yet, 500 when
                           the query fails or the response cannot be built
        """
        try:
            results = rdb.session.query(WorkerResult)\
                                 .filter(WorkerResult.external_request_id == external_request_id,
                                         or_(WorkerResult.worker == "stack_aggregator",
                                             WorkerResult.worker == "recommendation"))
            if results.count() <= 0:
                raise HTTPError(
                    202, "Analysis for request ID '{t}' is in progress".format(
                        t=external_request_id))
        except SQLAlchemyError:
            raise HTTPError(
                500,
                "Worker result for request ID '{t}' doesn't exist yet".format(
                    t=external_request_id))

        try:
            recommendation_result = {}
            audit = ""
            # Kept in its own variable: the parameter used to be overwritten
            # with "" here, which blanked the ID in the error message below.
            request_id = ""
            manifest_response = []

            for row in results:
                result = row.to_dict()
                if result["worker"] == "stack_aggregator":
                    audit = result["task_result"]["_audit"]
                    request_id = result["external_request_id"]
                    manifest_response.append(result["task_result"])
                else:
                    recommendation_result = {
                        "recommendations":
                        result["task_result"]["recommendations"]
                    }

            # Without a stack_aggregator row `audit` is still "" and the
            # lookups below fail, which is reported as a 500 (as before).
            response = {
                "started_at": audit["started_at"],
                "finished_at": audit["ended_at"],
                "request_id": request_id,
                "result": manifest_response,
                "recommendation": recommendation_result
            }
            return response
        except Exception:
            # Narrowed from a bare `except:` so that KeyboardInterrupt and
            # SystemExit are not converted into an HTTP error.
            raise HTTPError(
                500, "Error creating response for request {t}".format(
                    t=external_request_id))
 def test_bundled_schema_lookup(self, tmpdir):
     """Bundled lookup fails while the schema file is missing and succeeds once it is written."""
     # Build an importable package that contains a "schemas" directory.
     package_dir = tmpdir.mkdir(tmpdir.basename)
     package_dir.ensure("__init__.py")
     schemas_dir = package_dir.mkdir("schemas")
     package = package_dir.pyimport()

     lib = BundledSchemaLibrary("schemas", package.__name__)
     ref = SchemaRef("example", "1-0-0")

     # The schemas directory is still empty.
     with pytest.raises(SchemaLookupError):
         lib.load_schema(ref)

     schema_file = schemas_dir.join("example-v1-0-0.schema.json")
     expected = {"dummy-schema": "example"}
     raw = json.dumps(expected).encode('utf-8')
     schema_file.write_binary(raw)

     # Both the raw bytes and the parsed document must round-trip.
     assert lib.read_binary_schema(ref) == raw
     assert lib.load_schema(ref) == expected
class LicenseCheckTask(BaseTask):
    """Task running `license_check.py` over the sources of a package."""
    _analysis_name = 'source_licenses'
    description = "Check licences of all files of a package"
    schema_ref = SchemaRef(_analysis_name, '2-0-0')

    def execute(self, arguments):
        """
        Run the license check on the package sources.

        For maven packages whose sources cannot be obtained, the scan falls
        back to the extracted source tarball; for any other ecosystem the
        error is logged and re-raised.

        :param arguments: dictionary with 'ecosystem', 'name' and 'version'
        :return: dict with 'status', 'summary' and 'details'; 'status' is
                 'error' when the scan itself failed
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            eco = arguments.get('ecosystem')
            pkg = arguments.get('name')
            ver = arguments.get('version')
            if eco != 'maven':
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            # The two literals are concatenated, hence the trailing space
            # (the message used to read "...{v},will try...").
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        try:
            result_data['details'] = TimedCommand.get_command_output(
                ['license_check.py', cache_path], graceful=False, is_json=True)
            # 'status' and 'summary' are moved from the tool output to the
            # top level of the task result
            result_data['status'] = result_data['details'].pop('status')
            result_data['summary'] = result_data['details'].pop('summary')
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not reported as a failed scan.
            self.log.exception("License scan failed")
            result_data['status'] = 'error'

        return result_data
Example #12
0
class BinwalkTask(BaseTask):
    """Task running `binwalk -B` on every file found under the source tarball path."""
    _analysis_name = 'binary_data'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    description = "Find and extract interesting files / data from binary images"

    def parse_binwalk(self, output):
        """
        Extract the description column from binwalk output lines.

        :param output: iterable of output lines
        :return: None for empty output, otherwise the list of descriptions of
                 all lines of the form "<decimal> 0x<hex> <description>"
        """
        if not output:
            return None
        import re
        # Raw string: '\d' and '\s' in a plain literal are invalid escape
        # sequences and trigger a warning on current Python versions.
        matcher = re.compile(r'^\d{,8}\s*0x[A-Fa-f0-9]{,8}\s*(.*)$')
        matched = []
        for line in output:
            match = matcher.match(line)
            if match:
                # group(1) is the description; same value as the former
                # `match.groups(1)[0]`, without the misleading default argument
                matched.append(match.group(1))
        return matched

    def execute(self, arguments):
        """
        Run binwalk on each file and collect the parsed output.

        :param arguments: dictionary with 'ecosystem', 'name' and 'version'
        :return: dict with 'status', an empty 'summary' and one 'details'
                 entry (relative path, parsed output) per file
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

        results = []
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            self.log.debug("path = %s", path)

            bw = TimedCommand(['binwalk', '-B', path])
            status, output, error = bw.run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            parsed_binwalk = self.parse_binwalk(output)
            results.append({
                "path": os.path.relpath(path, cache_path),
                "output": parsed_binwalk,
            })
        return {'summary': [], 'status': 'success', 'details': results}
Example #13
0
class StackAnalysesByGraphGET(ResourceWithSchema):
    """Endpoint assembling the stack analysis response from stored worker results."""
    method_decorators = [login_required]
    schema_ref = SchemaRef('stack_analyses', '2-1-4')

    @staticmethod
    def get(external_request_id):
        """
        Build the response from the 'stack_aggregator' and 'recommendation'
        worker results of the given request.

        :param external_request_id: ID of the stack analysis request
        :return: dict with 'started_at', 'finished_at', 'request_id', 'result'
                 and 'recommendation'
        :raises HTTPError: 202 when both results are None, 404 when both are -1
        """
        stack_result = retrieve_worker_result(rdb, external_request_id, "stack_aggregator")
        reco_result = retrieve_worker_result(rdb, external_request_id, "recommendation")

        if stack_result is None and reco_result is None:
            raise HTTPError(202, "Analysis for request ID '{t}' is in progress".format(t=external_request_id))

        if stack_result == -1 and reco_result == -1:
            raise HTTPError(404, "Worker result for request ID '{t}' doesn't exist yet".format(t=external_request_id))

        started_at = None
        finished_at = None
        manifest_response = []
        recommendation = {}

        # A result can still be None or -1 here when only the other worker has
        # a usable result; `'task_result' in -1` would raise TypeError, so both
        # values are skipped explicitly.
        if stack_result is not None and stack_result != -1 and 'task_result' in stack_result:
            stack_task = stack_result["task_result"]
            if stack_task is not None:
                started_at = stack_task["_audit"]["started_at"]
                finished_at = stack_task["_audit"]["ended_at"]
                manifest_response.append(stack_task)

        if reco_result is not None and reco_result != -1 and 'task_result' in reco_result:
            reco_task = reco_result["task_result"]
            if reco_task is not None:
                recommendation = reco_task['recommendations']

        return {
            "started_at": started_at,
            "finished_at": finished_at,
            "request_id": external_request_id,
            "result": manifest_response,
            "recommendation": recommendation
        }
class BlackDuckProject(object):
    """
    Project parsed from its JSON description; a project carries the
    information about one specific {ecosystem}-{package} pair
    """
    @schema.input(SchemaRef("blackduck-project", "1-0-0"))
    def __init__(self, json_data):
        self._source = json_data
        self._name = json_data['name']
        self._id = json_data['id']
        self._canonical_release_id = json_data['canonicalReleaseId']

        # Every top-level field whose key ends with "Url" is collected.
        url_fields = {}
        for key, value in json_data.items():
            if key.endswith('Url'):
                url_fields[key] = value
        self._urls = url_fields

    @property
    def source(self):
        """ The JSON data this object was created from """
        return self._source

    @property
    def name(self):
        """ Project name """
        return self._name

    @property
    def id(self):
        """ Identifier of the project (the 'id' field) """
        return self._id

    @property
    def canonical_release_id(self):
        """ Value of the 'canonicalReleaseId' field of the project """
        return self._canonical_release_id

    @property
    def urls(self):
        """ Dict of the project fields whose name ends with 'Url' """
        return self._urls
 def test_bundled_dynamic_schema_lookup(self, tmpdir, monkeypatch):
     """Check loading of schema classes and roles from dynamically imported modules.

     Covers a missing module, a module without the role variable, a module
     without THE_SCHEMA, a complete module, a complete module asked for a
     version it does not define, and a module defining two roles.
     """
     pkgdir = tmpdir.mkdir(tmpdir.basename)
     pkgdir.ensure("__init__.py")
     schemadir = pkgdir.mkdir("schemas")
     schemadir.ensure("__init__.py")
     library = BundledDynamicSchemaLibrary('.'.join(
         [tmpdir.basename, "schemas"]))
     schema1 = SchemaRef("example", "1-0-0")
     schema2 = SchemaRef("example2", "1-0-0")
     schema3 = SchemaRef("example3", "1-0-0")
     schema4 = SchemaRef("example4", "1-0-0")
     schema5 = SchemaRef("example4", "2-0-0")  # intentionally example4
     schema6 = SchemaRef("example6", "2-0-0")
     # no module has been written yet, so the import itself fails
     with pytest.raises(SchemaImportError):
         library.load_schema_class_and_role(schema1)
     # sch2 doesn't have the ROLE_v1_0_0 variable
     sch2 = "import jsl;\nclass Schema(jsl.Document):\n x = jsl.StringField()\n"
     # sch3 doesn't have THE_SCHEMA variable
     sch3 = sch2 + "\nROLE_v1_0_0 = 'v1-0-0'\n"
     # sch4 is ok
     sch4 = sch3 + "\nTHE_SCHEMA = Schema\n"
     # no sch5; sch6 is ok and has two roles
     sch6 = sch4 + "\nROLE_v2_0_0 = 'v2-0-0'\n"
     schemadir.join("example2.py").write(sch2)
     schemadir.join("example3.py").write(sch3)
     schemadir.join("example4.py").write(sch4)
     schemadir.join("example6.py").write(sch6)
     monkeypatch.syspath_prepend(pkgdir.dirname)
     with pytest.raises(SchemaModuleAttributeError):
         library.load_schema_class_and_role(schema2)
     with pytest.raises(SchemaModuleAttributeError):
         library.load_schema_class_and_role(schema3)
     klass, role = library.load_schema_class_and_role(schema4)
     assert "x" in dir(klass)
     assert role == "v1-0-0"
     with pytest.raises(SchemaModuleAttributeError):
         # example 5 is the same as example 4, but doesn't have the required version 2-0-0
         library.load_schema_class_and_role(schema5)
     klass6, role6 = library.load_schema_class_and_role(schema6)
     # check the class that was just loaded (this used to re-check `klass`
     # from example4 and left klass6 unverified)
     assert "x" in dir(klass6)
     assert role6 == "v2-0-0"
class DownstreamUsageTask(BaseTask):
    """Queries Red Hat's internal toolchain for downstream component usage

    - queries Anitya for downstream package names
    - uses the package name and component version to query:
      - Brew for internal SRPM and build details
      - the Pulp CDN for redistribution details
    """
    _analysis_name = 'redhat_downstream'
    description = 'Queries Red Hat internal toolchain for downstream usage'
    schema_ref = SchemaRef(_analysis_name, '2-2-1')

    # Ecosystem backends accepted by _fetch_anitya_project(); any other backend
    # makes that method raise ValueError
    _backend_to_anitya_ecosystem = {
        EcosystemBackend.npm: 'npm',
        EcosystemBackend.maven: 'maven',
        EcosystemBackend.pypi: 'pypi',
        EcosystemBackend.rubygems: 'rubygems'
    }

    # Prefix put in front of the upstream package name (as "<prefix>-<name>")
    # by _prefix_package_name(); ecosystems not listed here get no prefix
    _ecosystem_to_prefix = {
        'npm': 'nodejs',
        'pypi': 'python',
        'rubygems': 'rubygem'
    }

    # Give CLI 10 minutes to retrieve results
    _BREWUTILS_CLI_TIMEOUT = 600

    def _get_artifact_hash(self, algorithm=None):
        """Return the digest of the artifact from the parent 'digests' task result.

        :param algorithm: key of the digest to return, 'md5' when not given
        :return: the digest of the first details entry flagged as 'artifact',
                 None if there is no parent result or no such entry
        """
        wr = self.parent_task_result('digests')
        if wr:
            for details in wr['details']:
                if details.get('artifact'):
                    return details[algorithm or 'md5']
        return None

    @staticmethod
    def _prefix_package_name(name, ecosystem):
        """Return "<prefix>-<name>" for ecosystems with a known prefix, name otherwise."""
        prefix = DownstreamUsageTask._ecosystem_to_prefix.get(ecosystem, '')
        if prefix:
            return '{p}-{n}'.format(p=prefix, n=name)

        return name

    def _fetch_anitya_project(self, ecosystem, package):
        """Query Anitya for the given ecosystem/package.

        :return: the result of _query_anitya_url(), or None when the request
                 failed with requests.HTTPError or requests.ConnectionError
        :raises ValueError: when the backend of the ecosystem is not listed in
                            _backend_to_anitya_ecosystem
        """
        eco_model = self.storage.get_ecosystem(ecosystem)
        # the mapped value is only used as a "is this backend supported" check
        backend = self._backend_to_anitya_ecosystem.get(eco_model.backend, None)
        if backend is None:
            raise ValueError('Don\'t know how to add ecosystem {e} with backend {b} to Anitya'.
                             format(e=ecosystem, b=eco_model.backend))
        api_path = '/api/by_ecosystem/{e}/{p}/'.format(e=ecosystem, p=package)
        anitya_url = config.anitya_url
        try:
            return _query_anitya_url(anitya_url, api_path)
        except (requests.HTTPError, requests.ConnectionError):
            msg = 'Failed to contact Anitya server at {}'
            self.log.exception(msg.format(config.anitya_url))
        return None

    def _get_cdn_metadata(self, srpm_filename):
        """Try to retrieve Pulp CDN metadata

        :return: the metadata for the SRPM, or None when the Pulp client could
                 not be created or the lookup raised (both cases are logged)
        """
        try:
            pulp = Pulp()
        except ValueError as e:
            self.log.error(e)
            return None
        try:
            metadata = pulp.get_cdn_metadata_for_srpm(srpm_filename)
        except Exception as e:
            self.log.exception(e)
            return None
        return metadata

    def _add_mvn_results(self, result_summary, anitya_mvn_names, version):
        """Record downstream Maven versions matching the given upstream version.

        For every name in anitya_mvn_names the maven-metadata.xml is downloaded
        from RH_MVN_GA_REPO and its versions are compared with `version`.
        The matches are stored in result_summary['rh_mvn_matched_versions'];
        if there is at least one, 'Middleware' is appended to
        result_summary['all_rhsm_product_names'].
        """
        def _compare_version(downstream, upstream):
            # True when the downstream version, with everything from one
            # character before 'redhat' onwards cut off, equals the upstream one
            dv = downstream
            if 'redhat' in dv:
                # remove ".redhat-X" or "-redhat-X" suffix
                dv = dv[:dv.find('redhat')-1]
            if dv == upstream:
                return True
            else:
                return False

        downstream_rebuilds = []

        for name in anitya_mvn_names:
            metadata_url = '{repo}/{pkg}/maven-metadata.xml'.format(repo=RH_MVN_GA_REPO,
                                                                    pkg=mvn_pkg_to_repo_path(name))
            res = requests.get(metadata_url)
            if res.status_code != 200:
                self.log.info('Metadata for package {pkg} not found in {repo} (status {code})'.
                              format(pkg=name, repo=RH_MVN_GA_REPO, code=res.status_code))
                continue
            versions = anymarkup.parse(res.text)['metadata']['versioning']['versions']['version']
            # make sure 'versions' is a list (it's a string if there is just one version)
            if not isinstance(versions, list):
                versions = [versions]
            self.log.info('Found versions {v} for package {p}'.format(v=versions, p=name))
            for v in versions:
                if _compare_version(v, version):
                    downstream_rebuilds.append(v)

        result_summary['rh_mvn_matched_versions'] = downstream_rebuilds
        if downstream_rebuilds:
            # For now, we don't distinguish products, we just use general "Middleware"
            #  for all Maven artifacts
            result_summary['all_rhsm_product_names'].append('Middleware')

    @staticmethod
    def _is_inside_rh():
        """Returns True if running on RH network, False otherwise.

        The decision is based on the OPENSHIFT_DEPLOYMENT environment variable:
        True when it is unset or parses to the integer 0, False for any other
        value (including values that are not integers).
        """
        is_inside = False
        try:
            is_inside = int(os.environ.get("OPENSHIFT_DEPLOYMENT", 0)) == 0
        except ValueError:
            pass
        return is_inside

    def execute(self, arguments):
        """Collect downstream usage data for the given component.

        :param arguments: dictionary with 'ecosystem', 'name' and 'version'
        :return: dict with 'status', 'summary' (package names, SRPMs, channels,
                 content sets, product names, matched Maven versions) and
                 'details' (raw responses of the queried tools)
        :raises TaskError: when brew-utils-cli produced no output
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        tool_responses = {}
        result_summary = {
            'package_names': [],
            'registered_srpms': [],
            'all_rhn_channels': [],
            'all_rhsm_content_sets': [],
            'all_rhsm_product_names': []
        }
        # result_data references the two dicts above, so later updates of
        # result_summary / tool_responses show up in the returned value
        result_data = {'status': 'error',
                       'summary': result_summary,
                       'details': tool_responses
                       }

        # bail out early; we need access to internal services or the package is from Maven ecosystem,
        # otherwise we can't comment on downstream usage
        is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
        if not self._is_inside_rh() and not is_maven:
            return result_data

        self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
        res = self._fetch_anitya_project(eco, pkg)
        anitya_rpm_names = []
        anitya_mvn_names = []
        # NOTE: a failed Anitya lookup only sets the status to 'error'; the
        # Brew and Maven steps below still run
        if res is None:
            result_data['status'] = 'error'
        elif res.status_code == 200:
            self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
            anitya_response = res.json()
            tool_responses['redhat_anitya'] = anitya_response
            # For now, we assume all downstreams are ones we care about
            for entry in anitya_response['packages']:
                if entry['distro'] == RH_RPM_DISTRO_NAME:
                    anitya_rpm_names.append(entry['package_name'])
                elif entry['distro'] == RH_MVN_DISTRO_NAME:
                    anitya_mvn_names.append(entry['package_name'])
                else:
                    self.log.warning(
                        'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                                     format(d=entry['distro'], o=entry['package_name'], p=pkg)
                    )
            self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
            self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
            # TODO: Report 'partial' here and switch to 'success' at the end
            result_data['status'] = 'success'
        else:
            msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
            self.log.error(msg.format(e=eco, p=pkg, r=res.text))
            result_data['status'] = 'error'

        if self._is_inside_rh():
            # we have candidate downstream name mappings, check them against Brew
            # (without Anitya candidates the prefixed upstream name is tried)
            seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
            self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

            args = ['brew-utils-cli', '--version', arguments['version']]
            artifact_hash = self._get_artifact_hash(algorithm='sha256')
            if artifact_hash:
                args += ['--digest', artifact_hash]
            args += seed_names

            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(timeout=self._BREWUTILS_CLI_TIMEOUT,
                                                                                cmd=args))
            tc = TimedCommand(args)
            status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
            self.log.debug("status = %s, error = %s", status, error)
            # presumably `output` is a list of output lines - TODO confirm
            output = ''.join(output)
            self.log.debug("output = %s", output)
            if not output:
                raise TaskError("Error running command %s" % args)
            brew = json.loads(output)

            result_summary['package_names'] = brew['packages']
            result_summary['registered_srpms'] = brew['response']['registered_srpms']
            tool_responses['brew'] = brew['response']['brew']

            # we have SRPM details, fetch details on where the RPMs are shipped
            tool_responses['pulp_cdn'] = pulp_responses = []
            rhn_channels = set()
            rhsm_content_sets = set()
            rhsm_product_names = set()
            for srpm_summary in result_summary['registered_srpms']:
                srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                             v=srpm_summary['version'],
                                                             r=srpm_summary['release'])
                cdn_metadata = self._get_cdn_metadata(srpm_filename)
                if cdn_metadata is None:
                    msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                    self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                    continue
                pulp_responses.append(cdn_metadata)
                # the SRPM summary (part of the returned result) is annotated in place
                srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
                rhn_channels.update(cdn_metadata['rhn_channels'])
                rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
                rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
            result_summary['all_rhn_channels'] = sorted(rhn_channels)
            result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
            result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

        # runs unconditionally; with no Maven names from Anitya it only stores
        # an empty 'rh_mvn_matched_versions' list
        self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

        return result_data
Example #17
0
class DependencySnapshotTask(BaseTask):
    """Resolve every dependency declared in the parent 'metadata' task result."""

    _analysis_name = 'dependency_snapshot'
    description = 'Task that analyzes dependencies'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _collect_dependencies(self):
        """
        Return all dependencies for current analysis flow (operates on parent mercator result)

        :return: List[str], list of dependencies (duplicates removed, order unspecified)
        :raises TaskError: if the parent 'metadata' task result is not a dict
        """
        wr = self.parent_task_result('metadata')
        if not isinstance(wr, dict):
            raise TaskError('metadata task result has unexpected type: {}; expected dict'.
                            format(type(wr)))

        # there can be details about multiple manifests in the metadata, therefore we will collect dependency
        # specifications from all of them and exclude obvious duplicates along the way
        dependencies = list({dep for m in wr.get('details', []) if m.get('dependencies')
                             for dep in m.get('dependencies', [])})
        return dependencies

    def _resolve_dependency(self, ecosystem, dep):
        """
        Resolve a single dependency declaration to a concrete name and version

        :param ecosystem: ecosystem object; its `name` is recorded and it selects the solver
        :param dep: str, dependency declaration, either "name" or "name spec"
        :return: dict with keys 'ecosystem', 'declaration', 'resolved_at', 'name' and 'version'
        :raises TaskError: if the solver returns nothing or an empty version
        """
        ret = {'ecosystem': ecosystem.name,
               'declaration': dep,
               'resolved_at': json_serial(datetime.datetime.now())}

        # first, if this is a Github dependency, return it right away (we don't resolve these yet)
        if ' ' in dep:
            # we have both package name and version (version can be an URL)
            name, spec = dep.split(' ', 1)
            if gh_dep.match(spec):
                ret['name'] = name
                ret['version'] = 'https://github.com/' + spec
            elif urllib.parse.urlparse(spec).scheme:
                # a non-empty scheme means the spec is a URL; compare by value, not identity
                ret['name'] = name
                ret['version'] = spec
        else:
            if gh_dep.match(dep):
                ret['name'] = 'https://github.com/' + dep
                ret['version'] = None
            elif urllib.parse.urlparse(dep).scheme:
                ret['name'] = dep
                ret['version'] = None

        if 'name' in ret:
            return ret

        # second, figure out what is the latest upstream version matching the spec and return it
        solver = get_ecosystem_solver(ecosystem)
        pkgspec = solver.solve([dep])

        if not pkgspec:
            raise TaskError("invalid dependency: {}".format(dep))

        package, version = pkgspec.popitem()
        if not version:
            raise TaskError("bad version resolved for {}".format(dep))

        ret['name'] = package
        ret['version'] = version
        return ret

    def execute(self, arguments):
        """
        Resolve all dependencies found by the parent 'metadata' task

        :param arguments: dict, task arguments; 'ecosystem' is required
        :return: dict with 'status', 'summary' (errors and dependency counts) and
                 'details' (resolved runtime dependencies)
        """
        self._strict_assert(arguments.get('ecosystem'))

        result = {'summary': {'errors': [], 'dependency_counts': {}},
                  'status': 'success', 'details': {}}
        ecosystem = self.storage.get_ecosystem(arguments.get('ecosystem'))
        try:
            deps = self._collect_dependencies()
        except TaskError as e:
            self.log.error(str(e))
            result['summary']['errors'].append(str(e))
            result['status'] = 'error'
            return result

        resolved_deps = []
        for dep in deps:
            try:
                resolved = self._resolve_dependency(ecosystem, dep)
            except TaskError as e:
                self.log.error(str(e))
                result['summary']['errors'].append(str(e))
                result['status'] = 'error'
                # nothing was resolved for this dependency; skipping avoids appending
                # an unbound or stale `resolved` left over from a previous iteration
                continue
            self.log.info('resolved dependency %s as %s', dep, resolved)
            resolved_deps.append(resolved)
        # in future, we may want to provide also build/test dependencies, not just runtime
        result['details']['runtime'] = resolved_deps
        result['summary']['dependency_counts']['runtime'] = len(resolved_deps)
        return result
Example #18
0
class StackAnalysesById(ResourceWithSchema):
    """Resource that returns the stored result of a stack analysis request."""

    schema_ref = SchemaRef('stack_analyses', '2-1-3')

    def get(self, external_request_id):
        """
        Build the stack analysis response for the given request ID

        :param external_request_id: ID of the StackAnalysisRequest to look up
        :return: dict with 'status', 'submitted_at', 'started_at', 'finished_at',
                 'request_id' and 'result' (one entry per manifest)
        :raises HTTPError: 404 for an unknown request ID, 202 while no
                 'dependency_aggregator' worker result exists, 500 on database
                 errors or when the response cannot be assembled
        """
        submitted_at = ""
        manifest_appstackid_map = {}
        try:
            results = rdb.session.query(StackAnalysisRequest)\
                                 .filter(StackAnalysisRequest.id == external_request_id)
            if results.count() <= 0:
                raise HTTPError(
                    404, "Invalid request ID '{id}' received".format(
                        id=external_request_id))

            row = results.first().to_dict()
            submitted_at = row["submitTime"]
            request_json = json.loads(row["requestJson"])

            # remember which manifests have an appstack ID so that a
            # recommendation can be requested for them below
            for manifest in request_json["manifest"]:
                if manifest.get('appstack_id', 0):
                    manifest_appstackid_map[
                        manifest["filename"]] = manifest["appstack_id"]

        except SQLAlchemyError:
            raise HTTPError(
                500, "Error fetching data for request ID '{id}'".format(
                    id=external_request_id))

        try:
            results = rdb.session.query(WorkerResult)\
                                 .filter(WorkerResult.external_request_id == external_request_id,
                                         WorkerResult.worker == "dependency_aggregator")
            if results.count() <= 0:
                raise HTTPError(
                    202, "Analysis for request ID '{t}' is in progress".format(
                        t=external_request_id))
        except SQLAlchemyError:
            raise HTTPError(
                500,
                "Worker result for request ID '{t}' doesn't exist yet".format(
                    t=external_request_id))

        try:
            if results.count() > 0:
                result = results.first().to_dict()
                audit = result["task_result"]["_audit"]
                manifest_response = []

                # TODO: this will probably need some refactoring

                for manifest in result["task_result"]["result"]:
                    for component in manifest["components"]:
                        component["latest_version"] = safe_get_latest_version(
                            component["ecosystem"], component["name"])
                        component["dependents_count"] = get_dependents_count(
                            component["ecosystem"], component["name"],
                            component["version"], rdb.session)
                        component["relative_usage"] = usage_rank2str(
                            get_component_percentile_rank(
                                component["ecosystem"], component["name"],
                                component["version"], rdb.session))
                    manifest_appstack_id = manifest_appstackid_map.get(
                        manifest["manifest_name"], '')
                    if manifest_appstack_id != '':
                        url = current_app.config['BAYESIAN_ANALYTICS_URL']
                        endpoint = "{analytics_baseurl}/api/v1.0/recommendation/{appstack_id}".format(
                            analytics_baseurl=url,
                            appstack_id=manifest_appstack_id)
                        resp = requests.get(endpoint)
                        if resp.status_code == 200:
                            recommendation = resp.json()

                            # Adding URI of the stacks to the recommendation
                            if recommendation.get("input_stack", {}).get(
                                    "appstack_id", "") != "":
                                recommendation["input_stack"][
                                    "uri"] = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}".format(
                                        analytics_baseurl=url,
                                        appstack_id=recommendation[
                                            "input_stack"]["appstack_id"])

                            if recommendation.get("recommendations", {}).get(
                                    "similar_stacks", "") != "":
                                for r in recommendation["recommendations"][
                                        "similar_stacks"]:
                                    if r["stack_id"] != "":
                                        r["uri"] = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}".format(
                                            analytics_baseurl=url,
                                            appstack_id=r["stack_id"])
                            manifest["recommendation"] = recommendation
                        else:
                            # the manifest is still returned, just without a recommendation
                            current_app.logger.warning("{status}: {error}".format(
                                status=resp.status_code, error=resp.content))

                    manifest_response.append(manifest)
                response = {
                    "status": result["task_result"]["status"],
                    "submitted_at": submitted_at,
                    "started_at": audit["started_at"],
                    "finished_at": audit["ended_at"],
                    "request_id": result["external_request_id"],
                    "result": manifest_response
                }
                return response
        except Exception:
            # `except Exception` lets SystemExit/KeyboardInterrupt propagate; the
            # traceback is logged because the HTTP error below hides the cause
            current_app.logger.exception(
                "Error creating response for request {t}".format(
                    t=external_request_id))
            raise HTTPError(
                500, "Error creating response for request {t}".format(
                    t=external_request_id))
Example #19
0
class BlackDuckTask(BaseTask):
    """Look up (and, if needed, trigger) a Black Duck scan of a package release."""

    _analysis_name = 'blackduck'
    description = 'Scan the package using Black Duck'
    _valid_ecosystems = ["npm", "maven", "pypi"]
    _allow_cli_scan = True
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    _BLACKDUCK_CLI_TIMEOUT = 600

    def _format_hub_url(self):
        """
        Build the Hub base URL from the configured scheme, host and port

        :return: str, URL of the form "<scheme>://<host>:<port>/"
        """
        template = "{scheme}://{host}:{port}/"
        return template.format(scheme=config.blackduck_scheme,
                               host=config.blackduck_host,
                               port=config.blackduck_port)

    def _is_valid_ecosystem(self, ecosystem_id):
        """
        Tell whether Black Duck analysis is supported for the ecosystem

        :param ecosystem_id: ecosystem identifier, checked for membership in
                             `_valid_ecosystems` (a list of ecosystem names)
        :return: bool
        """
        return ecosystem_id in self._valid_ecosystems

    def _find_blackduck_cli_root(self):
        """
        Locate the single directory the BlackDuck CLI was extracted into

        :return: str, path to the CLI root
        :raises TaskError: if the configured path contains no entry or more than one
        """
        base = config.blackduck_path
        entries = listdir(base)
        if len(entries) > 1:
            raise TaskError("More than 1 BlackDuck CLI directory")
        if not entries:
            raise TaskError("Unable to find BlackDuck CLI directory")

        # exactly one entry is left at this point
        return path.join(base, entries[0])

    def _prepare_command(self, project, version, archive):
        """
        Assemble the scan CLI invocation

        :param project: str, name of the project
        :param version: str, version of the release
        :param archive: str, path to the archive with the sources
        :return: List[str], command list ready to be run
        """
        cli_root = self._find_blackduck_cli_root()
        binary = "{base}/{rel}".format(base=cli_root, rel="bin/scan.cli.sh")

        command = [binary]
        command += ["--host", config.blackduck_host]
        command += ["--port", str(int(config.blackduck_port))]
        command += ["--scheme", config.blackduck_scheme]
        command += ["--username", config.blackduck_username]
        command += ["--project", project]
        command += ["--release", version]
        command.append(archive)
        return command

    def _get_release(self, hub, project, version):
        """
        Look up the release of the given project version

        :param hub: BlackDuckHub, hub object to use
        :param project: str, name of the project
        :param version: str, version
        :return: BlackDuckRelease object or None if not found
        """
        # the project has to exist before its releases can be listed
        found_project = hub.find_project(project)
        if found_project:
            return hub.get_releases(found_project).get(version, None)
        return None

    def _release_data(self, hub, project, version):
        """
        Fetch release data for the given project and version

        :param hub: BlackDuckHub, hub object to use
        :param project: str, name of the project
        :param version: str, version
        :return: BoM information about the release, or None if the release is unknown
        """
        release = self._get_release(hub, project, version)
        return None if release is None else hub.get_release_bom_json(release)

    def _get_hub(self):
        """
        Create a hub object and open a session with the configured credentials

        :return: BlackDuckHub
        """
        url = self._format_hub_url()
        self.log.debug("hub url: {url}".format(url=url))
        connected_hub = BlackDuckHub(url)
        connected_hub.connect_session(config.blackduck_username,
                                      config.blackduck_password)
        return connected_hub

    def _get_project_name(self, arguments):
        """
        Derive the Black Duck project name as "<ecosystem>-<name>"

        :param arguments: dict with 'ecosystem' and 'name'
        :return: str
        """
        name_template = "{ecosystem}-{package}"
        return name_template.format(ecosystem=arguments['ecosystem'],
                                    package=arguments['name'])

    def execute(self, arguments):
        """
        Return Black Duck release data for the package, scanning it first if needed

        :param arguments: dict, task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'status', 'summary' and 'details'
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        result_data = {'status': 'unknown', 'summary': [], 'details': {}}

        if not self._is_valid_ecosystem(arguments['ecosystem']):
            result_data['status'] = 'error'
            return result_data

        hub = self._get_hub()

        # BlackDuck project doesn't have a notion of ecosystem, so the project names
        # are namespaced here: package `crumb` in the NPM ecosystem becomes `npm-crumb`
        project = self._get_project_name(arguments)
        version = arguments['version']

        # the project may already have been scanned
        data = self._release_data(hub, project, version)

        if self._allow_cli_scan and not data:
            self.log.debug("No data available for project {p} {v}".format(
                p=project, v=version))
            # nothing stored yet: run a scan, then query the release data again
            tarball = ObjectCache.get_from_dict(arguments).get_source_tarball()
            command = self._prepare_command(project, version, tarball)
            self.log.debug(
                "Executing command, timeout={timeout}: {cmd}".format(
                    timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command))
            # the password is handed over through the environment, not the command line
            status, output, error = TimedCommand(command).run(
                timeout=self._BLACKDUCK_CLI_TIMEOUT,
                update_env={'BD_HUB_PASSWORD': config.blackduck_password})
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)
            data = self._release_data(hub, project, version)

        self.log.debug("Release data for project {p} {v}: {d}".format(
            p=project, v=version, d=data))
        result_data['details'] = data
        result_data['status'] = 'success' if data else 'error'
        return result_data
Example #20
0
class StackAnalysesByGraph(ResourceWithSchema):
    """Resource that analyses uploaded manifest files against the graph database."""

    schema_ref = SchemaRef('stack_analyses', '2-1-4')

    @staticmethod
    def _gremlin_quote(value):
        """
        Render a value as a single-quoted string literal for a Gremlin script

        Backslashes and single quotes are escaped so that text taken from an
        uploaded manifest cannot terminate the literal and inject script code.

        :param value: value to embed (converted with str())
        :return: str, quoted literal including the surrounding quotes
        """
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return "'" + escaped + "'"

    def post(self):
        """
        Analyse the uploaded manifests and store the request

        Reads the files from the 'manifest[]' form field. Only manifests whose
        filename contains 'package.json' contribute to the result; for those the
        recommendation API and the graph endpoint are queried.

        :return: dict with 'status', 'request_id' and 'result', or a JSON error
                 response if no or an unsupported/invalid manifest was uploaded
        :raises HTTPError: 500 if the request cannot be stored in the database
        """
        session = FuturesSession()
        files = request.files.getlist('manifest[]')
        dt = datetime.datetime.now()
        origin = request.form.get('origin')

        # At least one manifest file should be present to analyse a stack
        if not files:
            return jsonify(
                error="Error processing request. Please upload a valid manifest files."
            )

        request_id = uuid.uuid4().hex
        manifests = []
        result = []
        for f in files:
            filename = f.filename

            # check if manifest files with given name are supported
            manifest_descriptor = get_manifest_descriptor_by_filename(filename)
            if manifest_descriptor is None:
                return jsonify(
                    error="Manifest file '{filename}' is not supported".format(
                        filename=filename))

            content = f.read().decode('utf-8')

            # Check if the manifest is valid
            if not manifest_descriptor.validate(content):
                return jsonify(
                    error="Error processing request. Please upload a valid manifest file '{filename}'"
                    .format(filename=filename))

            # Limitation: Currently, appstack can support only package.json
            # Record the response details for this manifest file
            manifest = {
                'filename': filename,
                'content': content,
                'ecosystem': manifest_descriptor.ecosystem
            }
            manifests.append(manifest)
            if 'package.json' in filename:
                # Read package contents
                packagejson = json.loads(content)
                # In memory file to be passed as an API parameter to /appstack
                appstack_file = {'packagejson': StringIO(content)}
                url = current_app.config["BAYESIAN_ANALYTICS_URL"]
                analytics_url = "{analytics_baseurl}/api/v1.0/recommendation".format(
                    analytics_baseurl=url)
                gremlin_url = current_app.config["GREMLIN_SERVER_URL_REST"]

                # call recommendation api asynchronously; reco_req stays None if the
                # request could not even be issued, so it is never read unbound below
                reco_req = None
                try:
                    reco_req = session.post(analytics_url,
                                            files=appstack_file,
                                            timeout=None)
                except Exception as exc:
                    current_app.logger.warning("Analytics query: {}".format(exc))

                # carry on with further processing
                # package names and versions come from the uploaded file, so they are
                # escaped before being embedded into the Gremlin script
                clauses = []
                for pkg, ver in packagejson['dependencies'].items():
                    clauses.append("has('pecosystem','NPM').has('pname'," +
                                   self._gremlin_quote(pkg) + ").has('version'," +
                                   self._gremlin_quote(ver) + ")")
                str_gremlin = "g.V().or(" + ",".join(clauses) + ").valueMap(true);"
                payload = {'gremlin': str_gremlin}
                # call graph endpoint to fetch attributes asynchronously
                graph_req = session.post(gremlin_url, data=json.dumps(payload))

                # wait for all requests to finish
                graph_resp = graph_req.result()
                stack_data = aggregate_stack_data(graph_resp.json(), filename,
                                                  "npm")  # Hardcoded to NPM
                # Get Recommendation API result (empty if the request was not issued)
                if reco_req is not None:
                    stack_data['recommendation'] = reco_req.result().json()
                else:
                    stack_data['recommendation'] = {}
                result.append(stack_data)

        # Store the Request in DB
        try:
            req = StackAnalysisRequest(id=request_id,
                                       submitTime=str(dt),
                                       requestJson={'manifest': manifests},
                                       origin=origin,
                                       result={'result': result})
            rdb.session.add(req)
            rdb.session.commit()
        except SQLAlchemyError:
            current_app.logger.exception(
                'Failed to create new analysis request')
            raise HTTPError(
                500,
                "Error inserting log for request {t}".format(t=request_id))

        response = {
            'status': 'success',
            'request_id': request_id,
            'result': result
        }
        return response
Example #21
0
class MercatorTask(BaseTask):
    """Run mercator on package sources (or a git repository) and normalize its output."""

    _analysis_name = 'metadata'
    _dependency_tree_lock = '_dependency_tree_lock'
    description = 'Collects `Release` specific information from Mercator'
    schema_ref = SchemaRef(_analysis_name, '3-1-1')
    _data_normalizer = DataNormalizer()

    def _parse_requires_txt(self, path):
        """
        Read runtime requirements from a `requires.txt` file

        Failures are logged and whatever has been read so far is returned.

        :param path: str, path to the requires.txt file
        :return: List[str], non-empty lines preceding the first [section]
        """
        requires = []
        try:
            with open(path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line.startswith('['):
                        # the first named ini-like [section] ends the runtime requirements
                        break
                    elif line:
                        requires.append(line)
        except Exception as e:
            self.log.warning('Failed to process "{p}": {e}'.format(p=path,
                                                                   e=str(e)))

        return requires

    def _merge_python_items(self, topdir, data):
        """
        Pick the single most relevant Python item from mercator output

        Chooses between metadata.json (wheel), PKG-INFO (sdist/egg, enriched with
        `requires_dist` from requires.txt or requirements.txt) and a bare
        requirements.txt, based on how deep the files are below `topdir`.

        :param topdir: str, directory mercator was run on
        :param data: dict, mercator output; its 'items' are inspected
        :return: dict, the chosen item (modified in place), or None if nothing usable was found
        """
        metadata_json = None
        pkg_info = None
        requirements_txt = None

        def get_depth(path):
            return path.rstrip('/').count('/')

        def is_deeper(item1, item2):
            """ Returns True if item1 is None or deeper in directory hierarchy than item2 """
            if item1 is None:
                return True
            return get_depth(item1['path']) > get_depth(item2['path'])

        # find outermost PKG_INFO/metadata.json/requirements.txt - there can be
        #  testing ones etc.
        for item in data.get('items') or []:
            if item['ecosystem'] == 'Python-Dist' and item['path'].endswith(
                    '.json'):
                if is_deeper(metadata_json, item):
                    metadata_json = item
            elif item['ecosystem'] == 'Python-Dist':  # PKG-INFO
                # we prefer PKG_INFO files from .egg-info directories,
                #  since these have the very useful `requires.txt` next to them
                if pkg_info is None:
                    pkg_info = item
                else:
                    pkg_info_in_egg = pkg_info['path'].endswith(
                        '.egg-info/PKG-INFO')
                    item_in_egg = item['path'].endswith('.egg-info/PKG-INFO')
                    # rather than one insane condition, we use several less complex ones
                    if pkg_info_in_egg and item_in_egg and is_deeper(
                            pkg_info, item):
                        # if both are in .egg-info, but current pkg_info is deeper
                        pkg_info = item
                    elif item_in_egg and not pkg_info_in_egg:
                        # if item is in .egg-info and current pkg_info is not
                        pkg_info = item
                    elif not (item_in_egg or pkg_info_in_egg) and is_deeper(
                            pkg_info, item):
                        # if none of them are in .egg-info, but current pkg_info is deeper
                        pkg_info = item
            elif item['ecosystem'] == 'Python-RequirementsTXT' and is_deeper(
                    requirements_txt, item):
                # keep the outermost requirements.txt; the depth has to be compared
                # with the current requirements.txt candidate, not with PKG-INFO
                requirements_txt = item

        if pkg_info:
            self.log.info('Found PKG-INFO at {p}'.format(p=pkg_info['path']))
        if metadata_json:
            self.log.info(
                'Found metadata.json at {p}'.format(p=metadata_json['path']))
        if requirements_txt:
            self.log.info('Found requirements.txt at {p}'.format(
                p=requirements_txt['path']))

        ret = None
        # figure out if this was packaged as wheel => metadata.json would
        #  have depth of topdir + 2
        if metadata_json and get_depth(
                metadata_json['path']) == get_depth(topdir) + 2:
            self.log.info('Seems like this is wheel, using metadata.json ...')
            ret = metadata_json
        # figure out if this was packaged as sdist => PKG_INFO would
        #  have depth of topdir + 2 or topdir + 3
        #  (and perhaps there are requires.txt or requirements.txt that we could use)
        # NOTE: for now, we always treat requirements.txt as requires_dist
        elif pkg_info and get_depth(pkg_info['path']) <= get_depth(topdir) + 3:
            self.log.info(
                'Seems like this is sdist or egg, using PKG-INFO ...')
            requires_dist = []
            # in well-made sdists, there are requires.txt next to PKG_INFO
            #  (this is something different than requirements.txt)
            #  TODO: maybe mercator could do this in future
            requires = os.path.join(os.path.dirname(pkg_info['path']),
                                    'requires.txt')
            if os.path.exists(requires):
                self.log.info(
                    'Found a "requires.txt" file next to PKG-INFO, going to use it ...'
                )
                requires_dist = self._parse_requires_txt(requires)
            elif requirements_txt:
                self.log.info(
                    'No "requires.txt" file found next to PKG-INFO, but requirements.txt'
                    ' found, going to use it')
                # if requires.txt can't be found, try requirements.txt
                requires_dist = requirements_txt['result']['dependencies']
            else:
                self.log.info(
                    'Found no usable source of requirements for PKG-INFO :(')
            pkg_info['result']['requires_dist'] = requires_dist
            ret = pkg_info
        elif requirements_txt:
            self.log.info('Only requirements.txt found, going to use it ...')
            requirements_txt['result']['requires_dist'] = \
                requirements_txt['result'].pop('dependencies')
            ret = requirements_txt

        return ret

    def execute(self, arguments):
        """
        Execute mercator and convert its output to JSON object

        :param arguments: dict, task arguments; 'ecosystem' is required, plus either
                          'url' (git repository) or 'name' and 'version'
        :return: dict with 'status', 'summary' and 'details'; None if mercator
                 failed on a git repository
        """
        self._strict_assert(arguments.get('ecosystem'))

        if 'url' in arguments:
            # run mercator on a git repo
            return self.run_mercator_on_git_repo(arguments)

        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        # TODO: make this even uglier; looks like we didn't get the abstraction quite right
        #       when we were adding support for Java/Maven.
        if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by(
                EcosystemBackend.maven):
            # cache_path now points directly to the pom
            cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml()
        else:
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()
        return self.run_mercator(arguments, cache_path)

    def run_mercator_on_git_repo(self, arguments):
        """
        Clone the repository given by 'url' and run mercator on the checkout

        The temporary checkout is always removed afterwards.

        :param arguments: dict, task arguments; 'url' is required
        :return: dict, mercator result with every detail path rewritten to
                 "<HEAD sha>/path/to/manifest", or None if mercator did not succeed
        """
        self._strict_assert(arguments.get('url'))

        workdir = None
        try:
            workdir = tempfile.mkdtemp()
            repo_url = arguments.get('url')
            repo = Git.clone(repo_url, path=workdir, depth=str(1))
            metadata = self.run_mercator(arguments,
                                         workdir,
                                         keep_path=True,
                                         outermost_only=False,
                                         timeout=900)
            if metadata.get('status', None) != 'success':
                self.log.error('Mercator failed on %s', repo_url)
                return None

            # add some auxiliary information so we can later find the manifest file
            head = repo.rev_parse(['HEAD'])[0]
            for detail in metadata['details']:
                relative_path = detail['path'][len(workdir):]
                # path should look like this:
                # <git-sha1>/path/to/manifest.file
                detail['path'] = head + relative_path

            return metadata
        finally:
            if workdir:
                shutil.rmtree(workdir)

    def run_mercator(self,
                     arguments,
                     cache_path,
                     keep_path=False,
                     outermost_only=True,
                     timeout=300):
        """
        Run the mercator command and normalize the items it reports

        :param arguments: dict, task arguments; 'ecosystem' is required and
                          'cache_sources_path', if present, overrides `cache_path`
        :param cache_path: str, path mercator is run on
        :param keep_path: bool, passed through to the data normalizer
        :param outermost_only: bool, keep only the manifests closest to the root
                               (not applied to PyPI-backed ecosystems)
        :param timeout: int, timeout for the mercator command
        :return: dict with 'status', 'summary' and 'details'
        """
        result_data = {'status': 'unknown', 'summary': [], 'details': []}

        mercator_target = arguments.get('cache_sources_path', cache_path)
        tc = TimedCommand(['mercator', mercator_target])
        status, data, err = tc.run(
            timeout=timeout,
            is_json=True,
            update_env={'MERCATOR_JAVA_RESOLVE_POMS': 'true'})
        if status != 0:
            self.log.error(err)
            result_data['status'] = 'error'
            return result_data
        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            merged = self._merge_python_items(mercator_target, data)
            if merged is None:
                # nothing usable was found; do not hand None over to the normalizer
                self.log.warning('No usable Python metadata found in "{p}"'.format(
                    p=mercator_target))
                items = []
            else:
                items = [merged]
        else:
            found_items = data.get('items') or []
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(found_items)
            else:
                items = found_items
            self.log.debug('mercator found %i projects, outermost %i',
                           len(found_items), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM, we consider POM to be *the*
                #  source of information and don't want to duplicate info by including
                #  data from pom included in artifact (assuming it's included)
                items = [
                    item for item in items
                    if item['ecosystem'].lower() == 'java-pom'
                ]
        result_data['details'] = [
            self._data_normalizer.handle_data(item, keep_path=keep_path)
            for item in items
        ]

        result_data['status'] = 'success'
        return result_data
Example #22
0
class LicenseCheckTask(BaseTask):
    """Detect licenses in the sources of a package by running the scancode tool."""
    _analysis_name = 'source_licenses'
    description = "Check licences of all files of a package"
    schema_ref = SchemaRef(_analysis_name, '3-0-0')

    @staticmethod
    def process_output(data):
        """Regroup scancode output from per-file records into per-license records.

        'files' is a list of file paths along with info about detected licenses.
        If the same license text is in most files, almost the same license info
        accompanies each file path. It is therefore turned into a dict keyed by
        the license short name, each value carrying the info of the first match
        seen plus the list of paths where the license has been detected.

        :param data: dict, parsed scancode output; modified in place
        :return: dict, the same object without 'files' and 'scancode_options'
                 and with a new 'licenses' key
        """
        # per-match fields we are not interested in
        keys_to_remove = ('start_line', 'end_line', 'matched_rule', 'score', 'key')

        licenses = {}
        for file_info in data.pop('files'):
            path = file_info['path']
            for _license in file_info['licenses']:
                # short_name becomes key
                short_name = _license.pop('short_name')
                if short_name in licenses:
                    licenses[short_name]['paths'].add(path)
                    continue
                for key in keys_to_remove:
                    # tolerate matches that lack some of the fields
                    _license.pop(key, None)
                _license['paths'] = {path}
                licenses[short_name] = _license

        for license_info in licenses.values():
            license_info['paths'] = list(license_info['paths'])  # set -> list
        data['licenses'] = licenses

        data.pop('scancode_options', None)
        return data

    def execute(self, arguments):
        """Run scancode on the package sources and report the licenses found.

        :param arguments: dict, must contain 'ecosystem', 'name' and 'version'
        :return: dict with 'status' ('success' or 'error'), 'summary' and 'details'
        :raises: whatever fetching the sources raised, for non-maven packages
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            eco = arguments.get('ecosystem')
            pkg = arguments.get('name')
            ver = arguments.get('version')
            if eco != 'maven':
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            # for maven fall back to the extracted artifact instead of failing
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        command = [
            os.path.join(
                os.getenv('SCANCODE_PATH', '/opt/scancode-toolkit/'),
                'scancode'),
            # Scan for licenses
            '--license',
            # Do not return license matches with scores lower than this score
            '--license-score',
            SCANCODE_LICENSE_SCORE,
            # Files without findings are omitted
            '--only-findings',
            # Use n parallel processes
            '--processes',
            SCANCODE_PROCESSES,
            # Do not print summary or progress messages
            '--quiet',
            # Strip the root directory segment of all paths
            '--strip-root',
            # Stop scanning a file if scanning takes longer than a timeout in seconds
            '--timeout',
            SCANCODE_TIMEOUT,
            cache_path
        ]
        try:
            output = TimedCommand.get_command_output(command,
                                                     graceful=False,
                                                     is_json=True,
                                                     timeout=600)
            details = self.process_output(output)
            result_data['details'] = details
            result_data['status'] = 'success'
            result_data['summary'] = {
                'sure_licenses': list(details['licenses'].keys())
            }
        except Exception:
            # a failed scan is reported through the status, not raised;
            # SystemExit/KeyboardInterrupt are deliberately not trapped here
            self.log.exception("License scan failed")
            result_data['status'] = 'error'

        return result_data
Example #23
0
class CVEcheckerTask(BaseTask):
    """Report known security issues of a package.

    npm packages are matched against the Snyk vulndb stored on S3; maven and
    pypi packages are scanned with OWASP Dependency-Check.
    """
    name = 'cucoslib.workers.CVEchecker'
    _analysis_name = 'security_issues'
    description = "Security issues scanner. Uses Snyk vulndb for npm and OWASP Dep.Check for maven"
    schema_ref = SchemaRef(_analysis_name, '3-0-0')

    @staticmethod
    def _filter_vulndb_fields(entry):
        """Pick the fields of interest from one Snyk vulndb entry.

        :param entry: dict, a single vulndb record
        :return: dict with 'cvss', 'description', 'severity', 'id' and 'references'
        """
        result = {
            'cvss': {
                'score': 0,
                'vector': ""
            }
        }
        for field in ['description', 'severity']:
            result[field] = entry.get(field)
        identifiers = entry.get('identifiers', {})
        # prefer the CVE identifier, fall back to CWE
        ids = identifiers.get('CVE') or identifiers.get('CWE')
        result['id'] = ids[0] if ids else ''
        # prefer CVSSv2, because CVSSv3 seems to contain only vector string, not score itself
        if entry.get('CVSSv2'):
            # "CVSSv2": "7.5 (HIGH) (AV:N/AC:L/Au:N/C:P/I:P/A:P)"
            try:
                score, severity, vector = entry.get('CVSSv2').split(' ')
                score = float(score)
                vector = vector.strip('()')
            except ValueError:
                # unexpected format, keep the zero score / empty vector defaults
                pass
            else:
                result['cvss']['score'] = score
                result['cvss']['vector'] = vector
        elif entry.get('CVSSv3'):
            # "CVSSv3": "CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H" <- there's no score ??
            result['cvss']['score'] = 0  # ?
            result['cvss']['vector'] = entry.get('CVSSv3')
        # Snyk vulndb doesn't contain references
        result['references'] = []
        return result

    @staticmethod
    def _as_list(value):
        """Return `value` as a list: None -> [], a list as is, anything else wrapped."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    @staticmethod
    def _initial(value):
        """Return the first character of `value`, or '?' when it is empty or missing."""
        return value[0] if value else '?'

    def _npm_scan(self, arguments):
        """
        Query Snyk vulndb stored on S3

        :param arguments: dict, task arguments with 'name' and 'version'
        :return: dict with 'summary', 'status' and 'details'
        """
        s3 = StoragePool.get_connected_storage('S3Snyk')

        try:
            self.log.debug('Retrieving Snyk vulndb from S3')
            vulndb = s3.retrieve_vulndb()
        except Exception:
            # keep the traceback in the log; the failure is reported via status
            self.log.exception('Failed to obtain Snyk vulndb database')
            return {'summary': ['Failed to obtain Snyk vulndb database'],
                    'status': 'error',
                    'details': []}

        entries = []
        solver = get_ecosystem_solver(self.storage.get_ecosystem('npm'))
        for entry in vulndb.get('npm', {}).get(arguments['name'], []):
            vulnerable_versions = entry['semver']['vulnerable']
            affected_versions = solver.solve(["{} {}".format(arguments['name'],
                                                             vulnerable_versions)],
                                             all_versions=True)
            if arguments['version'] in affected_versions.get(arguments['name'], []):
                entries.append(self._filter_vulndb_fields(entry))

        return {'summary': [e['id'] for e in entries if e],
                'status': 'success',
                'details': entries}

    def _run_owasp_dep_check(self, scan_path, experimental=False):
        """Run OWASP Dependency-Check on `scan_path` and parse its XML report.

        :param scan_path: str, file to be scanned
        :param experimental: bool, pass --enableExperimental to the tool
        :return: dict with 'summary', 'status' and 'details'
        """
        def _clean_dep_check_tmp():
            for dcdir in glob.glob(os.path.join(gettempdir(), 'dctemp*')):
                rmtree(dcdir)

        s3 = StoragePool.get_connected_storage('S3OWASPDepCheck')
        depcheck = os.path.join(os.environ['OWASP_DEP_CHECK_PATH'], 'bin', 'dependency-check.sh')
        with tempdir() as temp_data_dir:
            retrieved = s3.retrieve_depcheck_db_if_exists(temp_data_dir)
            if not retrieved:
                self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
                command = [depcheck, '--updateonly', '--data', temp_data_dir]
                # give DependencyCheck 30 minutes to download the DB
                TimedCommand.get_command_output(command, graceful=False, timeout=1800)
            report_path = os.path.join(temp_data_dir, 'report.xml')
            command = [depcheck,
                       '--noupdate',
                       '--format', 'XML',
                       '--project', 'test',
                       '--data', temp_data_dir,
                       '--scan', scan_path,
                       '--out', report_path]
            if experimental:
                command.extend(['--enableExperimental'])
            output = []
            try:
                self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities',
                               scan_path)
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         timeout=600)  # 10 minutes
                with open(report_path) as r:
                    report_dict = anymarkup.parse(r.read())
            except (TaskError, FileNotFoundError) as e:
                _clean_dep_check_tmp()
                for line in output:
                    self.log.warning(line)
                self.log.exception(str(e))
                return {'summary': ['OWASP Dependency-Check scan failed'],
                        'status': 'error',
                        'details': []}
            # If the CVEDBSyncTask has never been run before, we just had to create the DB ourselves
            # Make the life easier for other workers and store it to S3
            s3.store_depcheck_db_if_not_exists(temp_data_dir)
            _clean_dep_check_tmp()

        results = []
        # `or {}` guards: a node that is present but empty may come back as None
        analysis = report_dict.get('analysis') or {}
        dependencies = self._as_list(
            (analysis.get('dependencies') or {}).get('dependency'))
        for dependency in dependencies:
            vulnerabilities = self._as_list(
                (dependency.get('vulnerabilities') or {}).get('vulnerability'))
            for vulnerability in vulnerabilities:
                # NOTE(review): 'cvssAuthenticationr' looks like a typo but appears to be
                # the element name used by the report itself - confirm before renaming
                vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{I}/A:{A}".format(
                    AV=self._initial(vulnerability.get('cvssAccessVector')),
                    AC=self._initial(vulnerability.get('cvssAccessComplexity')),
                    Au=self._initial(vulnerability.get('cvssAuthenticationr')),
                    C=self._initial(vulnerability.get('cvssConfidentialImpact')),
                    I=self._initial(vulnerability.get('cvssIntegrityImpact')),
                    A=self._initial(vulnerability.get('cvssAvailabilityImpact')))
                result = {
                    'cvss': {
                        'score': vulnerability.get('cvssScore'),
                        'vector': vector
                    }
                }
                references = self._as_list(
                    (vulnerability.get('references') or {}).get('reference'))
                result['references'] = [ref.get('url') for ref in references]
                for field in ['severity', 'description']:
                    result[field] = vulnerability.get(field)
                result['id'] = vulnerability.get('name')
                results.append(result)

        return {'summary': [r['id'] for r in results],
                'status': 'success',
                'details': results}

    def _maven_scan(self, arguments):
        """
        Run OWASP dependency-check
        """
        jar_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        return self._run_owasp_dep_check(jar_path, experimental=False)

    def _python_scan(self, arguments):
        """
        Run OWASP dependency-check experimental analyzer for Python artifacts

        https://jeremylong.github.io/DependencyCheck/analyzers/python-analyzer.html
        """
        tarball = ObjectCache.get_from_dict(arguments).get_source_tarball()
        if tarball.endswith(('zip', '.whl')):  # tar.gz seems to be not supported
            scan_path = tarball
        else:
            extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
            # depcheck needs to be pointed to a specific file, we can't just scan whole directory
            egg_info, pkg_info, metadata = None, None, None
            for root, dirs, files in os.walk(extracted_tarball):
                if root.endswith('.egg-info'):
                    egg_info = root
                if 'PKG-INFO' in files:
                    pkg_info = os.path.join(root, 'PKG-INFO')
                if 'METADATA' in files:
                    metadata = os.path.join(root, 'METADATA')

            # the last match of each kind wins; prefer egg-info, then PKG-INFO, then METADATA
            scan_path = egg_info or pkg_info or metadata

        if not scan_path:
            return {'summary': ['File types not supported by OWASP dependency-check'],
                    'status': 'error',
                    'details': []}

        return self._run_owasp_dep_check(scan_path, experimental=True)

    def execute(self, arguments):
        """Dispatch to the scanner matching the package ecosystem.

        :param arguments: dict, must contain 'ecosystem', 'name' and 'version'
        :return: dict with 'summary', 'status' and 'details'
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        if arguments['ecosystem'] == 'maven':
            return self._maven_scan(arguments)
        elif arguments['ecosystem'] == 'npm':
            return self._npm_scan(arguments)
        elif arguments['ecosystem'] == 'pypi':
            return self._python_scan(arguments)
        else:
            return {'summary': ['Unsupported ecosystem'],
                    'status': 'error',
                    'details': []}
class BlackDuckHub(object):
    """
    Hub provides access around Black Duck Hub APIs
    """

    # The authentication token is returned in a cookie with this name
    COOKIE_NAME = 'JSESSIONID'

    # NOTE(review): every request below is sent with verify=False, i.e. the
    # server TLS certificate is not validated. Confirm this is intentional.

    def __init__(self, url):
        """
        :param url: str, base URL of the Hub; API paths are appended to it as is
        """
        self._url = url
        self._session = None

    @property
    def url(self):
        """ URL of the Hub with trailing slash, example `https://hub.blackducksoftware.com/` """
        return self._url

    def _api(self, param):
        """
        Format a new API call by appending `param` to the Hub URL

        :param param: str, parameters to append to base url
        :return: str, formatted API call
        """
        return "{}{}".format(self.url, param)

    def _api_get(self, param):
        """
        Perform a get request against the API using local `_session`

        :param param: str, request path relative to the Hub URL
        :return: the response object returned by `get`
        """
        return get(self._api(param),
                   cookies={self.COOKIE_NAME: self._session.api_token.token},
                   verify=False)

    def connect_session(self, username, password):
        """
        Establishes a new session with the HUB using the provided credentials

        :param username: str
        :param password: str
        :return: BlackDuckSession, a session object
        :raises: BlackDuckSessionException
        """
        req = post(self._api("j_spring_security_check"),
                   data={
                       'j_username': username,
                       'j_password': password
                   },
                   verify=False)

        # anything but 204 is treated as an authentication failure
        if req.status_code != 204:
            raise BlackDuckSessionException("Black Duck authentication error")

        token = req.cookies.get(self.COOKIE_NAME)
        self._session = BlackDuckSession(BlackDuckApiToken(token))

        return self._session

    @needs_session
    def find_project(self, name):
        """
        Find a Project by Name

        :param name: str, name of the project (unencoded)
        :return: BlackDuckProject, found project or `None`
        :raises: BlackDuckSessionException
        """
        # local import keeps this block self-contained
        from urllib.parse import quote

        # encode the name so characters such as '&', '#' or '+' stay part of the value
        preq = self._api_get('api/v1/projects?name=' + quote(name, safe=''))
        if preq.status_code == 200:
            pdata = preq.json()
            return BlackDuckProject(pdata)
        else:
            return None

    @needs_session
    @schema.result(SchemaRef("blackduck-project-list", "1-0-0"))
    def _list_projects_json(self):
        """
        Fetch the raw project list

        :return: the decoded JSON body of the project list response
        :raises: BlackDuckException
        """
        req = self._api_get('api/projects/')
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to list projects')

    def list_projects(self):
        """
        Lists all projects valid for the current session

        Every project is looked up again by name, so one request per project is made.

        :return: List[BlackDuckProject], list of projects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        names = [project['name'] for project in self._list_projects_json()]
        return [self.find_project(name) for name in names]

    @needs_session
    def get_releases(self, project_id):
        """
        Get all releases of the given project

        :param project_id: BlackDuckProject or str, project reference or ID
        :return: Dict[str, BlackDuckRelease], a map of version strings to release objects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        if isinstance(project_id, BlackDuckProject):
            project_id = project_id.id

        req = self._api_get(
            'api/v1/projects/{id}/version-summaries'.format(id=project_id))
        if req.status_code == 200:
            data = req.json()
            return {
                obj['version']: BlackDuckRelease(obj, project_id)
                for obj in data['items']
            }
        else:
            # format() instead of '+' so a non-str ID cannot turn this into a TypeError
            raise BlackDuckException(
                'Unable to fetch releases for {}'.format(project_id))

    @needs_session
    @schema.result(SchemaRef("blackduck-vulnerable-bom", "1-0-0"))
    def get_release_bom_json(self, release_id):
        """
        Get the Bill of Materials for specific release

        :param release_id: BlackDuckRelease, release reference; both its `id` and
                           `project` are used, so a bare ID string fails with
                           AttributeError
        :return: dict, the BOM JSON as a dictionary
        :raises: BlackDuckException, BlackDuckSessionException
        """
        release = release_id

        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id

        req = self._api_get(
            'api/projects/{p}/versions/{i}/vulnerable-bom-components'.format(
                i=release_id, p=release.project))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException(
                'Unable to fetch release information {} {}'.format(
                    release_id, release.project))

    @needs_session
    def get_release_code_locations(self, release_id):
        """
        Get code locations for given release

        :param release_id: BlackDuckRelease, release reference; both its `id` and
                           `project` are used, so a bare ID string fails with
                           AttributeError
        :return: dict, response json containing the retrieved code locations list
        :raises: BlackDuckException, BlackDuckSessionException
        """
        release = release_id

        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id

        req = self._api_get(
            'api/projects/{p}/versions/{i}/codelocations'.format(
                i=release_id, p=release.project))

        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException(
                'Unable to fetch code locations for {relid} {relproj}'.format(
                    relid=release_id, relproj=release.project))

    @needs_session
    def get_code_location_scan_summary(self, location_id):
        """
        Get scan summary for given code location ID

        :param location_id: str
        :return: dict, decoded JSON body of the scan summaries response
        :raises: BlackDuckException, BlackDuckSessionException
        """
        req = self._api_get('api/codelocations/{locid}/scan-summaries'.format(
            locid=location_id))

        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException(
                'Unable to fetch scan summary for code location {locid}'.
                format(locid=location_id))
class GithubTask(BaseTask):
    """Gather repository statistics (issues, PRs, commits, topics) from GitHub."""
    description = 'Collects statistics using Github API'
    _analysis_name = "github_details"
    schema_ref = SchemaRef(_analysis_name, '1-0-4')
    # used for testing
    _repo_name = None
    _repo_url = None

    @classmethod
    def create_test_instance(cls, repo_name, repo_url):
        """Build an instance with the repository preset instead of looked up."""
        task = super().create_test_instance()
        # set for testing as we are not querying DB for mercator results
        task._repo_name = repo_name
        task._repo_url = repo_url
        return task

    @staticmethod
    def _get_last_years_commits(repo):
        """Return the `total` of every commit-activity record, [] if there are none."""
        weekly_stats = repo.get_stats_commit_activity()
        return [week.total for week in weekly_stats] if weekly_stats else []

    @staticmethod
    def _rate_limit_exceeded(gh):
        """Tell whether the first item of `gh.rate_limiting` dropped to zero."""
        remaining = gh.rate_limiting[0]
        return remaining == 0

    def _issues_or_prs_count(self, gh, query):
        """Run a GitHub issue search and return its `totalCount` (-1 if absent)."""
        # Check the rate-limit for Github API first. Apply retry if needed
        if self._rate_limit_exceeded(gh):
            reset_at = gh.rate_limiting_resettime
            current = int(datetime.datetime.now().timestamp())
            retrytime = reset_at - current + 10
            self.log.info("Github rate-limit exceeded, retrying in %d seconds",
                          retrytime)
            self.retry(countdown=retrytime)
        found = gh.search_issues(query=query)
        return getattr(found, 'totalCount', -1)

    @staticmethod
    def _get_repo_stats(repo):
        """Collect the contributors count and every REPO_PROPS value (-1 if missing)."""
        # len(list()) is workaround for totalCount being None
        # https://github.com/PyGithub/PyGithub/issues/415
        stats = {'contributors_count': len(list(repo.get_contributors()))}
        stats.update((prop, repo.raw_data.get(prop, -1)) for prop in REPO_PROPS)
        return stats

    def _query_repo_name(self):
        """Retrieve GitHub repo from a preceding Mercator scan"""
        # Fridolin: most of the checks can be removed since Dispatcher schedules this task iff we have github.com
        metadata = self.parent_task_result('metadata')
        if metadata is None:
            self.log.error(
                "No repo_name provided, and no Mercator scan result")
            return None

        # first non-empty "code_repository" entry wins
        repositories = list(filter(
            None,
            (item.get("code_repository") for item in metadata.get('details', []))))
        if not repositories or repositories[0] is None:
            self.log.debug("No repo_name provided, and no repo metadata found")
            return None

        url = repositories[0].get("url")
        if url is None:
            self.log.debug('No repo name extracted, nothing to do')
            return None

        parsed = parse_gh_repo(url)
        if parsed:
            self._repo_url = 'https://github.com/' + parsed
        else:
            self.log.debug('Could not parse Github repo URL %s', url)
        return parsed

    def _get_topics(self):
        """Scrape topic tags from the repository web page, [] without a repo URL."""
        if not self._repo_url:
            return []

        response = requests.get('{url}'.format(url=self._repo_url))
        page = bs4.BeautifulSoup(response.text, 'html.parser')
        return [tag.text.strip()
                for tag in page.find_all("a", class_="topic-tag")]

    def execute(self, arguments):
        """Collect the statistics and return them under 'details'.

        Returns status 'unknown' when no repository can be determined and
        'error' when the repository cannot be fetched or when there is neither
        a token nor any remaining unauthenticated quota.
        """
        result_data = {'status': 'unknown', 'summary': [], 'details': {}}
        # For testing purposes, a repo may be specified at task creation time
        if self._repo_name is None:
            # Otherwise, get the repo name from earlier Mercator scan results
            self._repo_name = self._query_repo_name()
            if self._repo_name is None:
                # Not a GitHub hosted project
                return result_data

        token = self.configuration.github_token
        if token:
            # there might be more comma-separated tokens, randomly select one
            token = random.choice(token.split(',')).strip()
        elif self._rate_limit_exceeded(github.Github()):
            self.log.error(
                "No Github API token provided (GITHUB_TOKEN env variable), "
                "and rate limit exceeded! "
                "Ending now to not wait endlessly")
            result_data['status'] = 'error'
            return result_data
        else:
            self.log.warning(
                "No Github API token provided (GITHUB_TOKEN env variable), "
                "requests will be unauthenticated, "
                "i.e. limited to 60 per hour")

        gh = github.Github(login_or_token=token)
        try:
            repo = gh.get_repo(full_name_or_id=self._repo_name, lazy=False)
        except github.GithubException as e:
            self.log.exception(str(e))
            result_data['status'] = 'error'
            return result_data

        result_data['status'] = 'success'

        # Get Count of Issues and PRs for last year and last month
        stamp = '%Y-%m-%dT%H:%M:%SZ'
        moment = datetime.datetime.utcnow()
        month_ago = (moment - datetime.timedelta(days=MONTH_BACK)).strftime(stamp)
        year_ago = (moment - datetime.timedelta(days=YEAR_BACK)).strftime(stamp)
        until = moment.strftime(stamp)

        # searches run in this order: closed before created, issues before PRs,
        # year before month
        counts = {}
        for qualifier in ('closed', 'created'):
            for kind in ('issue', 'pr'):
                for span, since in (('year', year_ago), ('month', month_ago)):
                    counts[qualifier, kind, span] = self._issues_or_prs_count(
                        gh,
                        query='repo:' + repo.full_name + ' ' + qualifier + ':' +
                        since + '..' + until + ' type:' + kind)

        def activity(kind):
            return {
                span: {
                    'opened': counts['created', kind, span],
                    'closed': counts['closed', kind, span]
                }
                for span in ('year', 'month')
            }

        details = {
            'updated_issues': activity('issue'),
            'updated_pull_requests': activity('pr')
        }

        # Get Repo Statistics
        repo_stats = self._get_repo_stats(repo)
        if repo_stats:
            details.update(repo_stats)
        details['topics'] = self._get_topics()

        # Get Commit Statistics
        weekly_commits = self._get_last_years_commits(repo)
        details['last_year_commits'] = {
            'sum': sum(weekly_commits),
            'weekly': weekly_commits
        }
        result_data['details'] = details
        return result_data
 def test_next_revision(self):
     """next_revision() of "1-0-0" must equal the "1-1-0" reference."""
     current = SchemaRef("example", "1-0-0")
     bumped = current.next_revision()
     assert bumped == SchemaRef("example", "1-1-0")
 def test_next_model(self):
     """next_model() of "1-0-0" must equal the "2-0-0" reference."""
     current = SchemaRef("example", "1-0-0")
     bumped = current.next_model()
     assert bumped == SchemaRef("example", "2-0-0")
class CodeMetricsTask(BaseTask):
    """Task that computes code metrics for the sources of an analyzed package.

    Generic per-language line statistics are gathered with the cloc tool and
    then enriched with the output of language-specific analyzers registered
    in `_LANGUAGE_ANALYZER_HANDLERS`.
    """
    # Analysis name; also used as the name of the result schema below
    _analysis_name = 'code_metrics'
    description = 'Compute various code metrics for a project'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    # Timeout passed to every analyzer command run via TimedCommand
    # (presumably seconds - TODO confirm against TimedCommand.run)
    _CLI_TIMEOUT = 300

    def _run_analyzer(self, command, json_output=True):
        """Execute an analyzer command and optionally decode its output as JSON.

        The command is run with the class-wide `_CLI_TIMEOUT`. A non-zero exit status is only logged as a warning,
        it is up to the caller to decide how to react to it.

        :param command: command to be run (command with argument vector as array)
        :param json_output: True if the collected output should be parsed as JSON
        :return: status, output, error triplet; output is the parsed JSON (an empty dict when nothing was printed)
                 if json_output is True, a list of output lines otherwise
        """
        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._CLI_TIMEOUT, cmd=command))
        status, output, error = TimedCommand(command).run(timeout=self._CLI_TIMEOUT)
        self.log.debug("status: %d, output: %s, error: %s", status, output,
                       error)

        if status != 0:
            self.log.warning(
                "Executing command failed, return value: %d, stderr: '%s' ",
                status, error)

        # Some tools such as complexity-report write zero bytes to output (they are propagated from sources like
        # for npm/glob/7.0.3). This caused failures when pushing results to Postgres as Postgres cannot store
        # null bytes in results. Let's be safe here.
        sanitized = [line.replace('\\u0000', '\\\\0') for line in output]

        if not json_output:
            return status, sanitized, error

        parsed = json.loads("".join(sanitized)) if sanitized else {}
        return status, parsed, error

    def _get_generic_result(self, source_path):
        """Run cloc on the sources and split its JSON output into a summary and per-language statistics.

        The per-language part is later enriched with the output of tools chosen by the languages cloc found.

        :param source_path: path to sources where the analyzed artefact resides
        :return: tuple of (summary header dict, dict of cloc entries keyed by language)
        :raises RuntimeError: if cloc exits with a non-zero status
        """
        status, output, error = self._run_analyzer(['cloc', '--json', source_path])

        if status != 0:
            # Let the whole task fail
            raise RuntimeError("Running cloc command failed: '%s'" % error)

        # cloc places generic summary here, we will maintain it in top level so remove misleading key
        cloc_header = output['header']
        header = {
            'total_files': cloc_header.pop('n_files'),
            'total_lines': cloc_header.pop('n_lines')
        }
        output.pop('header')

        if 'SUM' in output:
            totals = output['SUM']
            header['blank_lines'] = totals['blank']
            header['comment_lines'] = totals['comment']
            header['code_lines'] = totals['code']
            output.pop('SUM', None)

        # rename to be more precise with naming
        wanted_keys = (('blank', 'blank_lines'), ('code', 'code_lines'),
                       ('comment', 'comment_lines'), ('nFiles', 'files_count'))
        for language, stats in output.items():
            # filter only language-specific results, leave statistics untouched
            if not isinstance(stats, dict):
                continue
            output[language] = DataNormalizer.transform_keys(stats, wanted_keys)

        return header, output

    @staticmethod
    def _normalize_complexity_report_output(output, source_path):
        """Normalize complexity_report output

        For the meaning of the individual metrics see
        https://github.com/escomplex/escomplex/blob/master/README.md#metrics

        Project-level and module-level keys are renamed/filtered, module paths are made relative to `source_path`
        and the `cyclomaticDensity` key of each function entry is renamed to `cyclomatic_density`.

        :param output: output dict to be normalized
        :param source_path: path to sources that was used
        :return: normalized output
        """
        wanted_keys = (('maintainability', 'project_maintainability'),
                       ('changeCost', 'cost_change'),
                       ('cyclomatic', 'average_cyclomatic_complexity'),
                       ('effort', 'average_halstead_effort'),
                       ('firstOrderDensity', 'first_order_density'),
                       ('loc', 'average_function_lines_of_code'),
                       ('params', 'average_function_parameters_count'),
                       ('reports', 'modules'))
        output = DataNormalizer.transform_keys(output, wanted_keys)

        wanted_module_keys = (('maintainability', 'module_maintainability'),
                              ('dependencies', ),
                              ('loc', 'average_function_lines_of_code'),
                              ('path', ),
                              ('params', 'average_function_parameters_count'),
                              ('functions', ))

        # Length of the prefix to cut from module paths; +1 also drops the path separator following source_path
        source_path_len = len(source_path) + 1

        # `or []` guards against a missing as well as an explicit None 'modules' entry
        # (NOTE(review): transform_keys presumably fills absent keys with None - confirm)
        for idx, module in enumerate(output.get('modules') or []):
            normalized_module = DataNormalizer.transform_keys(
                module, wanted_module_keys)
            output['modules'][idx] = normalized_module

            if 'path' in module:
                normalized_module['path'] = module['path'][source_path_len:]

            # A module without any function has no (or an empty) 'functions' entry; iterating None would raise
            # TypeError, so fall back to an empty list.
            # The function dicts of the original module are renamed in place, as before
            # (presumably they are shared with the normalized module - verify against transform_keys).
            for function in module.get('functions') or []:
                if 'cyclomaticDensity' in function:
                    function['cyclomatic_density'] = function.pop(
                        'cyclomaticDensity')

        return output

    @staticmethod
    def _normalize_javancss_output(output):
        """Parse and normalize JavaNCSS ASCII output

        :param output: output dict to be normalized
        :return: normalized output
        """
        output = output.get('javancss', {})
        result = {'functions': {}, 'objects': {}, 'packages': {}}

        # The output of JavaNCSS is an XML, which is parsed using anymarkup. This can introduce some pitfalls here
        # if there is found exactly one item of a type. E.g.:
        #
        #  <functions>
        #    <function>...<function/>
        #  <functions>
        #
        # Is parsed as object 'functions' containing *one object* 'function', whereas:
        #
        #  <functions>
        #    <function>...<function/>
        #    <function>...<function/>
        #  <functions>
        #
        # Is parsed as object 'functions' containing a *list of objects* 'function'. Thus the isinstance(.., list)
        # checks.

        # Parse functions section
        if 'functions' in output:
            functions = output['functions']

            wanted_function_keys = (('ccn', 'cyclomatic_complexity'),
                                    ('javadocs', ), ('name', ))

            result['functions']['function'] = []
            if 'function' in functions:
                if not isinstance(functions['function'], list):
                    functions['function'] = [functions['function']]

                for function in functions['function']:
                    result['functions']['function'].append(
                        DataNormalizer.transform_keys(function,
                                                      wanted_function_keys))

            function_averages = functions.get('function_averages', {})

            result['functions'][
                'average_cyclomatic_complexity'] = function_averages.get('ccn')
            result['functions']['average_javadocs'] = function_averages.get(
                'javadocs')

        # Parse objects section
        if 'objects' in output:
            objects = output['objects']

            wanted_objects_keys = (('classes', ), ('functions', ), ('name', ),
                                   ('javadocs', ))

            result['objects']['object'] = []
            if 'object' in objects:
                if not isinstance(objects['object'], list):
                    objects['object'] = [objects['object']]

                for obj in objects['object']:
                    result['objects']['object'].append(
                        DataNormalizer.transform_keys(obj,
                                                      wanted_objects_keys))

            object_averages = objects.get('averages', {})

            result['objects']['average_classes'] = object_averages.get(
                'classes')
            result['objects']['average_functions'] = object_averages.get(
                'functions')
            result['objects']['average_javadocs'] = object_averages.get(
                'javadocs')

        # Parse packages section
        if 'packages' in output:
            packages = output['packages']

            packages_total = packages.get('total', {})

            result['packages']['classes'] = packages_total.get('classes')
            result['packages']['functions'] = packages_total.get('functions')
            result['packages']['javadoc_lines'] = packages_total.get(
                'javadoc_lines')
            result['packages']['javadocs'] = packages_total.get('javadocs')
            result['packages']['multi_comment_lines'] = packages_total.get(
                'multi_comment_lines')
            result['packages']['single_comment_lines'] = packages_total.get(
                'single_comment_lines')

        return result

    def _normalize_mccabe_output(self, output):
        result = []
        for line in output:
            # NOTE: due to the way print works in python 2 vs python 3, the mccabe under
            #  python 2 returns `(<coords> <name> <complexity>)`, while the python 3
            #  version returns the same without the brackets
            coords, func_name, complexity = line.split()
            result.append({
                'name': func_name.strip("'"),
                'complexity': int(complexity.strip(')'))
            })

        return result

    def complexity_report(self, source_path):
        """Analyze sources with the complexity_report tool https://www.npmjs.com/package/complexity-report

        :param source_path: path to source codes
        :return: normalized output; an empty dict if the tool exited with a non-zero status
        """
        status, output, error = self._run_analyzer(['cr', '--format=json', source_path])

        if status != 0:
            self.log.warning("Runing complexity report tool failed: %s", error)
            return {}

        if not output:
            # nothing was reported, there is nothing to normalize
            return output

        return self._normalize_complexity_report_output(output, source_path)

    def javancss(self, source_path):
        """Analyze sources with the JavaNCSS tool http://www.kclee.de/clemens/java/javancss

        The tool is looked up in the `bin` directory under the path given by the JAVANCSS_PATH environment variable.
        A non-zero exit status is only logged; whatever output was produced is still processed.

        :param source_path: path to source codes
        :return: normalized output (the empty raw output if the tool printed nothing)
        """
        executable = os.path.join(os.environ['JAVANCSS_PATH'], 'bin',
                                  'javancss')
        status, output, error = self._run_analyzer(
            [executable, '-all', '-xml', source_path], json_output=False)

        if status != 0:
            self.log.warning("JavaNCSS tool reported some errors: %s", error)

        if not output:
            return output

        # the tool was asked for XML, turn the collected lines into a dict before normalizing
        parsed = anymarkup.parse("".join(output))
        return self._normalize_javancss_output(parsed)

    def python_mccabe(self, source_path):
        """Run mccabe tool https://pypi.python.org/pypi/mccabe

        :param source_path: path to source codes
        :return: normalized output
        """
        result = {'files': []}
        # we'll compute total average cyclomatic complexity manually based as
        #  <total complexity>/<total number of functions>
        command = ['python3', '-m', 'mccabe']

        # mccabe has to be run on individual files, doesn't work recursively on directories
        for root, dirs, files in os.walk(source_path):
            for f in files:
                if f.endswith('.py'):
                    to_run = command + [os.path.join(root, f)]
                    status, output, error = self._run_analyzer(
                        to_run, json_output=False)
                    if status != 0:
                        self.log.info(
                            'Analyzing with Py3 failed, trying to analyze with Py2 ...'
                        )
                        to_run[0] = 'python2'
                        status, output, error = self._run_analyzer(
                            to_run, json_output=False)
                        if status != 0:
                            self.log.error(
                                'Failed to analyze with both Py2 and Py3')
                            continue
                    normalized = self._normalize_mccabe_output(output)

                    # compute file average cyclomatic complexity, add numbers
                    #  to overall package complexity
                    f_complexity = functools.reduce(
                        lambda x, y: x + y['complexity'], normalized, 0)
                    f_functions = len(normalized)
                    f_acc = round(f_complexity /
                                  f_functions, 1) if f_functions > 0 else 0
                    result['files'].append({
                        'name':
                        os.path.join(root, f)[len(source_path):].strip('/'),
                        'functions':
                        normalized,
                        'average_cyclomatic_complexity':
                        f_acc
                    })

        return result

    # A table of functions that should be called based on the languages found by cloc; the keys have to match
    # the keys in cloc output. The values are the plain functions defined above in this class body, execute() calls
    # each of them as handler(self, source_path) - i.e. with the path where the sources sit - and expects a dict
    # as the result (a falsy result is skipped). When you write new analyzer handlers, make sure that their result
    # keys do not collide with the existing ones, as all results for a language are aggregated under its "metrics" key.
    # See 'Recognized languages' section at http://cloc.sourceforge.net/
    _LANGUAGE_ANALYZER_HANDLERS = {
        "JavaScript": [
            complexity_report,
        ],
        "Ruby": [],
        "Java": [
            javancss,
        ],
        "Python": [
            python_mccabe,
        ],
        "Go": [],
        "Rust": []
    }

    def execute(self, arguments):
        """Compute code metrics for the sources of the package described by arguments

        Generic statistics are computed first, then every handler registered for a found language is run and
        its non-empty result is merged under the "metrics" key of that language.

        :param arguments: task arguments, 'ecosystem', 'name' and 'version' are required
        :return: dict with 'summary', 'status' and 'details' keys
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        source_path = ObjectCache.get_from_dict(arguments).get_sources()
        header, language_stats = self._get_generic_result(source_path)

        for language, stats in language_stats.items():
            for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
                metrics_data = handler(self, source_path)
                if metrics_data:
                    if 'metrics' not in stats:
                        stats['metrics'] = {}
                    stats['metrics'].update(metrics_data)

        # we don't want to have possibly unique keys and we want to avoid enumerating all languages that are
        # supported by cloc - convert a dict to a list of language-specific entries
        languages = []
        for language, record in language_stats.items():
            record['language'] = language
            languages.append(record)

        return {
            'summary': header,
            'status': 'success',
            'details': {'languages': languages}
        }
 def test_next_addition(self):
     """The next addition of schema version 1-0-0 is expected to be 1-0-1."""
     expected = SchemaRef("example", "1-0-1")
     assert SchemaRef("example", "1-0-0").next_addition() == expected