import datetime

# `now`, `later` and `even_later` are module-level timestamps in the original
# test module; the exact values below are assumed for illustration.
now = datetime.datetime.utcnow()
later = now + datetime.timedelta(minutes=10)
even_later = later + datetime.timedelta(minutes=10)


def analyses(app):
    e1 = Ecosystem(name='npm', backend=EcosystemBackend.npm)
    p1 = Package(ecosystem=e1, name='arrify')
    v1 = Version(package=p1, identifier='1.0.1')
    model1 = Analysis(version=v1, started_at=now, finished_at=later)
    app.rdb.session.add(model1)

    e2 = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    p2 = Package(ecosystem=e2, name='flexmock')
    v2 = Version(package=p2, identifier='0.10.1')
    model2 = Analysis(version=v2, started_at=later, access_count=1)
    app.rdb.session.add(model2)
    app.rdb.session.commit()

    worker_results2 = {'a': 'b', 'c': 'd', 'e': 'f', 'g': 'h', 'i': 'j',
                       'digests': {'details':
                                   [{'artifact': True,
                                     'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results2.items():
        app.rdb.session.add(WorkerResult(analysis_id=model2.id, worker=w, task_result=tr))

    model3 = Analysis(version=v2, started_at=later, access_count=1,
                      audit={'audit': {'audit': 'audit', 'e': 'f', 'g': 'h'}, 'a': 'b', 'c': 'd'})
    app.rdb.session.add(model3)
    app.rdb.session.commit()
    worker_results3 = {'digests': {'details':
                                   [{'artifact': True,
                                     'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results3.items():
        app.rdb.session.add(WorkerResult(analysis_id=model3.id, worker=w, task_result=tr))
    app.rdb.session.commit()
    return (model1, model2, model3)
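

# A minimal usage sketch (assumed wiring, not part of the original suite): with
# the data seeded above, a test can look up the most recent analysis for a
# package by ordering on started_at. The helper name and query are illustrative.
def _example_latest_analysis(app):
    return (app.rdb.session.query(Analysis)
            .join(Version).join(Package)
            .filter(Package.name == 'flexmock')
            .order_by(Analysis.started_at.desc())
            .first())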


def fill_analyses(app):
    ecosystems = [
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi, url='https://pypi.python.org/',
                  fetch_url='https://pypi.python.org/pypi'),
        Ecosystem(name='npm', backend=EcosystemBackend.npm, url='https://www.npmjs.com/',
                  fetch_url='https://registry.npmjs.org/'),
        Ecosystem(name='go', backend=EcosystemBackend.scm),
    ]

    packages = [
        Package(name='flexmock', ecosystem=ecosystems[0]),
        Package(name='requests', ecosystem=ecosystems[0]),
        Package(name='sequence', ecosystem=ecosystems[1]),
        Package(name='arrify', ecosystem=ecosystems[1]),
        Package(name='serve-static', ecosystem=ecosystems[1]),
    ]

    versions = [
        Version(identifier='0.10.1', package=packages[0]),
        Version(identifier='0.9.1', package=packages[0]),
        Version(identifier='2.0.0', package=packages[1]),
        Version(identifier='2.2.1', package=packages[2]),
        Version(identifier='1.0.1', package=packages[3]),
        Version(identifier='1.7.1', package=packages[4]),
    ]

    analyses = [
        Analysis(version=versions[0], started_at=now),                    # pypi/flexmock/0.10.1
        Analysis(version=versions[0], started_at=later, access_count=1),  # pypi/flexmock/0.10.1
        Analysis(version=versions[1], started_at=even_later),             # pypi/flexmock/0.9.1
        Analysis(version=versions[2], started_at=now),                    # pypi/requests/2.0.0
        Analysis(version=versions[3], started_at=later),                  # npm/sequence/2.2.1
        Analysis(version=versions[4], started_at=now, finished_at=later), # npm/arrify/1.0.1
        Analysis(version=versions[5], started_at=now, finished_at=later,
                 release='npm:serve-static:1.7.1'),                      # npm/serve-static/1.7.1
    ]
    # worker results that correspond to analyses above
    worker_results = [
        WorkerResult(worker='digests', analysis=analyses[1],
                     task_result={'details': [{'artifact': True,
                                               'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}),
        WorkerResult(worker='static_analysis', task_result={'details': []}, analysis=analyses[1]),
        WorkerResult(worker='source_licenses',
                     task_result={'schema': {'name': 'source_licenses', 'version': '1-0-0'}},
                     analysis=analyses[1]),
    ]
    package_gh_usage = [
        PackageGHUsage(name='arrify', count=100, ecosystem_backend='npm')
    ]
    for a in ecosystems + packages + versions + analyses + worker_results + package_gh_usage:
        app.rdb.session.add(a)
        app.rdb.session.commit()

    return (ecosystems, packages, versions, analyses, worker_results, package_gh_usage)


    def test_execute_with_mock_anitya(self, ecosystem, project, md5sum, dist_git):
        rdb()
        s = create_db_scoped_session()
        dummy_homepage = "http://project-homepage.com"

        dummy_response = Response()
        dummy_response.status_code = 200
        s.add(Ecosystem(name='npm', backend=EcosystemBackend.npm))
        s.commit()
        DownstreamMapCache()[md5sum] = dist_git  # fill in key-value mapping in cache

        task = AnityaTask.create_test_instance(task_name='anitya')
        args = {'ecosystem': ecosystem, 'name': project}
        flexmock(task).should_receive("_get_project_homepage").once().and_return(dummy_homepage)
        flexmock(task).should_receive("_get_artifact_hash").once().and_return(md5sum)
        flexmock(task).should_receive("_create_anitya_project").once().and_return(dummy_response)
        flexmock(task).should_receive("_add_downstream_mapping").once().and_return(dummy_response)

        results = task.execute(arguments=args)
        assert results is None
Example 4
def pypi(rdb):
    pypi = Ecosystem(name='pypi',
                     backend=EcosystemBackend.pypi,
                     fetch_url='https://pypi.python.org/pypi')
    rdb.add(pypi)
    rdb.commit()
    return pypi
Example 5
    def test_execute(self, tmpdir):
        artifact_digest, artifact_path = IndianaJones.fetch_artifact(
            Ecosystem(name='pypi', backend=EcosystemBackend.pypi),
            artifact=PYPI_MODULE_NAME,
            version=PYPI_MODULE_VERSION,
            target_dir=str(tmpdir))

        args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
        flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
        flexmock(EPVCache).should_receive('get_source_tarball').and_return(artifact_path)
        task = DigesterTask.create_test_instance(task_name='digests')
        results = task.execute(arguments=args)

        assert results is not None
        assert isinstance(results, dict)
        assert set(results.keys()) == {'details', 'status', 'summary'}
        artifact_details = None
        for details in results['details']:
            assert {'sha256', 'sha1', 'md5', 'ssdeep',
                    'path'}.issubset(set(details.keys()))
            if details.get('artifact'):
                artifact_details = details
        # there are artifact details
        assert artifact_details is not None
        # the artifact digest which IndianaJones returns is the same as the one from DigesterTask
        assert artifact_digest == artifact_details['sha256'] == compute_digest(
            artifact_path)
        assert artifact_details['path'] == 'six-1.0.0.tar.gz'
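

# For reference, a hashlib-based sketch of what a compute_digest-style helper
# is assumed to do in the assertion above (the real cucoslib helper may differ):
import hashlib

def _sha256_of(path, chunk_size=65536):
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()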
Example 6
def maven(rdb):
    maven = Ecosystem(name='maven',
                      backend=EcosystemBackend.maven,
                      fetch_url='')
    rdb.add(maven)
    rdb.commit()
    return maven
Example 7
def npm(rdb):
    npm = Ecosystem(name='npm',
                    backend=EcosystemBackend.npm,
                    fetch_url='https://registry.npmjs.org/')
    rdb.add(npm)
    rdb.commit()
    return npm


    def test_execute(self, tmpdir):
        npm = Ecosystem(name='npm', backend=EcosystemBackend.npm)
        flexmock(self.m.storage).should_receive('get_ecosystem').with_args(
            'npm').and_return(npm)
        name = 'wrappy'
        version = '1.0.2'
        required = {
            'homepage', 'version', 'declared_license', 'code_repository',
            'bug_reporting', 'description', 'name', 'author'
        }
        IndianaJones.fetch_artifact(npm,
                                    artifact=name,
                                    version=version,
                                    target_dir=str(tmpdir))

        args = {'ecosystem': npm.name, 'name': 'foo', 'version': 'bar'}
        flexmock(EPVCache).should_receive(
            'get_extracted_source_tarball').and_return(str(tmpdir))
        results = self.m.execute(arguments=args)
        assert results is not None
        assert isinstance(results, dict)

        details = results['details'][0]
        assert required.issubset(set(details.keys()))  # check at least the required keys are there
        assert all(details[key] for key in required)   # assert required values are not None
        assert details['name'] == name
Example 9
def rubygems(rdb):
    rubygems = Ecosystem(name='rubygems',
                         backend=EcosystemBackend.rubygems,
                         fetch_url='https://rubygems.org/api/v1')
    rdb.add(rubygems)
    rdb.commit()
    return rubygems
Example 10
def fill_packages_for_paging(app, request):
    e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    app.rdb.session.add(e)
    for p in range(0, 11):
        app.rdb.session.add(Package(ecosystem=e, name=str(p)))

    app.rdb.session.commit()
Example 11
    def get(self, ecosystem):
        query = request.args.get('q')
        eco = Ecosystem.by_name(rdb.session, ecosystem)
        fetcher = CucosReleasesFetcher(eco, rdb.session)
        now = datetime.datetime.now()

        # Instantiate two different solvers: one uses a custom fetcher to fetch
        # matching releases from the Bayesian DB, the other fetches from
        # upstream repositories.
        # Together, the two solvers tell us:
        #   1) which packages in the range we have already analysed and have
        #      information about
        #   2) which other packages from upstream repositories match the
        #      version specification
        cucos_solver = get_ecosystem_solver(eco, with_fetcher=fetcher)
        solver = get_ecosystem_solver(eco)

        ours = cucos_solver.solve([query], all_versions=True)
        upstream = solver.solve([query], all_versions=True)

        # each solver returns {package_name: [version, ...]}; extract the version sets
        ours_nums = set() if not ours else set(next(iter(ours.values())))
        upstreams_nums = set() if not upstream else set(next(iter(upstream.values())))

        return {
            'query': query,
            'detail': {
                'analysed': ours,
                'upstream': upstream,
                'difference': list(upstreams_nums - ours_nums)
            },
            'resolved_at': str(now)
        }
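

# The "difference" field above boils down to plain set arithmetic over version
# strings. A self-contained sketch with illustrative data (these version
# numbers are made up):
analysed = {'2.0.0', '2.2.1'}           # versions already analysed in our DB
upstream = {'2.0.0', '2.2.1', '2.3.0'}  # versions matching the spec upstream
assert sorted(upstream - analysed) == ['2.3.0']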
Example 12
    def get_sources(self):
        """
        :return: path to source files
        """
        if not self._eco_obj:
            self._eco_obj = Ecosystem.by_name(self._postgres.session,
                                              self.ecosystem)

        if self._eco_obj.is_backed_by(EcosystemBackend.maven):
            return self.get_extracted_source_jar()
        else:
            return self.get_extracted_source_tarball()
Example 13
    def has_sources(self):
        """
        :return: true if the given EPV has available sources
        """
        if not self._eco_obj:
            self._eco_obj = Ecosystem.by_name(self._postgres.session,
                                              self.ecosystem)

        if self._eco_obj.is_backed_by(EcosystemBackend.maven):
            return self._s3.object_exists(self._source_jar_object_key)
        else:
            self._construct_source_tarball_names()
            return self._s3.object_exists(self._source_tarball_object_key)
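

# Both methods above dispatch on the ecosystem backend: Maven artifacts ship
# sources as jars, everything else as tarballs. A self-contained sketch of the
# same decision (the enum and key names below are illustrative):
import enum

class Backend(enum.Enum):
    maven = 'maven'
    npm = 'npm'

def source_object_key(backend):
    # mirrors the is_backed_by(EcosystemBackend.maven) branching above
    return 'source.jar' if backend is Backend.maven else 'source.tar.gz'

assert source_object_key(Backend.maven) == 'source.jar'
assert source_object_key(Backend.npm) == 'source.tar.gz'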


    def setup_method(self, method):
        rdb()
        self.s = create_db_scoped_session()
        self.en = 'foo'
        self.pn = 'bar'
        self.vi = '1.1.1'
        self.e = Ecosystem(name=self.en, backend=EcosystemBackend.maven)
        self.p = Package(ecosystem=self.e, name=self.pn)
        self.v = Version(package=self.p, identifier=self.vi)
        self.a = Analysis(version=self.v, finished_at=datetime.datetime.now())
        self.a2 = Analysis(version=self.v,
                           finished_at=datetime.datetime.now() +
                           datetime.timedelta(seconds=10))
        self.s.add(self.a)
        self.s.add(self.a2)
        self.s.commit()

        self.bp = BayesianPostgres(
            connection_string=get_postgres_connection_string())


    def execute(self, arguments):
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('ecosystem'))

        # get rid of version if scheduled from the core analyses
        arguments.pop('version', None)

        db = self.storage.session
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        upstream = self.get_upstream_entry(db, package,
                                           self.get_upstream_url(arguments))
        arguments['url'] = upstream.url

        if not arguments.get('force'):
            # we could potentially schedule two flows of the same type at the
            # same time as there is no lock, but let's say that's OK
            if upstream.updated_at is not None \
                    and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
                self.log.info(
                    'Skipping upstream package check as data are considered recent - last update %s.',
                    upstream.updated_at)
                # keep track of the start, but do not schedule anything more;
                # discard changes such as updates
                db.rollback()
                return arguments

        # if this fails, it's actually OK, as there could be concurrency
        package_analysis = PackageAnalysis(package_id=package.id,
                                           started_at=datetime.datetime.now(),
                                           finished_at=None)
        db.add(package_analysis)

        # keep track of updates
        upstream.updated_at = datetime.datetime.now()

        db.commit()
        arguments['document_id'] = package_analysis.id
        return arguments
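

# The freshness check above is a plain timedelta comparison. A self-contained
# sketch (the interval value is illustrative; the task reads _UPDATE_INTERVAL
# from its configuration):
import datetime

UPDATE_INTERVAL = datetime.timedelta(days=1)

def is_recent(updated_at, now=None):
    """Return True when the last update is younger than UPDATE_INTERVAL."""
    if updated_at is None:
        return False
    now = now or datetime.datetime.now()
    return now - updated_at < UPDATE_INTERVAL

assert is_recent(None) is False
assert is_recent(datetime.datetime.now() - datetime.timedelta(hours=1)) is True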
Example 16
def _make_ecosystem(name):
    return Ecosystem(name=name, backend=getattr(EcosystemBackend, name))


    def execute(self, arguments):
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        db = self.storage.session
        e = Ecosystem.by_name(db, arguments['ecosystem'])
        p = Package.get_or_create(db,
                                  ecosystem_id=e.id,
                                  name=arguments['name'])
        v = Version.get_or_create(db,
                                  package_id=p.id,
                                  identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we scale out and 2+ workers run
            # this task, they can potentially schedule two flows of the same
            # type at the same time
            if db.query(Analysis).filter(
                    Analysis.version_id == v.id).count() > 0:
                # propagate flags that were passed to the flow, but not the E/P/V
                # arguments - this way we make sure that, for example, graph import
                # gets scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                return arguments

        cache_path = mkdtemp(dir=self.configuration.worker_data_dir)
        epv_cache = ObjectCache.get_from_dict(arguments)
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as exc:  # don't shadow the Ecosystem bound to `e` above
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{e}/{p}/{v}": {err}'
                            .format(e=arguments.get('ecosystem'),
                                    p=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(exc)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        a = Analysis(version=v,
                     access_count=1,
                     started_at=datetime.datetime.now())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id
        return arguments
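

# The force/skip logic above reduces to: start a new analysis only when no
# analysis exists for the version, or when the caller passes force=True. A
# self-contained sketch of that decision (names are illustrative):
def should_analyse(existing_analyses_count, force=False):
    return force or existing_analyses_count == 0

assert should_analyse(0) is True
assert should_analyse(3) is False
assert should_analyse(3, force=True) is True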


    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        tool_responses = {}
        result_summary = {
            'package_names': [],
            'registered_srpms': [],
            'all_rhn_channels': [],
            'all_rhsm_content_sets': [],
            'all_rhsm_product_names': []
        }
        result_data = {'status': 'error',
                       'summary': result_summary,
                       'details': tool_responses
                       }

        # bail out early; unless we have access to internal services or the package
        # is from the Maven ecosystem, we can't comment on downstream usage
        is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
        if not self._is_inside_rh() and not is_maven:
            return result_data

        self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
        res = self._fetch_anitya_project(eco, pkg)
        anitya_rpm_names = []
        anitya_mvn_names = []
        if res is None:
            result_data['status'] = 'error'
        elif res.status_code == 200:
            self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
            anitya_response = res.json()
            tool_responses['redhat_anitya'] = anitya_response
            # For now, we assume all downstreams are ones we care about
            for entry in anitya_response['packages']:
                if entry['distro'] == RH_RPM_DISTRO_NAME:
                    anitya_rpm_names.append(entry['package_name'])
                elif entry['distro'] == RH_MVN_DISTRO_NAME:
                    anitya_mvn_names.append(entry['package_name'])
                else:
                    self.log.warning(
                        'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'
                        .format(d=entry['distro'], o=entry['package_name'], p=pkg))
            self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
            self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
            # TODO: Report 'partial' here and switch to 'success' at the end
            result_data['status'] = 'success'
        else:
            msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
            self.log.error(msg.format(e=eco, p=pkg, r=res.text))
            result_data['status'] = 'error'

        if self._is_inside_rh():
            # we have candidate downstream name mappings, check them against Brew
            seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
            self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

            args = ['brew-utils-cli', '--version', arguments['version']]
            artifact_hash = self._get_artifact_hash(algorithm='sha256')
            if artifact_hash:
                args += ['--digest', artifact_hash]
            args += seed_names

            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(timeout=self._BREWUTILS_CLI_TIMEOUT,
                                                                                cmd=args))
            tc = TimedCommand(args)
            status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
            self.log.debug("status = %s, error = %s", status, error)
            output = ''.join(output)
            self.log.debug("output = %s", output)
            if not output:
                raise TaskError("Error running command %s" % args)
            brew = json.loads(output)

            result_summary['package_names'] = brew['packages']
            result_summary['registered_srpms'] = brew['response']['registered_srpms']
            tool_responses['brew'] = brew['response']['brew']

            # we have SRPM details, fetch details on where the RPMs are shipped
            tool_responses['pulp_cdn'] = pulp_responses = []
            rhn_channels = set()
            rhsm_content_sets = set()
            rhsm_product_names = set()
            for srpm_summary in result_summary['registered_srpms']:
                srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                             v=srpm_summary['version'],
                                                             r=srpm_summary['release'])
                cdn_metadata = self._get_cdn_metadata(srpm_filename)
                if cdn_metadata is None:
                    msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                    self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                    continue
                pulp_responses.append(cdn_metadata)
                srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
                rhn_channels.update(cdn_metadata['rhn_channels'])
                rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
                rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
            result_summary['all_rhn_channels'] = sorted(rhn_channels)
            result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
            result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

        self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

        return result_data
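

# Partitioning Anitya entries by distro, as the loop above does, with
# illustrative data (the distro constants and package names are made up
# for this sketch):
RPM_DISTRO = 'RPM distro name'  # stands in for RH_RPM_DISTRO_NAME
MVN_DISTRO = 'MVN distro name'  # stands in for RH_MVN_DISTRO_NAME
entries = [
    {'distro': RPM_DISTRO, 'package_name': 'python-six'},
    {'distro': MVN_DISTRO, 'package_name': 'com.example:six'},
]
rpm_names = [e['package_name'] for e in entries if e['distro'] == RPM_DISTRO]
mvn_names = [e['package_name'] for e in entries if e['distro'] == MVN_DISTRO]
assert rpm_names == ['python-six'] and mvn_names == ['com.example:six']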
Example 19
# -*- coding: utf-8 -*-
import pytest
from flexmock import flexmock
from cucoslib.object_cache import EPVCache
from cucoslib.enums import EcosystemBackend
from cucoslib.workers import LinguistTask
from cucoslib.models import Ecosystem
from cucoslib.process import IndianaJones

ECOSYSTEM = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
MODULE_NAME = 'six'
MODULE_VERSION = '1.10.0'


@pytest.mark.usefixtures("dispatcher_setup")
class TestLinguist(object):
    @pytest.mark.usefixtures("no_s3_connection")
    def test_execute(self, tmpdir):
        IndianaJones.fetch_artifact(
            ecosystem=ECOSYSTEM, artifact=MODULE_NAME,
            version=MODULE_VERSION, target_dir=str(tmpdir))

        args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
        flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
        task = LinguistTask.create_test_instance(task_name='languages')
        results = task.execute(args)

        assert results is not None
        assert isinstance(results, dict)
        assert set(results.keys()) == {'details', 'status', 'summary'}
        details = results['details']


    def get_ecosystem(self, name):
        if not self.is_connected():
            self.connect()

        return Ecosystem.by_name(PostgresBase.session, name)


# We can't rely on the digest of the npm downloaded tarball matching the
# upstream one. In that case we should probably consider downloading tarballs
# directly from registry.npmjs.org, because for example AnityaTask relies on
# the digest match.
NPM_MODULE_DIGEST = '8db082250efa45673f344bb809c7cfa1ce37ca9274de29635a40d1e7df6d6114'
PYPI_MODULE_NAME = "six"
PYPI_MODULE_VERSION = "1.0.0"
PYPI_MODULE_DIGEST = 'ca79c14c8cb5e58912d185f0e07ca9c687e232b7c68c4b73bf1c83ef5979333e'
RUBYGEMS_MODULE_NAME = "permutation"
RUBYGEMS_MODULE_VERSION = "0.1.7"
RUBYGEMS_MODULE_DIGEST = 'e715cccaccb8e2d1450fbdda85bbe84963a32e9bf612db278cbb3d6781267638'
MAVEN_MODULE_NAME = "com.rabbitmq:amqp-client"
MAVEN_MODULE_VERSION = "3.6.1"
MAVEN_MODULE_DIGEST = 'cb6cdb7de8d37cb1b15b23867435c7dbbeaa1ca4b766f434138a8b9ef131994f'


npm = Ecosystem(name='npm', backend=EcosystemBackend.npm)
pypi = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
rubygems = Ecosystem(name='rubygems', backend=EcosystemBackend.rubygems)
maven = Ecosystem(name='maven', backend=EcosystemBackend.maven)


import shutil
import subprocess
import tempfile

import pytest


@pytest.fixture
def tmpdir():
    tmp = tempfile.mkdtemp()
    yield tmp
    shutil.rmtree(tmp)


def test_git_add_and_commit_everything_with_dotgit(tmpdir):
    # if there's a .git file somewhere in the archive, we don't want it to fail adding
    subprocess.check_output(['git', 'init', str(tmpdir)], universal_newlines=True)