def analyses(app):
    # npm/arrify/1.0.1 with a single finished analysis
    e1 = Ecosystem(name='npm', backend=EcosystemBackend.npm)
    p1 = Package(ecosystem=e1, name='arrify')
    v1 = Version(package=p1, identifier='1.0.1')
    model1 = Analysis(version=v1, started_at=now, finished_at=later)
    app.rdb.session.add(model1)

    # pypi/flexmock/0.10.1 with two analyses and associated worker results
    e2 = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    p2 = Package(ecosystem=e2, name='flexmock')
    v2 = Version(package=p2, identifier='0.10.1')
    model2 = Analysis(version=v2, started_at=later, access_count=1)
    app.rdb.session.add(model2)
    app.rdb.session.commit()

    worker_results2 = {'a': 'b', 'c': 'd', 'e': 'f', 'g': 'h', 'i': 'j',
                       'digests': {'details': [{'artifact': True,
                                                'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results2.items():
        app.rdb.session.add(WorkerResult(analysis_id=model2.id, worker=w, task_result=tr))

    model3 = Analysis(version=v2, started_at=later, access_count=1,
                      audit={'audit': {'audit': 'audit', 'e': 'f', 'g': 'h'}, 'a': 'b', 'c': 'd'})
    app.rdb.session.add(model3)
    app.rdb.session.commit()

    worker_results3 = {'digests': {'details': [{'artifact': True,
                                                'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results3.items():
        app.rdb.session.add(WorkerResult(analysis_id=model3.id, worker=w, task_result=tr))
    app.rdb.session.commit()

    return (model1, model2, model3)
def fill_analyses(app):
    ecosystems = [
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi,
                  url='https://pypi.python.org/', fetch_url='https://pypi.python.org/pypi'),
        Ecosystem(name='npm', backend=EcosystemBackend.npm,
                  url='https://www.npmjs.com/', fetch_url='https://registry.npmjs.org/'),
        Ecosystem(name='go', backend=EcosystemBackend.scm),
    ]

    packages = [
        Package(name='flexmock', ecosystem=ecosystems[0]),
        Package(name='requests', ecosystem=ecosystems[0]),
        Package(name='sequence', ecosystem=ecosystems[1]),
        Package(name='arrify', ecosystem=ecosystems[1]),
        Package(name='serve-static', ecosystem=ecosystems[1]),
    ]

    versions = [
        Version(identifier='0.10.1', package=packages[0]),
        Version(identifier='0.9.1', package=packages[0]),
        Version(identifier='2.0.0', package=packages[1]),
        Version(identifier='2.2.1', package=packages[2]),
        Version(identifier='1.0.1', package=packages[3]),
        Version(identifier='1.7.1', package=packages[4]),
    ]

    analyses = [
        Analysis(version=versions[0], started_at=now),                     # pypi/flexmock/0.10.1
        Analysis(version=versions[0], started_at=later, access_count=1),   # pypi/flexmock/0.10.1
        Analysis(version=versions[1], started_at=even_later),              # pypi/flexmock/0.9.1
        Analysis(version=versions[2], started_at=now),                     # pypi/requests/2.0.0
        Analysis(version=versions[3], started_at=later),                   # npm/sequence/2.2.1
        Analysis(version=versions[4], started_at=now, finished_at=later),  # npm/arrify/1.0.1
        Analysis(version=versions[5], started_at=now, finished_at=later,
                 release='npm:serve-static:1.7.1'),                        # npm/serve-static/1.7.1
    ]

    # worker results that correspond to analyses above
    worker_results = [
        WorkerResult(worker='digests', analysis=analyses[1],
                     task_result={'details': [{'artifact': True,
                                               'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}),
        WorkerResult(worker='static_analysis', task_result={'details': []}, analysis=analyses[1]),
        WorkerResult(worker='source_licenses',
                     task_result={'schema': {'name': 'source_licenses', 'version': '1-0-0'}},
                     analysis=analyses[1]),
    ]

    package_gh_usage = [
        PackageGHUsage(name='arrify', count=100, ecosystem_backend='npm'),
    ]

    for a in ecosystems + packages + versions + analyses + worker_results + package_gh_usage:
        app.rdb.session.add(a)
    app.rdb.session.commit()

    return (ecosystems, packages, versions, analyses, worker_results, package_gh_usage)
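# Hedged usage sketch (hypothetical test, not part of the original suite): fill_analyses()
# returns everything it created, so a test can unpack the tuple and assert on the fixture data.
def test_fill_analyses_counts(app):
    ecosystems, packages, versions, analyses, worker_results, package_gh_usage = fill_analyses(app)
    assert len(ecosystems) == 3
    assert len(packages) == 5
    assert len(versions) == 6
    assert len(analyses) == 7
    assert len(worker_results) == 3
    assert len(package_gh_usage) == 1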
def test_execute_with_mock_anitya(self, ecosystem, project, md5sum, dist_git):
    rdb()
    s = create_db_scoped_session()

    dummy_homepage = "http://project-homepage.com"
    dummy_response = Response()
    dummy_response.status_code = 200

    s.add(Ecosystem(name='npm', backend=EcosystemBackend.npm))
    s.commit()
    DownstreamMapCache()[md5sum] = dist_git  # fill in key-value mapping in cache

    task = AnityaTask.create_test_instance(task_name='anitya')
    args = {'ecosystem': ecosystem, 'name': project}
    flexmock(task).should_receive("_get_project_homepage").once().and_return(dummy_homepage)
    flexmock(task).should_receive("_get_artifact_hash").once().and_return(md5sum)
    flexmock(task).should_receive("_create_anitya_project").once().and_return(dummy_response)
    flexmock(task).should_receive("_add_downstream_mapping").once().and_return(dummy_response)

    results = task.execute(arguments=args)
    assert results is None
def pypi(rdb):
    pypi = Ecosystem(name='pypi', backend=EcosystemBackend.pypi,
                     fetch_url='https://pypi.python.org/pypi')
    rdb.add(pypi)
    rdb.commit()
    return pypi
def test_execute(self, tmpdir):
    artifact_digest, artifact_path = IndianaJones.fetch_artifact(
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi),
        artifact=PYPI_MODULE_NAME, version=PYPI_MODULE_VERSION, target_dir=str(tmpdir))

    args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
    flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
    flexmock(EPVCache).should_receive('get_source_tarball').and_return(artifact_path)
    task = DigesterTask.create_test_instance(task_name='digests')
    results = task.execute(arguments=args)

    assert results is not None
    assert isinstance(results, dict)
    assert set(results.keys()) == {'details', 'status', 'summary'}

    artifact_details = None
    for details in results['details']:
        assert {'sha256', 'sha1', 'md5', 'ssdeep', 'path'}.issubset(set(details.keys()))
        if details.get('artifact'):
            artifact_details = details
    # there are artifact details
    assert artifact_details is not None
    # the artifact digest which IndianaJones returns is the same as the one from DigesterTask
    assert artifact_digest == artifact_details['sha256'] == compute_digest(artifact_path)
    assert artifact_details['path'] == 'six-1.0.0.tar.gz'
def maven(rdb):
    maven = Ecosystem(name='maven', backend=EcosystemBackend.maven, fetch_url='')
    rdb.add(maven)
    rdb.commit()
    return maven
def npm(rdb):
    npm = Ecosystem(name='npm', backend=EcosystemBackend.npm,
                    fetch_url='https://registry.npmjs.org/')
    rdb.add(npm)
    rdb.commit()
    return npm
def test_execute(self, tmpdir):
    npm = Ecosystem(name='npm', backend=EcosystemBackend.npm)
    flexmock(self.m.storage).should_receive('get_ecosystem').with_args('npm').and_return(npm)

    name = 'wrappy'
    version = '1.0.2'
    required = {'homepage', 'version', 'declared_license', 'code_repository',
                'bug_reporting', 'description', 'name', 'author'}

    IndianaJones.fetch_artifact(npm, artifact=name, version=version, target_dir=str(tmpdir))

    args = {'ecosystem': npm.name, 'name': 'foo', 'version': 'bar'}
    flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
    results = self.m.execute(arguments=args)

    assert results is not None
    assert isinstance(results, dict)
    details = results['details'][0]
    assert required.issubset(set(details.keys()))  # check at least the required are there
    assert all([details[key] for key in list(required)])  # assert required are not None
    assert details['name'] == name
def rubygems(rdb):
    rubygems = Ecosystem(name='rubygems', backend=EcosystemBackend.rubygems,
                         fetch_url='https://rubygems.org/api/v1')
    rdb.add(rubygems)
    rdb.commit()
    return rubygems
def fill_packages_for_paging(app, request):
    e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    app.rdb.session.add(e)
    for p in range(0, 11):
        app.rdb.session.add(Package(ecosystem=e, name=str(p)))
    app.rdb.session.commit()
def get(self, ecosystem):
    query = request.args.get('q')
    eco = Ecosystem.by_name(rdb.session, ecosystem)
    fetcher = CucosReleasesFetcher(eco, rdb.session)
    now = datetime.datetime.now()

    # Instantiate two different solvers: one uses a custom fetcher to fetch matching
    # releases from the Bayesian DB, the other one fetches from upstream repositories.
    # The data from these two solvers then tell us:
    # 1) which packages in the range we have already analysed and have information about
    # 2) which other packages from upstream repositories match the version specification
    cucos_solver, solver = get_ecosystem_solver(eco, with_fetcher=fetcher),\
                           get_ecosystem_solver(eco)

    ours = cucos_solver.solve([query], all_versions=True)
    upstream = solver.solve([query], all_versions=True)

    ours_nums = set() if not ours else set(next(iter(ours.values())))
    upstreams_nums = set() if not upstream else set(next(iter(upstream.values())))

    return {
        'query': query,
        'detail': {
            'analysed': ours,
            'upstream': upstream,
            'difference': list(upstreams_nums - ours_nums),
        },
        'resolved_at': str(now),
    }
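# Hedged illustration (not from the original source): the endpoint above returns a structure
# shaped like the dictionary below; the package name, version numbers and timestamp are
# hypothetical and only show how 'difference' relates to 'analysed' and 'upstream'.
EXAMPLE_RESPONSE = {
    'query': 'flexmock',
    'detail': {
        'analysed': {'flexmock': ['0.10.1']},            # versions already analysed in the Bayesian DB
        'upstream': {'flexmock': ['0.10.1', '0.10.2']},  # versions resolved from the upstream repository
        'difference': ['0.10.2'],                        # upstream versions not analysed yet
    },
    'resolved_at': '2017-01-01 12:00:00',
}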
def get_sources(self):
    """
    :return: path to source files
    """
    if not self._eco_obj:
        self._eco_obj = Ecosystem.by_name(self._postgres.session, self.ecosystem)

    if self._eco_obj.is_backed_by(EcosystemBackend.maven):
        return self.get_extracted_source_jar()
    else:
        return self.get_extracted_source_tarball()
def has_sources(self):
    """
    :return: true if the given EPV has available sources
    """
    if not self._eco_obj:
        self._eco_obj = Ecosystem.by_name(self._postgres.session, self.ecosystem)

    if self._eco_obj.is_backed_by(EcosystemBackend.maven):
        return self._s3.object_exists(self._source_jar_object_key)
    else:
        self._construct_source_tarball_names()
        return self._s3.object_exists(self._source_tarball_object_key)
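# Hedged usage sketch (not part of the original module): a caller would typically guard
# get_sources() with has_sources(); `epv_cache` is a hypothetical EPVCache-like instance.
def extracted_sources_or_none(epv_cache):
    # Return the path to extracted sources (source jar for Maven, tarball otherwise), or None.
    if epv_cache.has_sources():
        return epv_cache.get_sources()
    return None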
def setup_method(self, method):
    rdb()
    self.s = create_db_scoped_session()
    self.en = 'foo'
    self.pn = 'bar'
    self.vi = '1.1.1'

    self.e = Ecosystem(name=self.en, backend=EcosystemBackend.maven)
    self.p = Package(ecosystem=self.e, name=self.pn)
    self.v = Version(package=self.p, identifier=self.vi)
    self.a = Analysis(version=self.v, finished_at=datetime.datetime.now())
    self.a2 = Analysis(version=self.v,
                       finished_at=datetime.datetime.now() + datetime.timedelta(seconds=10))
    self.s.add(self.a)
    self.s.add(self.a2)
    self.s.commit()

    self.bp = BayesianPostgres(connection_string=get_postgres_connection_string())
def execute(self, arguments):
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)

    db = self.storage.session
    ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    package = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    upstream = self.get_upstream_entry(db, package, self.get_upstream_url(arguments))
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # we can potentially schedule two flows of the same type at the same time as there is
        # no lock, but let's say that is OK
        if upstream.updated_at is not None \
                and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info(
                'Skipping upstream package check as data are considered recent - last update %s.',
                upstream.updated_at)
            # keep track of the start, but do not schedule anything more;
            # discard changes such as updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(package_id=package.id,
                                       started_at=datetime.datetime.now(),
                                       finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.now()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
def _make_ecosystem(name):
    return Ecosystem(name=name, backend=getattr(EcosystemBackend, name))
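# Hedged example (assumes the EcosystemBackend enum member is named exactly like the
# ecosystem, which is what _make_ecosystem() relies on): the two objects below are equivalent.
npm_via_helper = _make_ecosystem('npm')
npm_direct = Ecosystem(name='npm', backend=EcosystemBackend.npm)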
def execute(self, arguments):
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('ecosystem'))

    db = self.storage.session
    e = Ecosystem.by_name(db, arguments['ecosystem'])
    p = Package.get_or_create(db, ecosystem_id=e.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we scale up and there are 2+ workers running this
        # task, they can potentially schedule two flows of the same type at the same time
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            # we need to propagate flags that were passed to the flow, but not E/P/V - this
            # way we are sure that, for example, graph import is scheduled
            # (arguments['force_graph_sync'] == True)
            arguments.pop('name')
            arguments.pop('version')
            arguments.pop('ecosystem')
            return arguments

    cache_path = mkdtemp(dir=self.configuration.worker_data_dir)
    epv_cache = ObjectCache.get_from_dict(arguments)
    ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as exc:
                    # use `exc`, not `e`, so we do not shadow the Ecosystem instance above
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{e}/{p}/{v}": {err}'.format(
                            e=arguments.get('ecosystem'),
                            p=arguments.get('name'),
                            v=arguments.get('version'),
                            err=str(exc)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up the cache
        shutil.rmtree(cache_path)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.now())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id
    return arguments
def execute(self, arguments):
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    tool_responses = {}
    result_summary = {
        'package_names': [],
        'registered_srpms': [],
        'all_rhn_channels': [],
        'all_rhsm_content_sets': [],
        'all_rhsm_product_names': []
    }
    result_data = {'status': 'error',
                   'summary': result_summary,
                   'details': tool_responses
                   }

    # bail out early; unless we have access to internal services or the package comes from the
    # Maven ecosystem, we can't comment on downstream usage
    is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
    if not self._is_inside_rh() and not is_maven:
        return result_data

    self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
    res = self._fetch_anitya_project(eco, pkg)
    anitya_rpm_names = []
    anitya_mvn_names = []
    if res is None:
        result_data['status'] = 'error'
    elif res.status_code == 200:
        self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
        anitya_response = res.json()
        tool_responses['redhat_anitya'] = anitya_response
        # For now, we assume all downstreams are ones we care about
        for entry in anitya_response['packages']:
            if entry['distro'] == RH_RPM_DISTRO_NAME:
                anitya_rpm_names.append(entry['package_name'])
            elif entry['distro'] == RH_MVN_DISTRO_NAME:
                anitya_mvn_names.append(entry['package_name'])
            else:
                self.log.warning(
                    'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                    format(d=entry['distro'], o=entry['package_name'], p=pkg)
                )
        self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
        self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
        # TODO: Report 'partial' here and switch to 'success' at the end
        result_data['status'] = 'success'
    else:
        msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
        self.log.error(msg.format(e=eco, p=pkg, r=res.text))
        result_data['status'] = 'error'

    if self._is_inside_rh():
        # we have candidate downstream name mappings, check them against Brew
        seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
        self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

        args = ['brew-utils-cli', '--version', arguments['version']]
        artifact_hash = self._get_artifact_hash(algorithm='sha256')
        if artifact_hash:
            args += ['--digest', artifact_hash]
        args += seed_names

        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._BREWUTILS_CLI_TIMEOUT, cmd=args))
        tc = TimedCommand(args)
        status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
        self.log.debug("status = %s, error = %s", status, error)
        output = ''.join(output)
        self.log.debug("output = %s", output)
        if not output:
            raise TaskError("Error running command %s" % args)
        brew = json.loads(output)

        result_summary['package_names'] = brew['packages']
        result_summary['registered_srpms'] = brew['response']['registered_srpms']
        tool_responses['brew'] = brew['response']['brew']

        # we have SRPM details, fetch details on where the RPMs are shipped
        tool_responses['pulp_cdn'] = pulp_responses = []
        rhn_channels = set()
        rhsm_content_sets = set()
        rhsm_product_names = set()

        for srpm_summary in result_summary['registered_srpms']:
            srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                         v=srpm_summary['version'],
                                                         r=srpm_summary['release'])
            cdn_metadata = self._get_cdn_metadata(srpm_filename)
            if cdn_metadata is None:
                msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                continue
            pulp_responses.append(cdn_metadata)
            srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
            rhn_channels.update(cdn_metadata['rhn_channels'])
            rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
            rhsm_product_names.update(cdn_metadata['rhsm_product_names'])

        result_summary['all_rhn_channels'] = sorted(rhn_channels)
        result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
        result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

    self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

    return result_data
# -*- coding: utf-8 -*-

import pytest
from flexmock import flexmock

from cucoslib.object_cache import EPVCache
from cucoslib.enums import EcosystemBackend
from cucoslib.workers import LinguistTask
from cucoslib.models import Ecosystem
from cucoslib.process import IndianaJones

ECOSYSTEM = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
MODULE_NAME = 'six'
MODULE_VERSION = '1.10.0'


@pytest.mark.usefixtures("dispatcher_setup")
class TestLinguist(object):
    @pytest.mark.usefixtures("no_s3_connection")
    def test_execute(self, tmpdir):
        IndianaJones.fetch_artifact(
            ecosystem=ECOSYSTEM, artifact=MODULE_NAME,
            version=MODULE_VERSION, target_dir=str(tmpdir))

        args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
        flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
        task = LinguistTask.create_test_instance(task_name='languages')
        results = task.execute(args)

        assert results is not None
        assert isinstance(results, dict)
        assert set(results.keys()) == {'details', 'status', 'summary'}
        details = results['details']
def get_ecosystem(self, name):
    if not self.is_connected():
        self.connect()
    return Ecosystem.by_name(PostgresBase.session, name)
# rely on the digest of the npm downloaded tarball matching the upstream one.
# In that case we should probably consider downloading tarballs directly from registry.npmjs.org,
# because for example AnityaTask relies on this.
NPM_MODULE_DIGEST = '8db082250efa45673f344bb809c7cfa1ce37ca9274de29635a40d1e7df6d6114'
PYPI_MODULE_NAME = "six"
PYPI_MODULE_VERSION = "1.0.0"
PYPI_MODULE_DIGEST = 'ca79c14c8cb5e58912d185f0e07ca9c687e232b7c68c4b73bf1c83ef5979333e'
RUBYGEMS_MODULE_NAME = "permutation"
RUBYGEMS_MODULE_VERSION = "0.1.7"
RUBYGEMS_MODULE_DIGEST = 'e715cccaccb8e2d1450fbdda85bbe84963a32e9bf612db278cbb3d6781267638'
MAVEN_MODULE_NAME = "com.rabbitmq:amqp-client"
MAVEN_MODULE_VERSION = "3.6.1"
MAVEN_MODULE_DIGEST = 'cb6cdb7de8d37cb1b15b23867435c7dbbeaa1ca4b766f434138a8b9ef131994f'

npm = Ecosystem(name='npm', backend=EcosystemBackend.npm)
pypi = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
rubygems = Ecosystem(name='rubygems', backend=EcosystemBackend.rubygems)
maven = Ecosystem(name='maven', backend=EcosystemBackend.maven)


@pytest.fixture
def tmpdir():
    tmp = tempfile.mkdtemp()
    yield tmp
    shutil.rmtree(tmp)


def test_git_add_and_commit_everything_with_dotgit(tmpdir):
    # if there's a .git file somewhere in the archive, we don't want it to fail adding
    subprocess.check_output(['git', 'init', str(tmpdir)], universal_newlines=True)