def test_execute_npm(self, tmpdir, npm):
    """Test the MercatorTask for the NPM ecosystem."""
    name = 'wrappy'
    version = '1.0.2'
    # Keys mercator must report for this npm package.
    required = {'homepage', 'version', 'declared_licenses', 'code_repository',
                'bug_reporting', 'description', 'name', 'author'}
    IndianaJones.fetch_artifact(npm, artifact=name, version=version,
                                target_dir=str(tmpdir))
    args = {'ecosystem': npm.name, 'name': name, 'version': version}
    flexmock(EPVCache).should_receive(
        'get_extracted_source_tarball').and_return(str(tmpdir))
    results = self.m.execute(arguments=args)
    assert isinstance(results, dict) and results
    details = results['details'][0]
    # check at least the required are there
    assert set(details.keys()) >= required
    # assert required values are truthy; generator avoids building
    # throwaway lists (the original wrapped this in list()/[...])
    assert all(details[key] for key in required)
    assert details['name'] == name
def test_execute(self, tmpdir):
    """Start the LinguistTask worker task and check its results."""
    # TODO: reduce cyclomatic complexity
    IndianaJones.fetch_artifact(
        ecosystem=ECOSYSTEM, artifact=MODULE_NAME,
        version=MODULE_VERSION, target_dir=str(tmpdir))
    args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
    flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
    task = LinguistTask.create_test_instance(task_name='languages')
    results = task.execute(args)

    assert results is not None
    assert isinstance(results, dict)
    assert set(results.keys()) == {'details', 'status', 'summary'}
    details = results['details']
    # tarball, setup.py, LICENSE, README, etc.
    assert len(details) > 3
    for entry in details:
        entry_path = entry.get('path')
        if not (entry_path and entry_path.endswith('six.py')):
            continue
        # Example entry:
        # {'output': {'language': 'Python', 'lines': '869',
        #             'mime': 'application/x-python',
        #             'sloc': '869', 'type': 'Text'},
        #  'path': 'six-1.10.0/six.py',
        #  'type': ['Python script, ASCII text executable']}
        assert set(entry.keys()) == {'output', 'path', 'type'}
        assert set(entry['output'].keys()) == {'language', 'lines', 'mime', 'sloc', 'type'}
        assert entry['output']['language'] == 'Python'
        assert entry['type'].pop().startswith('Python')
    assert results['status'] == 'success'
def test_fetch_pypi_nonexistent(self, tmpdir, pypi):
    """Test fetching of a non-existent pypi artifact."""
    target = str(Path(str(tmpdir)))
    # A made-up package/version pair must raise a "not a bug" error.
    with pytest.raises(NotABugTaskError):
        IndianaJones.fetch_artifact(
            pypi, artifact='not-in-pypi', version='1294839',
            target_dir=target)
def test_execute(self, tmpdir):
    """Run DigesterTask on a fetched pypi artifact and verify digests."""
    artifact_digest, artifact_path = IndianaJones.fetch_artifact(
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi),
        artifact=PYPI_MODULE_NAME,
        version=PYPI_MODULE_VERSION,
        target_dir=str(tmpdir))
    args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
    flexmock(EPVCache).should_receive('get_source_tarball').and_return(artifact_path)
    task = DigesterTask.create_test_instance(task_name='digests')
    results = task.execute(arguments=args)

    assert results is not None
    assert isinstance(results, dict)
    assert set(results.keys()) == {'details', 'status', 'summary'}
    digest_keys = {'sha256', 'sha1', 'md5', 'ssdeep', 'path'}
    artifact_details = None
    for details in results['details']:
        assert digest_keys.issubset(set(details.keys()))
        if details.get('artifact'):
            artifact_details = details
    # there are artifact details
    assert artifact_details is not None
    # the artifact digest which Indy returns is the same as the one from DigesterTask
    assert artifact_digest == artifact_details['sha256'] == compute_digest(artifact_path)
    assert artifact_details['path'] == 'six-1.0.0.tar.gz'
def test_fetch_rubygems_specific(self, tmpdir, rubygems, name, version, expected_digest):
    """Fetch a rubygems artifact and check digest, filename and existence."""
    digest, path = IndianaJones.fetch_artifact(
        rubygems, artifact=name, version=version, target_dir=str(tmpdir))
    gem_suffix = "{}-{}.gem".format(name, version)
    assert digest == expected_digest
    assert path.endswith(gem_suffix)
    assert osp.exists(path)
def test_fetch_go_specific(self, tmpdir, go, name, version, expected_digest):
    """Fetch a go artifact and check digest, filename and existence."""
    digest, path = IndianaJones.fetch_artifact(
        go, artifact=name, version=version, target_dir=str(tmpdir))
    tarball_suffix = '{}.tar.gz'.format(version)
    assert digest == expected_digest
    assert path.endswith(tarball_suffix)
    assert osp.exists(path)
def test_fetch_nuget_specific(self, tmpdir, nuget, name, version, expected_digest):
    """Fetch a nuget artifact and check digest, filename and existence."""
    digest, path = IndianaJones.fetch_artifact(
        nuget, artifact=name, version=version, target_dir=str(tmpdir))
    # nuget package file names are lower-cased
    nupkg_suffix = '{}.{}.nupkg'.format(name.lower(), version)
    assert digest == expected_digest
    assert path.endswith(nupkg_suffix)
    assert osp.exists(path)
def test_fetch_version_range_npm_specific(self, tmpdir, npm, name, version, expected_digest):
    """Test fetching of npm artifact with version range.

    Fetching a version *range* (not a concrete version) must raise
    NotABugTaskError.
    """
    # Keep only the statement expected to raise inside the pytest.raises
    # block: the original also ran an unrelated `npm config get cache`
    # subprocess there and bound unused names (excinfo, cache_path,
    # package_digest, path), which could mask where the exception really
    # came from.
    with pytest.raises(NotABugTaskError):
        IndianaJones.fetch_artifact(
            npm, artifact=name, version=version, target_dir=str(tmpdir))
def test_fetch_maven_specific(self, tmpdir, maven, name, version, expected_digest):
    """Fetch a maven artifact and check digest, filename and existence."""
    digest, path = IndianaJones.fetch_artifact(
        maven, artifact=name, version=version, target_dir=str(tmpdir))
    # name is 'groupId:artifactId'; only artifactId appears in the jar name
    artifact_id = name.split(':', 1)[1]
    assert digest == expected_digest
    assert path.endswith('{}-{}.jar'.format(artifact_id, version))
    assert osp.exists(path)
def test_fetch_rubygems_specific(self, tmpdir, rubygems, name, version, expected_digest):
    """Test fetching of rubygems artifact."""
    digest, gem_path = IndianaJones.fetch_artifact(
        rubygems, artifact=name, version=version, target_dir=str(tmpdir))
    assert digest == expected_digest
    gem_path = Path(gem_path)
    assert gem_path.name == "{}-{}.gem".format(name, version)
    assert gem_path.exists()
def test_fetch_go_specific(self, tmpdir, go, name, version, expected_digest):
    """Test fetching of go artifact."""
    digest, tarball = IndianaJones.fetch_artifact(
        go, artifact=name, version=version, target_dir=str(tmpdir))
    assert digest == expected_digest
    tarball = Path(tarball)
    assert tarball.name == '{}.tar.gz'.format(version)
    assert tarball.exists()
def test_fetch_nuget_specific(self, tmpdir, nuget, name, version, expected_digest):
    """Test fetching of nuget artifact."""
    digest, nupkg = IndianaJones.fetch_artifact(
        nuget, artifact=name, version=version, target_dir=str(tmpdir))
    assert digest == expected_digest
    nupkg = Path(nupkg)
    # nuget package file names are lower-cased
    assert nupkg.name == '{}.{}.nupkg'.format(name.lower(), version)
    assert nupkg.exists()
def test_fetch_pypi_specific(self, tmpdir, pypi, name, version, expected_digest):
    """Test fetching of pypi artifact."""
    digest, path = IndianaJones.fetch_artifact(
        pypi, artifact=name, version=version, target_dir=str(tmpdir))
    assert digest == expected_digest
    # more than one entry is expected in the target dir after the fetch
    assert len(os.listdir(str(tmpdir))) > 1
    matches = glob.glob(osp.join(str(tmpdir), "{}-{}*".format(name, version)))
    # Fail with a clear message instead of an opaque IndexError from
    # .pop() on an empty list when no file matched.
    assert matches, "no artifact matching {}-{}* in {}".format(name, version, tmpdir)
    assert osp.exists(matches.pop())
def _download_pom_xml(target, ecosystem, arguments):
    """Fetch the pom of the artifact in *arguments* and rename it to pom.xml."""
    coords = MavenCoordinates.from_str(arguments['name'])
    coords.packaging = 'pom'
    coords.classifier = ''  # pom.xml files have no classifiers
    IndianaJones.fetch_artifact(
        ecosystem=ecosystem,
        artifact=coords.to_str(omit_version=True),
        version=arguments['version'],
        target_dir=target)

    # pom has to be named precisely pom.xml, otherwise mercator's Java handler
    # which uses maven as subprocess won't see it
    fetched_pom = os.path.join(
        target, '{}-{}.pom'.format(coords.artifactId, arguments['version']))
    pom_xml_path = os.path.join(target, 'pom.xml')
    os.rename(fetched_pom, pom_xml_path)
    return pom_xml_path
def test_fetch_pypi_specific(self, tmpdir, pypi, name, version, expected_digest):
    """Test fetching of pypi artifact."""
    target = Path(str(tmpdir))
    digest, path = IndianaJones.fetch_artifact(
        pypi, artifact=name, version=version, target_dir=str(target))
    assert digest == expected_digest
    # more than one entry is expected in the target dir after the fetch
    assert len(list(target.iterdir())) > 1
    whl_path = next(target.glob("{}-{}*".format(name, version)))
    assert whl_path.exists()
def test_fetch_maven_specific(self, tmpdir, maven, name, version, expected_digest):
    """Test fetching of maven artifact."""
    digest, jar = IndianaJones.fetch_artifact(
        maven, artifact=name, version=version, target_dir=str(tmpdir))
    # name is 'groupId:artifactId'; only artifactId appears in the jar name
    artifactId = name.split(':', 1)[1]
    assert digest == expected_digest
    jar = Path(jar)
    assert jar.name == '{}-{}.jar'.format(artifactId, version)
    assert jar.exists()
def test_fetch_npm_specific(self, tmpdir, npm, name, version, expected_digest):
    """Fetch an npm artifact and verify digest plus NPM-cache side effects."""
    cache_path = subprocess.check_output(
        ["npm", "config", "get", "cache"], universal_newlines=True).strip()
    assert ".npm" in cache_path
    package_digest, path = IndianaJones.fetch_artifact(
        npm, artifact=name, version=version, target_dir=str(tmpdir))
    cached_versions = glob.glob(osp.join(cache_path, name, "*"))
    assert len(cached_versions) == 1, \
        "there should be just one version of the artifact in the NPM cache"
    assert package_digest == expected_digest
    assert osp.exists(path)
    assert osp.exists(osp.join(cache_path, name, version))
    assert osp.exists(osp.join(str(tmpdir), "package.tgz"))
def test_fetch_npm_specific(self, tmpdir, npm, name, version, expected_digest):
    """Test fetching of npm artifact."""
    npm_cache = Path(subprocess.check_output(
        ["npm", "config", "get", "cache"], universal_newlines=True).strip())
    assert npm_cache.name == ".npm"
    package_digest, path = IndianaJones.fetch_artifact(
        npm, artifact=name, version=version, target_dir=str(tmpdir))
    cached_versions = list((npm_cache / name).glob('*'))
    assert len(cached_versions) == 1, \
        "there should be just one version of the artifact in the NPM cache"
    assert package_digest == expected_digest
    assert Path(path).exists()
    assert (npm_cache / name / version).exists()
    assert Path(str(tmpdir / "package.tgz")).exists()
def _transform_licenses(self):
    """Fill self._data['declared_licenses'] from the raw 'LicenseUrl' field.

    Records the bare URL first as a fallback, then tries to replace it
    with concrete license names detected by scancode.
    """
    if self._raw_data.get('LicenseUrl'):
        from f8a_worker.process import IndianaJones  # download_file
        # It's here due to circular dependencies
        from f8a_worker.workers import LicenseCheckTask  # run_scancode
        # Fallback value: at minimum keep the license URL itself.
        self._data['declared_licenses'] = [self._raw_data['LicenseUrl']]
        with TemporaryDirectory() as tmpdir:
            try:
                # Get file from 'LicenseUrl' and let LicenseCheckTask decide what license it is
                if IndianaJones.download_file(self._raw_data['LicenseUrl'], tmpdir):
                    scancode_results = LicenseCheckTask.run_scancode(tmpdir)
                    if scancode_results.get('summary', {}).get('sure_licenses'):
                        self._data['declared_licenses'] = \
                            scancode_results['summary']['sure_licenses']
            except Exception:
                # Don't raise if IndianaJones or LicenseCheckTask fail
                # (deliberate best-effort enrichment).
                pass
def _download_source_jar(target, ecosystem, arguments):
    """Fetch a maven sources jar for the artifact in *arguments* into *target*.

    Tries the 'sources' classifier first, then 'src'; the fetch error is
    re-raised only after the last candidate classifier has also failed.
    """
    artifact_coords = MavenCoordinates.from_str(arguments['name'])
    artifact_coords.packaging = 'jar'  # source is always jar even for war/aar etc.
    sources_classifiers = ['sources', 'src']

    # NOTE(review): if the coordinates already carry a sources classifier,
    # the loop below is skipped and the function implicitly returns None
    # without fetching anything — confirm callers expect that.
    if artifact_coords.classifier not in sources_classifiers:
        for sources_classifier in sources_classifiers:
            artifact_coords.classifier = sources_classifier
            try:
                _, source_jar_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=artifact_coords.to_str(omit_version=True),
                    version=arguments['version'],
                    target_dir=target)
            except Exception:
                if sources_classifier == sources_classifiers[-1]:
                    # fetching of all variants failed
                    raise
            else:
                return source_jar_path
def _handle_dotnet_solution(self, data):
    """Handle nuget package metadata.

    Transforms the nuget 'Metadata' dict into the normalized package
    schema used by this worker; returns {} when no metadata is present.
    """
    # TODO: reduce cyclomatic complexity
    if not data.get('Metadata'):
        return {}
    data = data['Metadata']
    # (source_key,) keeps the name; (source_key, target_key) renames it.
    key_map = (('Id', 'name'),
               ('Description', ),
               ('ProjectUrl', 'homepage'),
               # ('Summary',), ('Copyright',),
               # ('RequireLicenseAcceptance', 'require_license_acceptance'),
               )
    transformed = self.transform_keys(data, key_map)
    if data.get('Authors'):
        transformed['author'] = ','.join(data['Authors'])
    if data.get('LicenseUrl'):
        from f8a_worker.process import IndianaJones  # download_file
        # It's here due to circular dependencies
        from f8a_worker.workers import LicenseCheckTask  # run_scancode
        # Fallback value: at minimum keep the license URL itself.
        transformed['declared_licenses'] = [data['LicenseUrl']]
        with TemporaryDirectory() as tmpdir:
            try:
                # Get file from 'LicenseUrl' and let LicenseCheckTask decide what license it is
                if IndianaJones.download_file(data['LicenseUrl'], tmpdir):
                    scancode_results = LicenseCheckTask.run_scancode(tmpdir)
                    if scancode_results.get('summary', {}).get('sure_licenses'):
                        transformed['declared_licenses'] = \
                            scancode_results['summary']['sure_licenses']
            except Exception:
                # Don't raise if IndianaJones or LicenseCheckTask fail
                pass
    # transform
    # "DependencyGroups": [
    #     {
    #         "Packages": [
    #             {
    #                 "Id": "NETStandard.Library",
    #                 "VersionRange": {"OriginalString": "1.6.0"}
    #             }
    #         ]
    #     }
    # ]
    # to ["NETStandard.Library 1.6.0"]
    deps = set()
    for dep_group in data.get('DependencyGroups', []):
        for package in dep_group.get('Packages', []):
            deps.add('{} {}'.format(
                package.get('Id', ''),
                package.get('VersionRange', {}).get('OriginalString', '')))
    if deps:
        transformed['dependencies'] = list(deps)
    repository = data.get('Repository')
    if isinstance(repository, dict) and repository:
        transformed['code_repository'] = {
            'type': repository.get('Type'),
            'url': repository.get('Url')
        }
    elif 'ProjectUrl' in data:
        # No explicit repository info; try to recognize a GitHub project URL.
        transformed['code_repository'] = self._identify_gh_repo(data['ProjectUrl'])
    version = data.get('Version')
    if isinstance(version, dict) and version:
        # Assemble "Major.Minor.Patch" from the structured version dict.
        transformed['version'] = '{}.{}.{}'.format(
            version.get('Major', ''),
            version.get('Minor', ''),
            version.get('Patch', ''))
    if data.get('Tags'):
        transformed['keywords'] = self._split_keywords(data['Tags'])
    return transformed
def execute(self, arguments):
    """Task code.

    Initializes an analysis flow: ensures Package/Version rows exist,
    pre-populates the EPV cache with the source tarball (and, for maven,
    source jar + pom.xml), and records a new Analysis row.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('ecosystem'))

    # make sure we store package name based on ecosystem package naming case sensitivity
    arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we will scale and there will be
        # 2+ workers running this task they can potentially schedule two
        # flows of a same type at the same time
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            # we need to propagate flags that were passed to flow, but not
            # E/P/V - this way we are sure that for example graph import is
            # scheduled (arguments['force_graph_sync'] == True)
            arguments.pop('name')
            arguments.pop('version')
            arguments.pop('ecosystem')
            self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    # Temporary working directory; populated into the EPV cache below.
    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)
    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path
            )
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    # Source jar is optional enrichment; log and continue.
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                        format(n=arguments.get('name'),
                               v=arguments.get('version'),
                               err=str(e))
                    )

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()
    arguments['document_id'] = a.id

    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
def execute(self, arguments):
    """Task code.

    Initializes an analysis flow: validates/normalizes arguments, rejects
    bad versions and private packages, ensures Package/Version rows exist,
    pre-populates the EPV cache, and records a new Analysis row.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))
    self._strict_assert(isinstance(arguments.get('version'), str))

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    # make sure we store package name in its normalized form
    arguments['name'] = normalize_package_name(ecosystem.backend.name, arguments['name'])

    # Reject versions matching the module-level ignore pattern.
    if len(pattern_ignore.findall(arguments['version'])) > 0:
        self.log.info("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))
        raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))

    # Dont try ingestion for private packages
    if is_pkg_public(arguments['ecosystem'], arguments['name']):
        self.log.info("Ingestion flow for {} {}".format(
            arguments['ecosystem'], arguments['name']))
    else:
        self.log.info("Private package ingestion ignored {} {}".format(
            arguments['ecosystem'], arguments['name']))
        raise NotABugFatalTaskError("Private package alert {} {}".format(
            arguments['ecosystem'], arguments['name']))

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        if db.query(Analysis).filter(
                Analysis.version_id == v.id).count() > 0:
            # Flag the existing analysis instead of re-scheduling one.
            arguments['analysis_already_exists'] = True
            self.log.debug(
                "Arguments returned by initAnalysisFlow without force: {}".
                format(arguments))
            return arguments

    # Temporary working directory; populated into the EPV cache below.
    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)
    npm_dir = self.configuration.NPM_DATA_DIR
    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    # Source jar is optional enrichment; log and continue.
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                        .format(n=arguments.get('name'),
                                v=arguments.get('version'),
                                err=str(e)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(
                    cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)
        # NOTE(review): npm data dir cleanup appears grouped with the
        # cache cleanup — confirm it belongs in this finally block.
        if arguments['ecosystem'] == "npm":
            shutil.rmtree(npm_dir, True)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()
    arguments['document_id'] = a.id

    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug(
        "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments