def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)
    arguments.pop('document_id', None)

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    package = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    url = self.get_upstream_url(arguments)
    upstream = self.get_upstream_entry(package, url)
    if upstream is None:
        upstream = self.add_or_update_upstream(package, url)
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # two flows of the same type can potentially be scheduled at the same
        # time as there is no lock, but let's say it's OK
        if upstream.updated_at is not None \
                and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info('Skipping upstream package check as data are considered recent - '
                          'last update %s.', upstream.updated_at)
            # keep track of the start, but do not schedule anything more;
            # discard changes like updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(
        package_id=package.id,
        started_at=datetime.datetime.utcnow(),
        finished_at=None
    )
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.utcnow()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)

    db = self.storage.session
    ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    package = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    upstream = self.get_upstream_entry(db, package, self.get_upstream_url(arguments))
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # two flows of the same type can potentially be scheduled at the same
        # time as there is no lock, but let's say it's OK
        if upstream.updated_at is not None \
                and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info('Skipping upstream package check as data are considered recent - '
                          'last update %s.', upstream.updated_at)
            # keep track of the start, but do not schedule anything more;
            # discard changes like updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(package_id=package.id,
                                       started_at=datetime.datetime.now(),
                                       finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.now()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
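# A minimal, self-contained sketch (not part of the tasks above) of the recency gate they
# implement: the elapsed time must be computed as "now - updated_at"; subtracting the other
# way yields a negative delta that is always below the interval, so every check would be
# skipped. UPDATE_INTERVAL and is_recent() below are illustrative assumptions, not the
# worker's real names.
import datetime

UPDATE_INTERVAL = datetime.timedelta(days=1)  # hypothetical re-check interval


def is_recent(updated_at, now=None):
    """Return True when the last update is newer than UPDATE_INTERVAL."""
    if updated_at is None:
        return False
    now = now or datetime.datetime.utcnow()
    return now - updated_at < UPDATE_INTERVAL


# Example: an entry updated two hours ago is still "recent", so the flow would be skipped;
# one updated three days ago is stale and would be re-checked.
print(is_recent(datetime.datetime.utcnow() - datetime.timedelta(hours=2)))  # True
print(is_recent(datetime.datetime.utcnow() - datetime.timedelta(days=3)))   # False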
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('ecosystem'))

    # make sure we store the package name based on the ecosystem's package naming case sensitivity
    arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we scale and there are 2+ workers
        # running this task, they can potentially schedule two flows of the
        # same type at the same time
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            # we need to propagate flags that were passed to the flow, but not
            # E/P/V - this way we are sure that for example graph import is
            # scheduled (arguments['force_graph_sync'] == True)
            arguments.pop('name')
            arguments.pop('version')
            arguments.pop('ecosystem')
            self.log.debug("Arguments returned by InitAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path
            )
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.format(
                            n=arguments.get('name'),
                            v=arguments.get('version'),
                            err=str(e)
                        )
                    )

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id

    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
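# A minimal, self-contained sketch (under assumed names) of the temporary-cache pattern the
# task above relies on: create a scratch directory with mkdtemp(), do the downloads inside a
# try block, and remove the directory in finally so it is cleaned up even when a fetch fails.
import shutil
from tempfile import mkdtemp


def with_scratch_dir(work):
    """Run work(path) in a throwaway directory that is always removed afterwards."""
    cache_path = mkdtemp()
    try:
        return work(cache_path)
    finally:
        # always clean up cache, even if work() raised
        shutil.rmtree(cache_path)


# Example usage: the callable only sees the path while it exists.
print(with_scratch_dir(lambda path: "downloaded artifacts into {}".format(path)))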
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))
    self._strict_assert(isinstance(arguments.get('version'), str))

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    # make sure we store the package name in its normalized form
    arguments['name'] = normalize_package_name(ecosystem.backend.name, arguments['name'])

    if len(pattern_ignore.findall(arguments['version'])) > 0:
        self.log.info("Incorrect version alert {} {}".format(arguments['name'],
                                                             arguments['version']))
        raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))

    # don't try ingestion for private packages
    if is_pkg_public(arguments['ecosystem'], arguments['name']):
        self.log.info("Ingestion flow for {} {}".format(arguments['ecosystem'],
                                                        arguments['name']))
    else:
        self.log.info("Private package ingestion ignored {} {}".format(
            arguments['ecosystem'], arguments['name']))
        raise NotABugFatalTaskError("Private package alert {} {}".format(
            arguments['ecosystem'], arguments['name']))

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            arguments['analysis_already_exists'] = True
            self.log.debug("Arguments returned by InitAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)
    npm_dir = self.configuration.NPM_DATA_DIR

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path
            )
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.format(
                            n=arguments.get('name'),
                            v=arguments.get('version'),
                            err=str(e)
                        )
                    )

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)
        if arguments['ecosystem'] == "npm":
            shutil.rmtree(npm_dir, ignore_errors=True)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id

    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
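# Illustrative only: the tasks above call normalize_package_name() from the worker's
# utilities, whose exact behavior is not shown here. The stand-in below assumes PEP 503-style
# normalization for PyPI names (case-folded, runs of '-', '_' and '.' collapsed to '-') and
# leaves other ecosystems untouched; the real helper may differ per backend.
import re


def normalize_package_name_sketch(ecosystem_backend, name):
    """Hypothetical normalizer: PEP 503 rules for pypi, identity for other backends."""
    if ecosystem_backend == 'pypi':
        return re.sub(r'[-_.]+', '-', name).lower()
    return name


print(normalize_package_name_sketch('pypi', 'Flask_SQLAlchemy'))  # flask-sqlalchemy
print(normalize_package_name_sketch('npm', 'Left-Pad'))           # Left-Pad (unchanged)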