def upload_from_string(self, s):
    """Utility method for uploading this blob; not used by the
    GoogleCloudStorage backend, but used to pre-populate the GCS mock
    for testing"""
    self.updated = utcnow()
    self._content = s
    self.bucket._upload_blob(self)
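# For context: the mock above mirrors google-cloud-storage's Blob API. With
# the real client library, pre-seeding a bucket looks the same; a minimal
# sketch (the bucket name and path are placeholder values, not from this
# codebase):
def seed_gcs_bucket(client, bucket_name, path, data):
    """Upload `data` to gs://<bucket_name>/<path> with the real client"""
    bucket = client.bucket(bucket_name)
    # Blob.upload_from_string accepts bytes or str
    bucket.blob(path).upload_from_string(data)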
def _make_package(self, *args, **kwargs):
    """Wrapper around make_package"""
    # Some SQL dbs round the timestamps (looking at you MySQL >:|),
    # which is a problem if they round UP into the future, as our
    # calculations depend on the timestamps being monotonically
    # increasing.
    now = utcnow() - timedelta(seconds=1)
    kwargs.setdefault("last_modified", now)
    kwargs.setdefault("factory", SQLPackage)
    return make_package(*args, **kwargs)
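# Why the one-second rewind above matters: a minimal sketch of the failure
# mode, assuming a backend (e.g. a MySQL DATETIME column without fractional
# seconds) that rounds to the nearest second. mysql_round is a hypothetical
# model of that behavior, not code from this repo.
def mysql_round(dt):
    if dt.microsecond >= 500000:
        # Rounds UP into the future, so a freshly-stored utcnow() can
        # compare as *later* than a subsequent utcnow() call
        dt += timedelta(seconds=1)
    return dt.replace(microsecond=0)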
def list_storage(factory):
    """Mocked method for listing storage packages"""
    # The first time we list from storage, concurrently "upload" pkgs[2]
    if len(return_values) == 2:
        nowish = utcnow() + timedelta(seconds=1)
        pkg = self._make_package("mypkg3", last_modified=nowish)
        pkgs.append(pkg)
        self.db.save(pkg)
    return return_values.pop(0)()
def test_add_missing_more_recent(self):
    """If we sync a more recent package, update the summary"""
    pkgs = [
        self._make_package(last_modified=utcnow() - timedelta(hours=1)),
        self._make_package(version="1.5"),
    ]
    self.db.save(pkgs[0])
    self.storage.list.return_value = pkgs
    self.db.reload_from_storage()
    all_pkgs = self.sql.query(SQLPackage).all()
    self.assertCountEqual(all_pkgs, pkgs)
def make_package(
    name="mypkg",
    version="1.1",
    filename=None,
    last_modified=None,
    summary="summary",
    factory=Package,
    **kwargs
):
    """Convenience method for constructing a package"""
    filename = filename or "%s-%s.tar.gz" % (name, version)
    return factory(
        name, version, filename, last_modified or utcnow(), summary, **kwargs
    )
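# Example usage of the factory above, as a runnable sketch (the names and
# versions here are arbitrary illustration values):
def _example_make_packages():
    pkg_now = make_package()  # mypkg-1.1.tar.gz, last_modified=utcnow()
    pkg_old = make_package(
        "otherpkg", version="0.9", last_modified=utcnow() - timedelta(days=1)
    )
    return pkg_now, pkg_old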
def _generate_url(self, package):
    """Get the fully-qualified CloudFront URL for a package"""
    path = self.get_path(package)
    url = self.domain + "/" + quote(path)
    # No key id, no signer, so we don't have to sign the URL
    if self.cf_signer is None:
        return url
    # To sign with a canned policy:
    expires = utcnow() + timedelta(seconds=self.expire_after)
    return self.cf_signer.generate_presigned_url(url, date_less_than=expires)
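# For reference: a sketch of how a signer like self.cf_signer can be built.
# This follows the standard botocore CloudFrontSigner pattern rather than
# this codebase's own setup code; key_id and key_path are placeholders.
from botocore.signers import CloudFrontSigner
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import padding

def make_cf_signer(key_id, key_path):
    with open(key_path, "rb") as f:
        private_key = serialization.load_pem_private_key(f.read(), password=None)

    def rsa_signer(message):
        # CloudFront canned policies are signed with RSA + SHA-1
        return private_key.sign(message, padding.PKCS1v15(), hashes.SHA1())

    return CloudFrontSigner(key_id, rsa_signer)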
def test_add_missing_more_recent(self):
    """If we sync a more recent package, update the summary"""
    pkgs = [
        make_package(last_modified=utcnow() - timedelta(hours=1)),
        make_package(version="1.5"),
    ]
    self.db.save(pkgs[0])
    self.storage.list.return_value = pkgs
    self.db.reload_from_storage()
    all_pkgs = self.db._load_all_packages()
    self.assertCountEqual(all_pkgs, pkgs)
    summaries = self.db.summary()
    self.assertEqual(len(summaries), 1)
    summary = summaries[0]
    self.assertEqual(summary["last_modified"].hour, pkgs[1].last_modified.hour)
def test_add_missing_more_recent(self):
    """If we sync a more recent package, update the summary"""
    pkgs = [
        make_package(
            last_modified=utcnow() - timedelta(hours=1),
            factory=DynamoPackage,
        ),
        make_package(version="1.5", factory=DynamoPackage),
    ]
    self.db.save(pkgs[0])
    self.storage.list.return_value = pkgs
    self.db.reload_from_storage()
    all_pkgs = self.engine.scan(DynamoPackage).all()
    self.assertCountEqual(all_pkgs, pkgs)
    summaries = self.db.summary()
    self.assertEqual(len(summaries), 1)
    summary = summaries[0]
    self.assertEqual(summary["last_modified"], pkgs[1].last_modified)
def reload_from_storage(self, clear=True):
    if not self.graceful_reload:
        return super(SQLCache, self).reload_from_storage(clear)
    LOG.info("Rebuilding cache from storage")
    # Log start time
    start = utcnow()
    # Fetch packages from storage s1
    s1 = set(self.storage.list(SQLPackage))
    # Fetch cache packages c1
    c1 = set(self.db.query(SQLPackage).all())

    # Add missing packages to cache (s1 - c1)
    missing = s1 - c1
    if missing:
        LOG.info("Adding %d missing packages to cache", len(missing))
        for pkg in missing:
            self.db.merge(pkg)

    # Delete extra packages from cache (c1 - s1) when last_modified <= start.
    # The time filter helps us avoid deleting packages that were
    # concurrently uploaded.
    extra1 = [p for p in (c1 - s1) if p.last_modified <= start]
    if extra1:
        LOG.info("Removing %d extra packages from cache", len(extra1))
        for pkg in extra1:
            self.db.query(SQLPackage).filter(
                SQLPackage.filename == pkg.filename
            ).delete(synchronize_session=False)

    # If any packages were concurrently deleted during the cache rebuild,
    # we can detect them by polling storage again and looking for any
    # packages that were present in s1 and are missing from s2
    s2 = set(self.storage.list(SQLPackage))
    # Delete extra packages from cache (s1 - s2)
    extra2 = s1 - s2
    if extra2:
        LOG.info(
            "Removing %d packages from cache that were concurrently "
            "deleted during rebuild",
            len(extra2),
        )
        for pkg in extra2:
            self.db.query(SQLPackage).filter(
                SQLPackage.filename == pkg.filename
            ).delete(synchronize_session=False)
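# The graceful-reload set arithmetic above, distilled for illustration
# (graceful_diff is a hypothetical helper, assuming hashable package objects
# with a last_modified attribute):
#
#   missing = s1 - c1  -> in storage but not cache: add to cache
#   extra1  = c1 - s1  -> in cache but not storage: delete, unless uploaded
#                         after the rebuild started
#   extra2  = s1 - s2  -> deleted from storage mid-rebuild: delete from cache
def graceful_diff(s1, c1, s2, start):
    missing = s1 - c1
    extra1 = {p for p in (c1 - s1) if p.last_modified <= start}
    extra2 = s1 - s2
    return missing, extra1, extra2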
import json
import os
import posixpath

import boto3

def handle_s3_event(event, context):
    """Handle S3 object notification"""
    from pypicloud.cache import get_cache_impl
    from pypicloud.dateutil import utcnow
    from pypicloud.storage.s3 import S3Storage
    from pypicloud.util import parse_filename

    settings = json.loads(os.environ["PYPICLOUD_SETTINGS"])
    # Set 'file' storage as a hack. We're going to load the cache, which will
    # load a storage. We won't actually be using the storage for anything,
    # but the settings have to be present.
    settings.setdefault("pypi.storage", "file")
    settings.setdefault("storage.dir", "/tmp")
    cache_impl = get_cache_impl(settings)
    kwargs = cache_impl.configure(settings)
    cache = cache_impl(**kwargs)
    s3 = boto3.resource("s3")
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        key = record["s3"]["object"]["key"]
        event_name = record["eventName"]
        if event_name.startswith("ObjectCreated"):
            print("S3 object %r created" % key)
            obj = s3.Object(bucket, key)
            package = S3Storage.package_from_object(obj, cache.new_package)
            existing_pkg = cache.fetch(package.filename)
            if existing_pkg is None:
                print("Saving package %s" % package)
                cache.save(package)
            else:
                print("Package already cached")
        else:
            print("S3 object %r deleted" % key)
            filename = posixpath.basename(key)
            try:
                name, version = parse_filename(filename)
            except ValueError:
                name = version = "dummy"
            package = cache.new_package(name, version, filename, utcnow(), "")
            print("Deleting package %s" % package)
            cache.clear(package)
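# A hand-rolled notification event for exercising handle_s3_event locally.
# The shape follows AWS's documented S3 event structure; the bucket and key
# are made-up values, and PYPICLOUD_SETTINGS must be set in the environment
# for the handler to load its cache.
if __name__ == "__main__":
    sample_event = {
        "Records": [
            {
                "eventName": "ObjectCreated:Put",
                "s3": {
                    "bucket": {"name": "my-pypicloud-bucket"},
                    "object": {"key": "packages/mypkg-1.1.tar.gz"},
                },
            }
        ]
    }
    handle_s3_event(sample_event, context=None)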
def _generate_url(self, package: Package) -> str:
    path = self.get_path(package)
    url_params = generate_blob_sas(
        account_name=self.storage_account_name,
        container_name=self.storage_container_name,
        blob_name=path,
        account_key=self.storage_account_key,
        permission=BlobSasPermissions(read=True),
        expiry=utcnow() + timedelta(seconds=self.expire_after),
        protocol="https",
    )
    url = "{}/{}/{}?{}".format(
        self.azure_storage_account_url,
        self.storage_container_name,
        path,
        url_params,
    )
    return url
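# For illustration: the SAS URL returned above can be consumed directly by a
# client, e.g. with azure-storage-blob (url is whatever _generate_url
# produced; this helper is a sketch, not part of the backend):
from azure.storage.blob import BlobClient

def download_with_sas(url: str) -> bytes:
    blob = BlobClient.from_blob_url(url)
    return blob.download_blob().readall()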
def reload_from_storage(self, clear=True):
    if not self.graceful_reload:
        return super(DynamoCache, self).reload_from_storage(clear)
    LOG.info("Rebuilding cache from storage")
    # Log start time
    start = utcnow()
    # Fetch packages from storage s1
    s1 = set(self.storage.list(self.new_package))
    # Fetch cache packages c1
    c1 = set(self.engine.scan(DynamoPackage))

    # Add missing packages to cache (s1 - c1)
    missing = s1 - c1
    if missing:
        LOG.info("Adding %d missing packages to cache", len(missing))
        self.engine.save(missing)

    # Delete extra packages from cache (c1 - s1) when last_modified < start.
    # The time filter helps us avoid deleting packages that were
    # concurrently uploaded.
    extra1 = [p for p in (c1 - s1) if p.last_modified < start]
    if extra1:
        LOG.info("Removing %d extra packages from cache", len(extra1))
        self.engine.delete(extra1)

    # If any packages were concurrently deleted during the cache rebuild,
    # we can detect them by polling storage again and looking for any
    # packages that were present in s1 and are missing from s2
    s2 = set(self.storage.list(self.new_package))
    # Delete extra packages from cache (s1 - s2)
    extra2 = s1 - s2
    if extra2:
        LOG.info(
            "Removing %d packages from cache that were concurrently "
            "deleted during rebuild",
            len(extra2),
        )
        self.engine.delete(extra2)

    # Remove these concurrently-deleted files from the list of packages
    # that were missing from the cache. Don't want to use those to
    # update the summaries below.
    missing -= extra2

    # Update the PackageSummary for added packages
    packages_by_name = defaultdict(list)
    for package in missing:
        # Set the tz here so we can compare against the PackageSummary
        package.last_modified = package.last_modified.replace(tzinfo=UTC)
        packages_by_name[package.name].append(package)

    summaries = self.engine.get(PackageSummary, packages_by_name.keys())
    summaries_by_name = {}
    for summary in summaries:
        summaries_by_name[summary.name] = summary
    for name, packages in packages_by_name.items():
        if name in summaries_by_name:
            summary = summaries_by_name[name]
        else:
            summary = PackageSummary(packages[0])
            summaries.append(summary)
        for package in packages:
            if package.last_modified > summary.last_modified:
                summary.last_modified = package.last_modified
                summary.summary = package.summary
    if summaries:
        LOG.info("Updating %d package summaries", len(summaries))
        self.engine.save(summaries, overwrite=True)

    # Remove the PackageSummary for deleted packages
    removed = set()
    for package in extra1:
        removed.add(package.name)
    for package in extra2:
        removed.add(package.name)
    for name in removed:
        self._maybe_delete_summary(name)
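# _maybe_delete_summary is called above but not shown here. A plausible
# sketch, assuming flywheel's query API (an assumption, not the actual
# implementation): drop the PackageSummary once no packages with that name
# remain in the cache.
def _maybe_delete_summary(self, name):
    remaining = self.engine.query(DynamoPackage).filter(name=name).count()
    if remaining == 0:
        self.engine.delete_key(PackageSummary, name=name)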
def reload_from_storage(self, clear=True):
    if not self.graceful_reload:
        if clear:
            self.clear_all()
        packages = self.storage.list(self.new_package)
        pipe = self.db.pipeline()
        for pkg in packages:
            self.save(pkg, pipe=pipe)
        pipe.execute()
        return
    LOG.info("Rebuilding cache from storage")
    # Log start time
    start = utcnow()
    # Fetch packages from storage s1
    s1 = set(self.storage.list(self.new_package))
    # Fetch cache packages c1
    c1 = set(self._load_all_packages())

    # Add missing packages to cache (s1 - c1)
    missing = s1 - c1
    if missing:
        LOG.info("Adding %d missing packages to cache", len(missing))
        pipe = self.db.pipeline()
        for package in missing:
            self.save(package, pipe, save_summary=False)
        pipe.execute()

    # Delete extra packages from cache (c1 - s1) when last_modified < start.
    # The time filter helps us avoid deleting packages that were
    # concurrently uploaded.
    extra1 = [p for p in (c1 - s1) if p.last_modified < start]
    if extra1:
        LOG.info("Removing %d extra packages from cache", len(extra1))
        pipe = self.db.pipeline()
        for package in extra1:
            self._delete_package(package, pipe)
        pipe.execute()

    # If any packages were concurrently deleted during the cache rebuild,
    # we can detect them by polling storage again and looking for any
    # packages that were present in s1 and are missing from s2
    s2 = set(self.storage.list(self.new_package))
    # Delete extra packages from cache (s1 - s2)
    extra2 = s1 - s2
    if extra2:
        LOG.info(
            "Removing %d packages from cache that were concurrently "
            "deleted during rebuild",
            len(extra2),
        )
        pipe = self.db.pipeline()
        for package in extra2:
            self._delete_package(package, pipe)
        pipe.execute()

    # Remove these concurrently-deleted files from the list of packages
    # that were missing from the cache. Don't want to use those to
    # update the summaries below.
    missing -= extra2

    # Update the summary for added packages
    packages_by_name = defaultdict(list)
    for package in missing:
        packages_by_name[package.name].append(package)

    summaries = self._load_summaries(packages_by_name.keys())
    summaries_by_name = {}
    for summary in summaries:
        summaries_by_name[summary["name"]] = summary
    for name, packages in packages_by_name.items():
        if name in summaries_by_name:
            summary = summaries_by_name[name]
        else:
            summary = summary_from_package(packages[0])
            summaries.append(summary)
        for package in packages:
            if package.last_modified > summary["last_modified"]:
                summary["last_modified"] = package.last_modified
                summary["summary"] = package.summary
    if summaries:
        LOG.info("Updating %d package summaries", len(summaries))
        pipe = self.db.pipeline()
        for summary in summaries:
            self._save_summary(summary, pipe)
        pipe.execute()

    # Remove the summary for deleted packages
    removed = set()
    for package in extra1:
        removed.add(package.name)
    for package in extra2:
        removed.add(package.name)
    if removed:
        pipe = self.db.pipeline()
        for name in removed:
            pipe.scard(self.redis_filename_set(name))
        counts = pipe.execute()
        pipe = self.db.pipeline()
        for name, count in zip(removed, counts):
            if count == 0:
                self._delete_summary(name, pipe)
        pipe.execute()