# BlockBlobService comes from the legacy azure-storage SDK (<= 2.x); the
# StorageContainer base class and the config classes are project-local imports.
from azure.storage.blob import BlockBlobService


class AzureStorageContainer(StorageContainer):
    """Azure implementation of StorageContainer using BlockBlobService."""

    def __init__(self, container_name, config: AzureStorageConfig):
        self._blob_service = None
        self._container_name = container_name
        self._config = config

    def _get_client(self):
        """Lazily create the blob client, ensuring the container exists.

        :return: BlockBlobService initialized with account name and key from config
        """
        if self._blob_service is None:
            self._blob_service = BlockBlobService(
                account_name=self._config.account_name,
                account_key=self._config.account_key)
            self._blob_service.create_container(self._container_name)
        return self._blob_service

    def upload_text(self, blob_name, text):
        """Upload text to a new blob.

        :param blob_name: Name to give the new blob
        :param text: Text to upload
        :return: None
        """
        self._get_client().create_blob_from_text(self._container_name, blob_name, text)

    def list_blobs(self):
        """List all blobs in the container.

        :return: List of blobs in the container
        """
        return self._get_client().list_blobs(self._container_name)

    def get_blob_to_text(self, file_name):
        """Get the contents of a blob as text.

        :param file_name: Name of the blob file
        :return: Text from the blob file
        """
        return self._get_client().get_blob_to_text(self._container_name, file_name)

    @staticmethod
    def create():
        """Initialize an AzureStorageContainer with name and credentials from config.

        :return: AzureStorageContainer
        """
        return AzureStorageContainer(ProcessConfig().config_container_name,
                                     AzureConfig().storage_config)
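# A minimal usage sketch (not part of the original code). It demonstrates the
# lazy initialization above: no network I/O happens until the first blob
# operation. AzureStorageConfig is project-local, so a namedtuple with the two
# attributes the class actually reads stands in for it here; the account name
# and key are hypothetical placeholders.
from collections import namedtuple

if __name__ == '__main__':
    _Config = namedtuple('_Config', ('account_name', 'account_key'))
    config = _Config(account_name='mystorageacct', account_key='<storage-key>')
    container = AzureStorageContainer('experiment-logs', config)
    container.upload_text('run1.txt', 'loss=0.42')  # first call creates the container
    print([blob.name for blob in container.list_blobs()])
    # get_blob_to_text returns the SDK's Blob object; its .content holds the text
    print(container.get_blob_to_text('run1.txt').content)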
import os

from azure.storage.blob import BlockBlobService, PublicAccess


def block_blob_service(self):
    ACCOUNT_NAME = os.environ['ACCOUNT_NAME']
    ACCOUNT_KEY = os.environ['ACCOUNT_KEY']
    block_blob_service = BlockBlobService(account_name=ACCOUNT_NAME,
                                          account_key=ACCOUNT_KEY)
    block_blob_service.create_container(self.container_name)
    block_blob_service.set_container_acl(self.container_name,
                                         public_access=PublicAccess.Container)
    return block_blob_service
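# Usage sketch for the factory above (an assumption-laden example, not part of
# the original code). The method only reads self.container_name, so any object
# with that attribute works as a stand-in owner here. Note the design choice:
# PublicAccess.Container makes every blob in the container anonymously readable
# at https://<account>.blob.core.windows.net/<container>/<blob>, so this setup
# is only suitable for data that can be public.
from types import SimpleNamespace

owner = SimpleNamespace(container_name='public-results')  # hypothetical name
service = block_blob_service(owner)  # requires ACCOUNT_NAME/ACCOUNT_KEY in the env
service.create_blob_from_text('public-results', 'hello.txt', 'hello world')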
class AzureStorageContainer(Common.Contracts.StorageContainer):
    def __init__(self, container_name, config: AzureStorageConfig):
        self._container_name = container_name
        self._blob_service = BlockBlobService(
            account_name=config.account_name,
            account_key=config.account_key)
        self._blob_service.create_container(self._container_name)

    def upload_text(self, blob_name, text):
        self._blob_service.create_blob_from_text(self._container_name, blob_name, text)

    def list_blobs(self):
        return self._blob_service.list_blobs(self._container_name)

    def get_blob_to_text(self, file_name):
        return self._blob_service.get_blob_to_text(self._container_name, file_name)
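# Design note: unlike the lazily-initialized variant above, this version
# connects and creates the container eagerly in __init__. It fails fast on bad
# credentials, at the cost of a network round trip even when the container is
# never used; the per-call API is otherwise identical, e.g.:
#
#   container = AzureStorageContainer('experiment-logs', config)
#   container.upload_text('run1.txt', 'loss=0.42')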
class AzureJobStore(AbstractJobStore):
    """
    A job store that uses Azure's blob store for file storage and Table Service to
    store job info with strong consistency.
    """

    # Dots in container names should be avoided because container names are used in
    # HTTPS bucket URLs where they may interfere with the certificate common name. We
    # use the alphanumeric nameSeparator defined below instead.
    #
    containerNameRe = re.compile(r'^[a-z0-9][a-z0-9-]+[a-z0-9]$')

    # See https://msdn.microsoft.com/en-us/library/azure/dd135715.aspx
    #
    minContainerNameLen = 3
    maxContainerNameLen = 63
    maxNameLen = 10
    nameSeparator = 'xx'  # Table names must be alphanumeric

    # Length of a jobID - used to test whether a stats file has been read already
    jobIDLength = len(str(uuid.uuid4()))

    def __init__(self, locator, jobChunkSize=maxAzureTablePropertySize):
        super(AzureJobStore, self).__init__()
        accountName, namePrefix = locator.split(':', 1)
        if '--' in namePrefix:
            raise ValueError("Invalid name prefix '%s'. Consecutive hyphens are not "
                             "permitted in Azure container names." % namePrefix)
        if not self.containerNameRe.match(namePrefix):
            raise ValueError("Invalid name prefix '%s'. Name prefixes must contain only "
                             "digits, hyphens or lower-case letters and must not start "
                             "or end in a hyphen." % namePrefix)
        # Reserve room for the name separator and the longest name suffix
        maxPrefixLen = self.maxContainerNameLen - self.maxNameLen - len(self.nameSeparator)
        if len(namePrefix) > maxPrefixLen:
            raise ValueError("Invalid name prefix '%s'. Name prefixes may not be longer "
                             "than %i characters." % (namePrefix, maxPrefixLen))
        self.locator = locator
        self.jobChunkSize = jobChunkSize
        self.accountKey = _fetchAzureAccountKey(accountName)
        self.accountName = accountName
        # Table names have strict requirements in Azure
        self.namePrefix = self._sanitizeTableName(namePrefix)
        # These are the main API entry points.
        self.tableService = TableService(account_key=self.accountKey,
                                         account_name=accountName)
        self.blobService = BlockBlobService(account_key=self.accountKey,
                                            account_name=accountName)
        # Serialized jobs table
        self.jobItems = None
        # Job<->file mapping table
        self.jobFileIDs = None
        # Container for all shared and unshared files
        self.files = None
        # Stats and logging strings
        self.statsFiles = None
        # File IDs that contain stats and logging strings
        self.statsFileIDs = None

    @property
    def keyPath(self):
        return self.config.cseKey

    def initialize(self, config):
        if self._jobStoreExists():
            raise JobStoreExistsException(self.locator)
        logger.debug("Creating job store at '%s'" % self.locator)
        self._bind(create=True)
        super(AzureJobStore, self).initialize(config)

    def resume(self):
        if not self._jobStoreExists():
            raise NoSuchJobStoreException(self.locator)
        logger.debug("Using existing job store at '%s'" % self.locator)
        self._bind(create=False)
        super(AzureJobStore, self).resume()

    def destroy(self):
        self._bind()
        for name in 'jobItems', 'jobFileIDs', 'files', 'statsFiles', 'statsFileIDs':
            resource = getattr(self, name)
            if resource is not None:
                if isinstance(resource, AzureTable):
                    resource.delete_table()
                elif isinstance(resource, AzureBlobContainer):
                    resource.delete_container()
                else:
                    assert False
                setattr(self, name, None)

    def _jobStoreExists(self):
        """
        Checks if the job store exists by querying the existence of the statsFileIDs
        table. Note that this is the last component that is deleted in :meth:`.destroy`.
""" for attempt in retry_azure(): with attempt: try: exists = self.tableService.exists(table_name=self._qualify('statsFileIDs')) except AzureMissingResourceHttpError as e: if e.status_code == 404: return False else: raise else: return exists def _bind(self, create=False): table = self._bindTable container = self._bindContainer for name, binder in (('jobItems', table), ('jobFileIDs', table), ('files', container), ('statsFiles', container), ('statsFileIDs', table)): if getattr(self, name) is None: setattr(self, name, binder(self._qualify(name), create=create)) def _qualify(self, name): return self.namePrefix + self.nameSeparator + name.lower() def jobs(self): # How many jobs have we done? total_processed = 0 for jobEntity in self.jobItems.query_entities(): # Process the items in the page yield AzureJob.fromEntity(jobEntity) total_processed += 1 if total_processed % 1000 == 0: # Produce some feedback for the user, because this can take # a long time on, for example, Azure logger.debug("Processed %d total jobs" % total_processed) logger.debug("Processed %d total jobs" % total_processed) def create(self, jobNode): jobStoreID = self._newJobID() job = AzureJob.fromJobNode(jobNode, jobStoreID, self._defaultTryCount()) entity = job.toEntity(chunkSize=self.jobChunkSize) self.jobItems.insert_entity(entity=entity) return job def exists(self, jobStoreID): if self.jobItems.get_entity(row_key=str(jobStoreID)) is None: return False return True def load(self, jobStoreID): jobEntity = self.jobItems.get_entity(row_key=str(jobStoreID)) if jobEntity is None: raise NoSuchJobException(jobStoreID) return AzureJob.fromEntity(jobEntity) def update(self, job): self.jobItems.update_entity(entity=job.toEntity(chunkSize=self.jobChunkSize)) def delete(self, jobStoreID): try: self.jobItems.delete_entity(row_key=str(jobStoreID)) except AzureMissingResourceHttpError: # Job deletion is idempotent, and this job has been deleted already return filterString = "PartitionKey eq '%s'" % jobStoreID for fileEntity in self.jobFileIDs.query_entities(filter=filterString): jobStoreFileID = fileEntity.RowKey self.deleteFile(jobStoreFileID) def getEnv(self): return dict(AZURE_ACCOUNT_KEY=self.accountKey) class BlobInfo(namedtuple('BlobInfo', ('account', 'container', 'name'))): @property @memoize def service(self): return BlockBlobService(account_name=self.account, account_key=_fetchAzureAccountKey(self.account)) @classmethod def getSize(cls, url): blob = cls._parseWasbUrl(url) blob = blob.service.get_blob_properties(blob.container, blob.name) return blob.properties.content_length @classmethod def _readFromUrl(cls, url, writable): blob = cls._parseWasbUrl(url) for attempt in retry_azure(): with attempt: blob.service.get_blob_to_stream(container_name=blob.container, blob_name=blob.name, stream=writable) @classmethod def _writeToUrl(cls, readable, url): blob = cls._parseWasbUrl(url) blob.service.create_blob_from_stream(container_name=blob.container, blob_name=blob.name, max_connections=1, stream=readable) @classmethod def _parseWasbUrl(cls, url): """ :param urlparse.ParseResult url: x :rtype: AzureJobStore.BlobInfo """ assert url.scheme in ('wasb', 'wasbs') try: container, account = url.netloc.split('@') except ValueError: raise InvalidImportExportUrlException(url) suffix = '.blob.core.windows.net' if account.endswith(suffix): account = account[:-len(suffix)] else: raise InvalidImportExportUrlException(url) assert url.path[0] == '/' return cls.BlobInfo(account=account, container=container, name=url.path[1:]) @classmethod def 
    def _supportsUrl(cls, url, export=False):
        return url.scheme.lower() in ('wasb', 'wasbs')

    def writeFile(self, localFilePath, jobStoreID=None):
        jobStoreFileID = self._newFileID()
        self.updateFile(jobStoreFileID, localFilePath)
        self._associateFileWithJob(jobStoreFileID, jobStoreID)
        return jobStoreFileID

    def updateFile(self, jobStoreFileID, localFilePath):
        with open(localFilePath, 'rb') as read_fd:
            with self._uploadStream(jobStoreFileID, self.files) as write_fd:
                while True:
                    buf = read_fd.read(self._maxAzureBlockBytes)
                    write_fd.write(buf)
                    if len(buf) == 0:
                        break

    def readFile(self, jobStoreFileID, localFilePath, symlink=False):
        try:
            with self._downloadStream(jobStoreFileID, self.files) as read_fd:
                with open(localFilePath, 'wb') as write_fd:
                    while True:
                        buf = read_fd.read(self._maxAzureBlockBytes)
                        write_fd.write(buf)
                        if not buf:
                            break
        except AzureMissingResourceHttpError:
            raise NoSuchFileException(jobStoreFileID)

    def deleteFile(self, jobStoreFileID):
        try:
            self.files.delete_blob(blob_name=str(jobStoreFileID))
            self._dissociateFileFromJob(jobStoreFileID)
        except AzureMissingResourceHttpError:
            pass

    def fileExists(self, jobStoreFileID):
        # As Azure doesn't have a blob_exists method (at least in the Python API),
        # we just try to download the blob's metadata, and hope the metadata is
        # small enough that the call will be fast.
        try:
            self.files.get_blob_metadata(blob_name=str(jobStoreFileID))
            return True
        except AzureMissingResourceHttpError:
            return False

    @contextmanager
    def writeFileStream(self, jobStoreID=None):
        # TODO: this (and all stream methods) should probably use the Append Blob
        # type, but that is not currently supported by the Azure Python API.
        jobStoreFileID = self._newFileID()
        with self._uploadStream(jobStoreFileID, self.files) as fd:
            yield fd, jobStoreFileID
        self._associateFileWithJob(jobStoreFileID, jobStoreID)

    @contextmanager
    def updateFileStream(self, jobStoreFileID):
        with self._uploadStream(jobStoreFileID, self.files, checkForModification=True) as fd:
            yield fd

    def getEmptyFileStoreID(self, jobStoreID=None):
        jobStoreFileID = self._newFileID()
        with self._uploadStream(jobStoreFileID, self.files) as _:
            pass
        self._associateFileWithJob(jobStoreFileID, jobStoreID)
        return jobStoreFileID

    @contextmanager
    def readFileStream(self, jobStoreFileID):
        if not self.fileExists(jobStoreFileID):
            raise NoSuchFileException(jobStoreFileID)
        with self._downloadStream(jobStoreFileID, self.files) as fd:
            yield fd

    @contextmanager
    def writeSharedFileStream(self, sharedFileName, isProtected=None):
        assert self._validateSharedFileName(sharedFileName)
        sharedFileID = self._newFileID(sharedFileName)
        with self._uploadStream(sharedFileID, self.files, encrypted=isProtected) as fd:
            yield fd

    @contextmanager
    def readSharedFileStream(self, sharedFileName):
        assert self._validateSharedFileName(sharedFileName)
        sharedFileID = self._newFileID(sharedFileName)
        if not self.fileExists(sharedFileID):
            raise NoSuchFileException(sharedFileID)
        with self._downloadStream(sharedFileID, self.files) as fd:
            yield fd

    def writeStatsAndLogging(self, statsAndLoggingString):
        # TODO: would be a great use case for append blobs, once implemented in the
        # Azure SDK
        jobStoreFileID = self._newFileID()
        encrypted = self.keyPath is not None
        if encrypted:
            statsAndLoggingString = encryption.encrypt(statsAndLoggingString, self.keyPath)
        self.statsFiles.create_blob_from_text(blob_name=str(jobStoreFileID),
                                              text=statsAndLoggingString,
                                              metadata=dict(encrypted=str(encrypted)))
        self.statsFileIDs.insert_entity(entity={'RowKey': jobStoreFileID})

    def readStatsAndLogging(self, callback,
                            readAll=False):
        suffix = '_old'
        numStatsFiles = 0
        for attempt in retry_azure():
            with attempt:
                for entity in self.statsFileIDs.query_entities():
                    jobStoreFileID = entity.RowKey
                    hasBeenRead = len(jobStoreFileID) > self.jobIDLength
                    if not hasBeenRead:
                        with self._downloadStream(jobStoreFileID, self.statsFiles) as fd:
                            callback(fd)
                        # Mark this entity as read by appending the suffix
                        self.statsFileIDs.insert_entity(entity={'RowKey': jobStoreFileID + suffix})
                        self.statsFileIDs.delete_entity(row_key=str(jobStoreFileID))
                        numStatsFiles += 1
                    elif readAll:
                        # Strip the suffix to get the original ID
                        jobStoreFileID = jobStoreFileID[:-len(suffix)]
                        with self._downloadStream(jobStoreFileID, self.statsFiles) as fd:
                            callback(fd)
                        numStatsFiles += 1
        return numStatsFiles

    _azureTimeFormat = "%Y-%m-%dT%H:%M:%SZ"

    def getPublicUrl(self, jobStoreFileID):
        try:
            self.files.get_blob_properties(blob_name=str(jobStoreFileID))
        except AzureMissingResourceHttpError:
            raise NoSuchFileException(jobStoreFileID)
        # Start the SAS window slightly in the past to tolerate clock skew between
        # us and the service.
        startTime = datetime.utcnow() - timedelta(minutes=5)
        endTime = datetime.utcnow() + self.publicUrlExpiration
        sas_token = self.files.generate_blob_shared_access_signature(
            blob_name=str(jobStoreFileID),
            permission=BlobPermissions.READ,
            start=startTime,
            expiry=endTime)
        return self.files.make_blob_url(blob_name=str(jobStoreFileID)) + '?' + sas_token

    def getSharedPublicUrl(self, sharedFileName):
        jobStoreFileID = self._newFileID(sharedFileName)
        return self.getPublicUrl(jobStoreFileID)

    def _newJobID(self):
        # Raw UUIDs don't work for Azure property names because the '-' character
        # is disallowed.
        return str(uuid.uuid4()).replace('-', '_')

    # A dummy job ID under which all shared files are stored.
    sharedFileJobID = uuid.UUID('891f7db6-e4d9-4221-a58e-ab6cc4395f94')

    def _newFileID(self, sharedFileName=None):
        if sharedFileName is None:
            ret = str(uuid.uuid4())
        else:
            ret = str(uuid.uuid5(self.sharedFileJobID, sharedFileName))
        return ret.replace('-', '_')

    def _associateFileWithJob(self, jobStoreFileID, jobStoreID=None):
        if jobStoreID is not None:
            self.jobFileIDs.insert_entity(
                entity={'PartitionKey': EntityProperty('Edm.String', jobStoreID),
                        'RowKey': EntityProperty('Edm.String', jobStoreFileID)})

    def _dissociateFileFromJob(self, jobStoreFileID):
        entities = list(self.jobFileIDs.query_entities(
            filter="RowKey eq '%s'" % jobStoreFileID))
        if entities:
            assert len(entities) == 1
            jobStoreID = entities[0].PartitionKey
            self.jobFileIDs.delete_entity(partition_key=str(jobStoreID),
                                          row_key=str(jobStoreFileID))

    def _bindTable(self, tableName, create=False):
        for attempt in retry_azure():
            with attempt:
                try:
                    exists = self.tableService.exists(table_name=tableName)
                except AzureMissingResourceHttpError as e:
                    if e.status_code != 404:
                        raise
                else:
                    if exists:
                        return AzureTable(self.tableService, tableName)
                if create:
                    self.tableService.create_table(tableName)
                    return AzureTable(self.tableService, tableName)
                else:
                    return None

    def _bindContainer(self, containerName, create=False):
        for attempt in retry_azure():
            with attempt:
                try:
                    self.blobService.get_container_properties(containerName)
                except AzureMissingResourceHttpError as e:
                    if e.status_code == 404:
                        if create:
                            self.blobService.create_container(containerName)
                        else:
                            return None
                    else:
                        raise
                return AzureBlobContainer(self.blobService, containerName)

    def _sanitizeTableName(self, tableName):
        """
        Azure table names must start with a letter and be alphanumeric. This will
        never cause a collision if uuids are used, but otherwise may not be safe.
""" return 'a' + ''.join([x for x in tableName if x.isalnum()]) # Maximum bytes that can be in any block of an Azure block blob # https://github.com/Azure/azure-storage-python/blob/4c7666e05a9556c10154508335738ee44d7cb104/azure/storage/blob/blobservice.py#L106 _maxAzureBlockBytes = 4 * 1024 * 1024 @contextmanager def _uploadStream(self, jobStoreFileID, container, checkForModification=False, encrypted=None): """ :param encrypted: True to enforce encryption (will raise exception unless key is set), False to prevent encryption or None to encrypt if key is set. """ if checkForModification: try: expectedVersion = container.get_blob_properties(blob_name=str(jobStoreFileID)).properties.etag except AzureMissingResourceHttpError: expectedVersion = None if encrypted is None: encrypted = self.keyPath is not None elif encrypted: if self.keyPath is None: raise RuntimeError('Encryption requested but no key was provided') maxBlockSize = self._maxAzureBlockBytes if encrypted: # There is a small overhead for encrypted data. maxBlockSize -= encryption.overhead store = self class UploadPipe(WritablePipe): def readFrom(self, readable): blocks = [] try: while True: buf = readable.read(maxBlockSize) if len(buf) == 0: # We're safe to break here even if we never read anything, since # putting an empty block list creates an empty blob. break if encrypted: buf = encryption.encrypt(buf, store.keyPath) blockID = store._newFileID() container.put_block(blob_name=str(jobStoreFileID), block=buf, block_id=blockID) blocks.append(BlobBlock(blockID)) except: with panic(log=logger): # This is guaranteed to delete any uncommitted blocks. container.delete_blob(blob_name=str(jobStoreFileID)) if checkForModification and expectedVersion is not None: # Acquire a (60-second) write lock, leaseID = container.acquire_blob_lease(blob_name=str(jobStoreFileID), lease_duration=60) # check for modification, blob = container.get_blob_properties(blob_name=str(jobStoreFileID)) if blob.properties.etag != expectedVersion: container.release_blob_lease(blob_name=str(jobStoreFileID), lease_id=leaseID) raise ConcurrentFileModificationException(jobStoreFileID) # commit the file, container.put_block_list(blob_name=str(jobStoreFileID), block_list=blocks, lease_id=leaseID, metadata=dict(encrypted=str(encrypted))) # then release the lock. container.release_blob_lease(blob_name=str(jobStoreFileID), lease_id=leaseID) else: # No need to check for modification, just blindly write over whatever # was there. container.put_block_list(blob_name=str(jobStoreFileID), block_list=blocks, metadata=dict(encrypted=str(encrypted))) with UploadPipe() as writable: yield writable @contextmanager def _downloadStream(self, jobStoreFileID, container): # The reason this is not in the writer is so we catch non-existant blobs early blob = container.get_blob_properties(blob_name=str(jobStoreFileID)) encrypted = strict_bool(blob.metadata['encrypted']) if encrypted and self.keyPath is None: raise AssertionError('Content is encrypted but no key was provided.') outer_self = self class DownloadPipe(ReadablePipe): def writeTo(self, writable): chunkStart = 0 fileSize = blob.properties.content_length while chunkStart < fileSize: chunkEnd = chunkStart + outer_self._maxAzureBlockBytes - 1 buf = container.get_blob_to_bytes(blob_name=str(jobStoreFileID), start_range=chunkStart, end_range=chunkEnd).content if encrypted: buf = encryption.decrypt(buf, outer_self.keyPath) writable.write(buf) chunkStart = chunkEnd + 1 with DownloadPipe() as readable: yield readable