Example #1
0
class AzureStorageContainer(StorageContainer):
    """Azure implementation of Storage Container using BlockBlobService."""

    def __init__(self, container_name, config: AzureStorageConfig):
        """Store the container name and credentials; no connection is made.

        :param container_name: Name of the blob container to operate on
        :param config: Object providing ``account_name`` and ``account_key``
        """
        self._blob_service = None
        self._container_name = container_name
        self._config = config

    def _get_client(self):
        """Return the cached BlockBlobService, building it on first use.

        On the first call the container is created as well. The client is
        cached only after ``create_container`` returns, so a failed first
        attempt is retried on the next call instead of leaving behind a
        cached client whose container was never created.

        :return: BlockBlobService initialized with account
        name and key from config
        """
        if self._blob_service is None:
            client = BlockBlobService(
                account_name=self._config.account_name,
                account_key=self._config.account_key
            )
            client.create_container(self._container_name)
            self._blob_service = client

        return self._blob_service

    def upload_text(self, blob_name, text):
        """Uploads text to a new blob.

        :param blob_name: Name to give new blob
        :param text: Text to upload
        :return: None
        """
        self._get_client().create_blob_from_text(self._container_name,
                                                 blob_name, text)

    def list_blobs(self):
        """List all blobs in container.

        :return: Result of ``BlockBlobService.list_blobs`` for this container
        """
        return self._get_client().list_blobs(self._container_name)

    def get_blob_to_text(self, file_name):
        """Fetch a blob through ``BlockBlobService.get_blob_to_text``.

        :param file_name: Name of blob file
        :return: Whatever ``get_blob_to_text`` returns for that blob
        """
        return self._get_client().get_blob_to_text(self._container_name,
                                                   file_name)

    @staticmethod
    def create():
        """Initialize AzureStorageContainer with name and creds from config.

        :return: AzureStorageContainer built from
        ``ProcessConfig().config_container_name`` and
        ``AzureConfig().storage_config``
        """
        return AzureStorageContainer(
            ProcessConfig().config_container_name,
            AzureConfig().storage_config)

    def block_blob_service(self):
        """Build a BlockBlobService from environment credentials.

        Reads ``ACCOUNT_NAME`` and ``ACCOUNT_KEY`` from the environment,
        creates this container and sets its ACL to
        ``PublicAccess.Container``.

        :return: The newly built BlockBlobService
        :raises KeyError: If either environment variable is not set
        """
        account_name = os.environ['ACCOUNT_NAME']
        account_key = os.environ['ACCOUNT_KEY']

        service = BlockBlobService(account_name=account_name,
                                   account_key=account_key)

        # __init__ stores the name as ``_container_name``; the un-prefixed
        # attribute is never set by this class.
        service.create_container(self._container_name)

        service.set_container_acl(
            self._container_name, public_access=PublicAccess.Container)
        return service
Example #3
0
class AzureStorageContainer(Common.Contracts.StorageContainer):
    """Storage container backed by an Azure BlockBlobService."""

    def __init__(self, container_name, config: AzureStorageConfig):
        """Build the blob service client and create the container.

        :param container_name: Name of the blob container to operate on
        :param config: Object providing ``account_name`` and ``account_key``
        """
        self._container_name = container_name
        service = BlockBlobService(
            account_name=config.account_name,
            account_key=config.account_key
        )
        self._blob_service = service
        service.create_container(container_name)

    def upload_text(self, blob_name, text):
        """Create blob ``blob_name`` in the container from ``text``."""
        service, container = self._blob_service, self._container_name
        service.create_blob_from_text(container, blob_name, text)

    def list_blobs(self):
        """Return the service's blob listing for the container."""
        service, container = self._blob_service, self._container_name
        return service.list_blobs(container)

    def get_blob_to_text(self, file_name):
        """Return the service's ``get_blob_to_text`` result for ``file_name``."""
        service, container = self._blob_service, self._container_name
        return service.get_blob_to_text(container, file_name)
Example #4
0
class AzureJobStore(AbstractJobStore):
    """
    A job store that uses Azure's blob store for file storage and Table Service to store job info
    with strong consistency.
    """

    # Dots in container names should be avoided because container names are used in HTTPS bucket
    # URLs where they may interfere with the certificate common name. We use a double underscore
    # as a separator instead.
    # NOTE(review): the separator actually defined below (nameSeparator) is 'xx', not a double
    # underscore, so the sentence above looks stale -- confirm.
    #
    # Pattern the name prefix from the locator must match: lower-case letters, digits and
    # hyphens, neither starting nor ending with a hyphen.
    containerNameRe = re.compile(r'^[a-z0-9][a-z0-9-]+[a-z0-9]$')

    # See https://msdn.microsoft.com/en-us/library/azure/dd135715.aspx
    #
    minContainerNameLen = 3
    maxContainerNameLen = 63
    # Budget reserved for the resource-name suffix when checking the prefix length in __init__.
    maxNameLen = 10
    nameSeparator = 'xx'  # Table names must be alphanumeric
    # Length of a jobID - used to test if a stats file has been read already or not
    jobIDLength = len(str(uuid.uuid4()))

    def __init__(self, locator, jobChunkSize=maxAzureTablePropertySize):
        """
        Parse and validate the locator and create the Azure service clients.

        No table or container is bound here; the five resource attributes start out as None
        and are filled in later by :meth:`_bind`.

        :param str locator: string of the form '<accountName>:<namePrefix>'
        :param jobChunkSize: chunk size handed to AzureJob.toEntity() when serializing jobs
        :raise ValueError: if the name prefix contains '--', does not match
               :attr:`containerNameRe`, or is too long
        """
        super(AzureJobStore, self).__init__()
        accountName, namePrefix = locator.split(':', 1)
        forbidden = '--'
        if forbidden in namePrefix:
            # Report the sequence that was actually rejected, not the table-name separator.
            raise ValueError("Invalid name prefix '%s'. Name prefixes may not contain %s."
                             % (namePrefix, forbidden))
        if not self.containerNameRe.match(namePrefix):
            raise ValueError("Invalid name prefix '%s'. Name prefixes must contain only digits, "
                             "hyphens or lower-case letters and must not start or end in a "
                             "hyphen." % namePrefix)
        # Leave room for the separator and the resource-name suffix in the container name.
        maxPrefixLen = self.maxContainerNameLen - self.maxNameLen - len(self.nameSeparator)
        if len(namePrefix) > maxPrefixLen:
            raise ValueError("Invalid name prefix '%s'. Name prefixes may not be longer than %d "
                             "characters." % (namePrefix, maxPrefixLen))
        self.locator = locator
        self.jobChunkSize = jobChunkSize
        self.accountKey = _fetchAzureAccountKey(accountName)
        self.accountName = accountName
        # Table names have strict requirements in Azure
        self.namePrefix = self._sanitizeTableName(namePrefix)
        # These are the main API entry points.
        self.tableService = TableService(account_key=self.accountKey, account_name=accountName)
        self.blobService = BlockBlobService(account_key=self.accountKey, account_name=accountName)
        # Serialized jobs table
        self.jobItems = None
        # Job<->file mapping table
        self.jobFileIDs = None
        # Container for all shared and unshared files
        self.files = None
        # Stats and logging strings
        self.statsFiles = None
        # File IDs that contain stats and logging strings
        self.statsFileIDs = None

    @property
    def keyPath(self):
        return self.config.cseKey

    def initialize(self, config):
        """
        Create a brand-new job store: bind (creating) all resources, then defer to the base class.

        :raise JobStoreExistsException: if :meth:`_jobStoreExists` reports an existing store
        """
        alreadyPresent = self._jobStoreExists()
        if alreadyPresent:
            raise JobStoreExistsException(self.locator)
        message = "Creating job store at '%s'" % self.locator
        logger.debug(message)
        self._bind(create=True)
        super(AzureJobStore, self).initialize(config)

    def resume(self):
        """
        Attach to an existing job store: bind resources without creating, then defer to the base.

        :raise NoSuchJobStoreException: if :meth:`_jobStoreExists` reports no store
        """
        present = self._jobStoreExists()
        if not present:
            raise NoSuchJobStoreException(self.locator)
        message = "Using existing job store at '%s'" % self.locator
        logger.debug(message)
        self._bind(create=False)
        super(AzureJobStore, self).resume()

    def destroy(self):
        """
        Delete every table and container that can be bound, resetting each attribute to None.

        Resources are processed in a fixed order with 'statsFileIDs' last.
        """
        self._bind()
        resourceNames = ('jobItems', 'jobFileIDs', 'files', 'statsFiles', 'statsFileIDs')
        for attrName in resourceNames:
            bound = getattr(self, attrName)
            if bound is None:
                # Nothing was bound under this name, so there is nothing to delete.
                continue
            if isinstance(bound, AzureTable):
                bound.delete_table()
            elif isinstance(bound, AzureBlobContainer):
                bound.delete_container()
            else:
                assert False
            setattr(self, attrName, None)

    def _jobStoreExists(self):
        """
        Checks if job store exists by querying the existence of the statsFileIDs table. Note that
        this is the last component that is deleted in :meth:`.destroy`.

        A missing-resource error with status 404 counts as "does not exist"; any other status is
        re-raised.
        """
        for attempt in retry_azure():
            with attempt:
                try:
                    return self.tableService.exists(table_name=self._qualify('statsFileIDs'))
                except AzureMissingResourceHttpError as e:
                    if e.status_code != 404:
                        raise
                    return False

    def _bind(self, create=False):
        table = self._bindTable
        container = self._bindContainer
        for name, binder in (('jobItems', table),
                             ('jobFileIDs', table),
                             ('files', container),
                             ('statsFiles', container),
                             ('statsFileIDs', table)):
            if getattr(self, name) is None:
                setattr(self, name, binder(self._qualify(name), create=create))

    def _qualify(self, name):
        return self.namePrefix + self.nameSeparator + name.lower()

    def jobs(self):
        """
        Yield an AzureJob for every entity in the jobItems table.

        Progress is logged at debug level after every 1000 jobs and once more at the end.
        """
        progressMessage = "Processed %d total jobs"
        seen = 0

        for entity in self.jobItems.query_entities():
            yield AzureJob.fromEntity(entity)
            seen += 1

            if seen % 1000 == 0:
                # Give the user some feedback; enumerating a large table can take a long time.
                logger.debug(progressMessage % seen)

        logger.debug(progressMessage % seen)

    def create(self, jobNode):
        """
        Build an AzureJob from ``jobNode`` under a fresh job ID, insert it into the jobItems
        table and return it.
        """
        newID = self._newJobID()
        job = AzureJob.fromJobNode(jobNode, newID, self._defaultTryCount())
        self.jobItems.insert_entity(entity=job.toEntity(chunkSize=self.jobChunkSize))
        return job

    def exists(self, jobStoreID):
        if self.jobItems.get_entity(row_key=str(jobStoreID)) is None:
            return False
        return True

    def load(self, jobStoreID):
        """
        Return the AzureJob stored under ``jobStoreID``.

        :raise NoSuchJobException: if the jobItems table has no entity for that ID
        """
        entity = self.jobItems.get_entity(row_key=str(jobStoreID))
        if entity is not None:
            return AzureJob.fromEntity(entity)
        raise NoSuchJobException(jobStoreID)

    def update(self, job):
        self.jobItems.update_entity(entity=job.toEntity(chunkSize=self.jobChunkSize))

    def delete(self, jobStoreID):
        try:
            self.jobItems.delete_entity(row_key=str(jobStoreID))
        except AzureMissingResourceHttpError:
            # Job deletion is idempotent, and this job has been deleted already
            return
        filterString = "PartitionKey eq '%s'" % jobStoreID
        for fileEntity in self.jobFileIDs.query_entities(filter=filterString):
            jobStoreFileID = fileEntity.RowKey
            self.deleteFile(jobStoreFileID)

    def getEnv(self):
        return dict(AZURE_ACCOUNT_KEY=self.accountKey)

    class BlobInfo(namedtuple('BlobInfo', ('account', 'container', 'name'))):
        """
        Location of a blob (account, container, blob name) plus a memoized service client.
        """
        @property
        @memoize
        def service(self):
            accountKey = _fetchAzureAccountKey(self.account)
            return BlockBlobService(account_name=self.account, account_key=accountKey)

    @classmethod
    def getSize(cls, url):
        blob = cls._parseWasbUrl(url)
        blob = blob.service.get_blob_properties(blob.container, blob.name)
        return blob.properties.content_length

    @classmethod
    def _readFromUrl(cls, url, writable):
        """
        Download the blob that ``url`` names into the ``writable`` stream, under retry_azure().
        """
        info = cls._parseWasbUrl(url)
        request = dict(container_name=info.container,
                       blob_name=info.name,
                       stream=writable)
        for attempt in retry_azure():
            with attempt:
                info.service.get_blob_to_stream(**request)

    @classmethod
    def _writeToUrl(cls, readable, url):
        blob = cls._parseWasbUrl(url)
        blob.service.create_blob_from_stream(container_name=blob.container,
                                             blob_name=blob.name,
                                             max_connections=1,
                                             stream=readable)

    @classmethod
    def _parseWasbUrl(cls, url):
        """
        :param urlparse.ParseResult url: x
        :rtype: AzureJobStore.BlobInfo
        """
        assert url.scheme in ('wasb', 'wasbs')
        try:
            container, account = url.netloc.split('@')
        except ValueError:
            raise InvalidImportExportUrlException(url)
        suffix = '.blob.core.windows.net'
        if account.endswith(suffix):
            account = account[:-len(suffix)]
        else:
            raise InvalidImportExportUrlException(url)
        assert url.path[0] == '/'
        return cls.BlobInfo(account=account, container=container, name=url.path[1:])

    @classmethod
    def _supportsUrl(cls, url, export=False):
        return url.scheme.lower() in ('wasb', 'wasbs')

    def writeFile(self, localFilePath, jobStoreID=None):
        jobStoreFileID = self._newFileID()
        self.updateFile(jobStoreFileID, localFilePath)
        self._associateFileWithJob(jobStoreFileID, jobStoreID)
        return jobStoreFileID

    def updateFile(self, jobStoreFileID, localFilePath):
        """
        Copy the local file into the upload stream for ``jobStoreFileID`` in block-sized chunks.
        """
        with open(localFilePath, 'rb') as source, \
                self._uploadStream(jobStoreFileID, self.files) as sink:
            while True:
                chunk = source.read(self._maxAzureBlockBytes)
                # The final, empty chunk is written too before the loop ends.
                sink.write(chunk)
                if not chunk:
                    break

    def readFile(self, jobStoreFileID, localFilePath, symlink=False):
        """
        Copy the stored file into ``localFilePath`` in block-sized chunks; ``symlink`` is unused.

        :raise NoSuchFileException: if Azure reports the blob as missing
        """
        try:
            with self._downloadStream(jobStoreFileID, self.files) as source, \
                    open(localFilePath, 'wb') as sink:
                while True:
                    chunk = source.read(self._maxAzureBlockBytes)
                    # The final, empty chunk is written too before the loop ends.
                    sink.write(chunk)
                    if len(chunk) == 0:
                        break
        except AzureMissingResourceHttpError:
            raise NoSuchFileException(jobStoreFileID)

    def deleteFile(self, jobStoreFileID):
        try:
            self.files.delete_blob(blob_name=str(jobStoreFileID))
            self._dissociateFileFromJob(jobStoreFileID)
        except AzureMissingResourceHttpError:
            pass

    def fileExists(self, jobStoreFileID):
        # As Azure doesn't have a blob_exists method (at least in the
        # python API) we just try to download the metadata, and hope
        # the metadata is small so the call will be fast.
        try:
            self.files.get_blob_metadata(blob_name=str(jobStoreFileID))
            return True
        except AzureMissingResourceHttpError:
            return False

    @contextmanager
    def writeFileStream(self, jobStoreID=None):
        # TODO: this (and all stream methods) should probably use the
        # Append Blob type, but that is not currently supported by the
        # Azure Python API.
        jobStoreFileID = self._newFileID()
        with self._uploadStream(jobStoreFileID, self.files) as fd:
            yield fd, jobStoreFileID
        self._associateFileWithJob(jobStoreFileID, jobStoreID)

    @contextmanager
    def updateFileStream(self, jobStoreFileID):
        with self._uploadStream(jobStoreFileID, self.files, checkForModification=True) as fd:
            yield fd

    def getEmptyFileStoreID(self, jobStoreID=None):
        jobStoreFileID = self._newFileID()
        with self._uploadStream(jobStoreFileID, self.files) as _:
            pass
        self._associateFileWithJob(jobStoreFileID, jobStoreID)
        return jobStoreFileID

    @contextmanager
    def readFileStream(self, jobStoreFileID):
        if not self.fileExists(jobStoreFileID):
            raise NoSuchFileException(jobStoreFileID)
        with self._downloadStream(jobStoreFileID, self.files) as fd:
            yield fd

    @contextmanager
    def writeSharedFileStream(self, sharedFileName, isProtected=None):
        assert self._validateSharedFileName(sharedFileName)
        sharedFileID = self._newFileID(sharedFileName)
        with self._uploadStream(sharedFileID, self.files, encrypted=isProtected) as fd:
            yield fd

    @contextmanager
    def readSharedFileStream(self, sharedFileName):
        assert self._validateSharedFileName(sharedFileName)
        sharedFileID = self._newFileID(sharedFileName)
        if not self.fileExists(sharedFileID):
            raise NoSuchFileException(sharedFileID)
        with self._downloadStream(sharedFileID, self.files) as fd:
            yield fd

    def writeStatsAndLogging(self, statsAndLoggingString):
        # TODO: would be a great use case for the append blobs, once implemented in the Azure SDK
        jobStoreFileID = self._newFileID()
        encrypted = self.keyPath is not None
        if encrypted:
            statsAndLoggingString = encryption.encrypt(statsAndLoggingString, self.keyPath)
        self.statsFiles.create_blob_from_text(blob_name=str(jobStoreFileID),
                                              text=statsAndLoggingString,
                                              metadata=dict(encrypted=str(encrypted)))
        self.statsFileIDs.insert_entity(entity={'RowKey': jobStoreFileID})

    def readStatsAndLogging(self, callback, readAll=False):
        """
        Pass each stats/logging blob to ``callback`` as a stream and return how many were passed.

        An entry counts as already read when its row key is longer than :attr:`jobIDLength`,
        i.e. carries the '_old' suffix. Unread entries are always delivered and then re-keyed
        with the suffix; already-read entries are delivered only when ``readAll`` is true.
        """
        readMarker = '_old'
        delivered = 0
        for attempt in retry_azure():
            with attempt:
                for row in self.statsFileIDs.query_entities():
                    rowKey = row.RowKey
                    alreadyRead = len(rowKey) > self.jobIDLength
                    if alreadyRead and not readAll:
                        continue
                    if alreadyRead:
                        # Strip the suffix to get the original ID
                        blobID = rowKey[:-len(readMarker)]
                    else:
                        blobID = rowKey
                    with self._downloadStream(blobID, self.statsFiles) as fd:
                        callback(fd)
                    if not alreadyRead:
                        # Mark this entity as read: insert the suffixed key first, then drop
                        # the original one.
                        self.statsFileIDs.insert_entity(entity={'RowKey': blobID + readMarker})
                        self.statsFileIDs.delete_entity(row_key=str(blobID))
                    delivered += 1
        return delivered

    # Timestamp pattern in strftime/strptime syntax. NOTE(review): no method in the visible
    # part of this class references it -- confirm whether it is still used.
    _azureTimeFormat = "%Y-%m-%dT%H:%M:%SZ"

    def getPublicUrl(self, jobStoreFileID):
        """
        Return the blob's URL with a read-only shared access signature appended.

        The signature is valid from five minutes before now until now plus
        ``self.publicUrlExpiration``.

        :raise NoSuchFileException: if Azure reports the blob as missing
        """
        blobName = str(jobStoreFileID)
        try:
            self.files.get_blob_properties(blob_name=blobName)
        except AzureMissingResourceHttpError:
            raise NoSuchFileException(jobStoreFileID)
        validFrom = datetime.utcnow() - timedelta(minutes=5)
        validUntil = datetime.utcnow() + self.publicUrlExpiration
        token = self.files.generate_blob_shared_access_signature(blob_name=blobName,
                                                                 permission=BlobPermissions.READ,
                                                                 start=validFrom,
                                                                 expiry=validUntil)
        baseUrl = self.files.make_blob_url(blob_name=blobName)
        return baseUrl + '?' + token

    def getSharedPublicUrl(self, sharedFileName):
        jobStoreFileID = self._newFileID(sharedFileName)
        return self.getPublicUrl(jobStoreFileID)

    def _newJobID(self):
        # raw UUIDs don't work for Azure property names because the '-' character is disallowed.
        return str(uuid.uuid4()).replace('-', '_')

    # A dummy job ID under which all shared files are stored. _newFileID passes it to
    # uuid.uuid5 as the namespace, so a given shared file name always maps to the same file ID.
    sharedFileJobID = uuid.UUID('891f7db6-e4d9-4221-a58e-ab6cc4395f94')

    def _newFileID(self, sharedFileName=None):
        if sharedFileName is None:
            ret = str(uuid.uuid4())
        else:
            ret = str(uuid.uuid5(self.sharedFileJobID, sharedFileName))
        return ret.replace('-', '_')

    def _associateFileWithJob(self, jobStoreFileID, jobStoreID=None):
        if jobStoreID is not None:
            self.jobFileIDs.insert_entity(entity={'PartitionKey': EntityProperty('Edm.String', jobStoreID),
                                                  'RowKey': EntityProperty('Edm.String', jobStoreFileID)})

    def _dissociateFileFromJob(self, jobStoreFileID):
        entities = list(self.jobFileIDs.query_entities(filter="RowKey eq '%s'" % jobStoreFileID))
        if entities:
            assert len(entities) == 1
            jobStoreID = entities[0].PartitionKey
            self.jobFileIDs.delete_entity(partition_key=str(jobStoreID), row_key=str(jobStoreFileID))

    def _bindTable(self, tableName, create=False):
        """
        Return an AzureTable for ``tableName``, or None if it is missing and ``create`` is false.

        A missing-resource error with status 404 from the existence check is treated the same
        as "does not exist"; any other status is re-raised.
        """
        for attempt in retry_azure():
            with attempt:
                try:
                    present = self.tableService.exists(table_name=tableName)
                except AzureMissingResourceHttpError as e:
                    if e.status_code != 404:
                        raise
                    present = False
                if present:
                    return AzureTable(self.tableService, tableName)
                if not create:
                    return None
                self.tableService.create_table(tableName)
                return AzureTable(self.tableService, tableName)

    def _bindContainer(self, containerName, create=False):
        """
        Return an AzureBlobContainer for ``containerName``, or None if it is missing and
        ``create`` is false.

        Existence is probed by fetching the container properties; a 404 missing-resource error
        means "missing", any other status is re-raised.
        """
        for attempt in retry_azure():
            with attempt:
                try:
                    self.blobService.get_container_properties(containerName)
                except AzureMissingResourceHttpError as e:
                    if e.status_code != 404:
                        raise
                    if not create:
                        return None
                    self.blobService.create_container(containerName)
        return AzureBlobContainer(self.blobService, containerName)

    def _sanitizeTableName(self, tableName):
        """
        Azure table names must start with a letter and be alphanumeric.

        This will never cause a collision if uuids are used, but
        otherwise may not be safe.
        """
        return 'a' + ''.join([x for x in tableName if x.isalnum()])

    # Maximum bytes that can be in any block of an Azure block blob
    # https://github.com/Azure/azure-storage-python/blob/4c7666e05a9556c10154508335738ee44d7cb104/azure/storage/blob/blobservice.py#L106
    # Also used as the chunk size for local file copies and ranged downloads in this class.
    _maxAzureBlockBytes = 4 * 1024 * 1024

    @contextmanager
    def _uploadStream(self, jobStoreFileID, container, checkForModification=False, encrypted=None):
        """
        Yield a writable pipe whose contents are uploaded to the blob named after
        ``jobStoreFileID`` as a list of blocks, committed once all data has been read.

        :param jobStoreFileID: ID of the file; ``str(jobStoreFileID)`` is the blob name.
        :param container: the container object to upload into.
        :param checkForModification: if true, the blob's etag is recorded before the upload and
        compared again under a lease before committing; a mismatch raises
        ConcurrentFileModificationException. If the blob did not exist beforehand, the commit
        is unconditional.
        :param encrypted: True to enforce encryption (will raise exception unless key is set),
        False to prevent encryption or None to encrypt if key is set.
        :raise RuntimeError: if encryption is enforced but no key is configured.
        """
        if checkForModification:
            try:
                expectedVersion = container.get_blob_properties(blob_name=str(jobStoreFileID)).properties.etag
            except AzureMissingResourceHttpError:
                # The blob does not exist yet, so there is no version to compare against.
                expectedVersion = None
        # NOTE: expectedVersion is only bound when checkForModification is true; the check
        # further down short-circuits on checkForModification before reading it.

        if encrypted is None:
            encrypted = self.keyPath is not None
        elif encrypted:
            if self.keyPath is None:
                raise RuntimeError('Encryption requested but no key was provided')

        maxBlockSize = self._maxAzureBlockBytes
        if encrypted:
            # There is a small overhead for encrypted data.
            maxBlockSize -= encryption.overhead

        # Inside UploadPipe, ``self`` is the pipe; keep a handle on the job store.
        store = self

        class UploadPipe(WritablePipe):

            def readFrom(self, readable):
                # Blocks uploaded so far, in order; committed as one list at the end.
                blocks = []
                try:
                    while True:
                        buf = readable.read(maxBlockSize)
                        if len(buf) == 0:
                            # We're safe to break here even if we never read anything, since
                            # putting an empty block list creates an empty blob.
                            break
                        if encrypted:
                            buf = encryption.encrypt(buf, store.keyPath)
                        # Each block gets a fresh random ID.
                        blockID = store._newFileID()
                        container.put_block(blob_name=str(jobStoreFileID),
                                            block=buf,
                                            block_id=blockID)
                        blocks.append(BlobBlock(blockID))
                except:
                    # NOTE(review): presumably panic() re-raises the original exception after
                    # the cleanup below has run -- confirm; otherwise execution would fall
                    # through to the commit.
                    with panic(log=logger):
                        # This is guaranteed to delete any uncommitted blocks.
                        container.delete_blob(blob_name=str(jobStoreFileID))

                if checkForModification and expectedVersion is not None:
                    # Acquire a (60-second) write lock,
                    leaseID = container.acquire_blob_lease(blob_name=str(jobStoreFileID),
                                                           lease_duration=60)
                    # check for modification,
                    blob = container.get_blob_properties(blob_name=str(jobStoreFileID))
                    if blob.properties.etag != expectedVersion:
                        container.release_blob_lease(blob_name=str(jobStoreFileID), lease_id=leaseID)
                        raise ConcurrentFileModificationException(jobStoreFileID)
                    # commit the file,
                    # (if this call raises, the lease is not released explicitly here)
                    container.put_block_list(blob_name=str(jobStoreFileID),
                                             block_list=blocks,
                                             lease_id=leaseID,
                                             metadata=dict(encrypted=str(encrypted)))
                    # then release the lock.
                    container.release_blob_lease(blob_name=str(jobStoreFileID), lease_id=leaseID)
                else:
                    # No need to check for modification, just blindly write over whatever
                    # was there.
                    container.put_block_list(blob_name=str(jobStoreFileID),
                                             block_list=blocks,
                                             metadata=dict(encrypted=str(encrypted)))

        with UploadPipe() as writable:
            yield writable

    @contextmanager
    def _downloadStream(self, jobStoreFileID, container):
        """
        Yield a readable pipe that delivers the blob's contents, fetched in ranged chunks of
        ``_maxAzureBlockBytes`` and decrypted chunk by chunk when the blob is marked encrypted.

        :raise AssertionError: if the blob is marked encrypted but no key is configured
        """
        # Fetching the properties here rather than inside the pipe's writer means a
        # non-existent blob is caught early.
        blob = container.get_blob_properties(blob_name=str(jobStoreFileID))

        encrypted = strict_bool(blob.metadata['encrypted'])
        if encrypted and self.keyPath is None:
            raise AssertionError('Content is encrypted but no key was provided.')

        # Inside DownloadPipe, ``self`` is the pipe; keep a handle on the job store.
        store = self

        class DownloadPipe(ReadablePipe):
            def writeTo(self, writable):
                totalBytes = blob.properties.content_length
                offset = 0
                while offset < totalBytes:
                    # Range bounds are inclusive, hence the -1.
                    lastByte = offset + store._maxAzureBlockBytes - 1
                    fetched = container.get_blob_to_bytes(blob_name=str(jobStoreFileID),
                                                          start_range=offset,
                                                          end_range=lastByte)
                    data = fetched.content
                    if encrypted:
                        data = encryption.decrypt(data, store.keyPath)
                    writable.write(data)
                    offset = lastByte + 1

        with DownloadPipe() as readable:
            yield readable