class _BlobStorageTestCase(_TestCase): def _get_container_name(self, handler_name): container = _get_handler_config_value(handler_name, 'container') if container: container = container.replace('_', '-').lower() return container def setUp(self): self.service = BlobService(ACCOUNT_NAME, ACCOUNT_KEY) # ensure that there's no log file in the container before each test containers = [c.name for c in self.service.list_containers()] for handler in LOGGING['handlers']: container = self._get_container_name(handler) if container in containers: filename = _get_handler_config_value(handler, 'filename') basename = os.path.basename(filename) for blob in self.service.list_blobs(container, prefix=basename): self.service.delete_blob(container, blob.name)
# NOTE(review): this class is a token-for-token duplicate of the
# _BlobStorageTestCase defined immediately above — possibly an artifact of
# concatenating two copies of the same file. Confirm and remove one copy.
class _BlobStorageTestCase(_TestCase):
    # Test-case base: removes stale log blobs from each handler's container
    # before every test run.

    def _get_container_name(self, handler_name):
        # Map the handler's configured container name to a valid Azure
        # container name (underscores -> hyphens, lower-case).
        container = _get_handler_config_value(handler_name, 'container')
        if container:
            container = container.replace('_', '-').lower()
        return container

    def setUp(self):
        self.service = BlobService(ACCOUNT_NAME, ACCOUNT_KEY)
        # ensure that there's no log file in the container before each test
        containers = [c.name for c in self.service.list_containers()]
        for handler in LOGGING['handlers']:
            container = self._get_container_name(handler)
            if container in containers:
                filename = _get_handler_config_value(handler, 'filename')
                basename = os.path.basename(filename)
                # Delete every blob derived from this handler's log file.
                for blob in self.service.list_blobs(container, prefix=basename):
                    self.service.delete_blob(container, blob.name)
def module_impl(rm, log, params, check_mode=False):
    """Create, update, delete and query Azure blob containers and blobs.

    :param rm: resource-manager helper exposing ``storage_client``.
    :param log: callable used for debug logging — ``log(message)``.
    :param params: dict of module parameters. ``mode`` selects the action:
        create, update, delete, delete_blob, put, list, get, get_url,
        get_token.
    :param check_mode: when True, report what would change without calling
        any mutating storage-service operation.
    :returns: ``results`` dict containing at least ``changed`` and, for most
        modes, ``msg`` plus mode-specific facts.
    :raises Exception: on parameter errors or Azure API failures.
    """
    if not HAS_AZURE:
        raise Exception("The Azure python sdk is not installed (try 'pip install azure')")
    if not HAS_REQUESTS:
        raise Exception("The requests python module is not installed (try 'pip install requests')")

    resource_group = params.get('resource_group')
    account_name = params.get('account_name')
    container_name = params.get('container_name')
    mode = params.get('mode')
    x_ms_meta_name_values = params.get('x_ms_meta_name_values')
    x_ms_blob_public_access = params.get('x_ms_blob_public_access')
    x_ms_blob_cache_control = params.get('x_ms_blob_cache_control')
    x_ms_blob_content_encoding = params.get('x_ms_blob_content_encoding')
    x_ms_blob_content_language = params.get('x_ms_blob_content_language')
    x_ms_blob_content_type = params.get('x_ms_blob_content_type')
    prefix = params.get('prefix')
    marker = params.get('marker')
    max_results = params.get('max_results')
    blob_name = params.get('blob_name')
    file_path = params.get('file_path')
    overwrite = params.get('overwrite')
    permissions = params.get('permissions')
    hours = params.get('hours')
    days = params.get('days')
    access_token = params.get('access_token')

    results = dict(changed=False)
    storage_client = rm.storage_client

    # Mandatory parameters.
    if not resource_group:
        raise Exception("Parameter error: resource_group cannot be None.")
    if not account_name:
        raise Exception("Parameter error: account_name cannot be None.")
    if not container_name:
        raise Exception("Parameter error: container_name cannot be None.")
    if not NAME_PATTERN.match(container_name):
        raise Exception("Parameter error: container_name must consist of lowercase letters, "
                        "numbers and hyphens. It must begin with a letter or number. "
                        "It may not contain two consecutive hyphens.")
    # TODO: add file path validation

    results['account_name'] = account_name
    results['resource_group'] = resource_group
    results['container_name'] = container_name

    # Fetch the storage account keys; key1 is used to build the blob client.
    try:
        log('Getting keys')
        keys = {}
        response = storage_client.storage_accounts.list_keys(resource_group, account_name)
        keys[KeyName.key1] = response.storage_account_keys.key1
        keys[KeyName.key2] = response.storage_account_keys.key2
    except AzureHttpError as e:
        log('Error getting keys for account %s' % account_name)
        raise Exception(str(e.message))

    try:
        log('Create blob service')
        bs = BlobService(account_name, keys[KeyName.key1])
    except Exception as e:
        log('Error creating blob service.')
        raise Exception(str(e.args[0]))

    if mode == 'create':
        container = get_container_facts(bs, container_name)
        if container is not None:
            # container exists — nothing to do
            results['container'] = container
            results['msg'] = "Container already exists."
            return results
        # create the container
        if not check_mode:
            log('Create container %s' % container_name)
            bs.create_container(container_name, x_ms_meta_name_values, x_ms_blob_public_access)
        results['container'] = get_container_facts(bs, container_name)
        results['msg'] = "Container created successfully."
        results['changed'] = True
        return results

    if mode == 'update':
        container = get_container_facts(bs, container_name)
        if container is None:
            # container does not exist — create it instead of updating
            if not check_mode:
                log('Create container %s' % container_name)
                bs.create_container(container_name, x_ms_meta_name_values, x_ms_blob_public_access)
            results['changed'] = True
            results['msg'] = 'Container created successfully.'
            return results
        # update existing container
        results['msg'] = "Container not changed."
        if x_ms_meta_name_values:
            if not check_mode:
                log('Update x_ms_meta_name_values for container %s' % container_name)
                bs.set_container_metadata(container_name, x_ms_meta_name_values)
            results['changed'] = True
            results['msg'] = 'Container meta data updated successfully.'
        if x_ms_blob_public_access:
            access = x_ms_blob_public_access
            if x_ms_blob_public_access == 'private':
                # 'private' means no public access level at the API layer.
                access = None
            if not check_mode:
                log('Set access to %s for container %s' % (access, container_name))
                bs.set_container_acl(container_name=container_name, x_ms_blob_public_access=access)
            results['changed'] = True
            results['msg'] = 'Container ACL updated successfully.'
        if permissions:
            if hours == 0 and days == 0:
                raise Exception("Parameter error: expecting hours > 0 or days > 0")
            # Renamed from `id` to avoid shadowing the builtin.
            policy_id = "%s-%s" % (container_name, permissions)
            si = get_identifier(policy_id, hours, days, permissions)
            identifiers = SignedIdentifiers()
            identifiers.signed_identifiers.append(si)
            if not check_mode:
                log('Set permissions to %s for container %s' % (permissions, container_name))
                bs.set_container_acl(container_name=container_name, signed_identifiers=identifiers)
            results['changed'] = True
            results['msg'] = 'Container ACL updated successfully.'
        results['container'] = get_container_facts(bs, container_name)
        return results

    if mode == 'delete':
        container = get_container_facts(bs, container_name)
        if container is None:
            results['msg'] = "Container %s could not be found." % container_name
            return results
        if not check_mode:
            log('Deleting container %s' % container_name)
            bs.delete_container(container_name)
        results['changed'] = True
        results['msg'] = 'Container deleted successfully.'
        return results

    if mode == 'delete_blob':
        if blob_name is None:
            raise Exception("Parameter error: blob_name cannot be None.")
        container = container_check(bs, container_name)
        blob = get_blob_facts(bs, container_name, blob_name)
        if not blob:
            results['msg'] = 'Blob %s could not be found in container %s.' % (blob_name,
                                                                              container_name)
            return results
        if not check_mode:
            # Fixed message typo ("Deleteing").
            log('Deleting %s from container %s.' % (blob_name, container_name))
            bs.delete_blob(container_name, blob_name)
        results['changed'] = True
        results['msg'] = 'Blob successfully deleted.'
        return results

    if mode == 'put':
        if not blob_name:
            raise Exception("Parameter error: blob_name cannot be None.")
        if not file_path:
            raise Exception("Parameter error: file_path cannot be None.")
        if not path_check(file_path):
            raise Exception("File %s does not exist." % file_path)
        container = get_container_facts(bs, container_name)
        blob = None
        if container is not None:
            blob = get_blob_facts(bs, container_name, blob_name)
        if container is not None and blob is not None:
            # Both container and blob already exist: compare checksums to
            # decide whether to re-upload.
            md5_remote = blob['content-md5']
            md5_local = get_md5(file_path)
            results['container'] = container
            results['blob'] = blob
            if md5_local == md5_remote:
                sum_matches = True
                results['msg'] = 'File checksums match. File not uploaded.'
                if overwrite == 'always':
                    if not check_mode:
                        log('Uploading %s to container %s.' % (file_path, container_name))
                        put_block_blob(
                            bs,
                            container_name,
                            blob_name,
                            file_path,
                            x_ms_meta_name_values,
                            x_ms_blob_cache_control,
                            x_ms_blob_content_encoding,
                            x_ms_blob_content_language,
                            x_ms_blob_content_type
                        )
                        results['blob'] = get_blob_facts(bs, container_name, blob_name)
                    results['changed'] = True
                    results['msg'] = 'File successfully uploaded.'
            else:
                sum_matches = False
                if overwrite in ('always', 'different'):
                    if not check_mode:
                        log('Uploading %s to container %s.' % (file_path, container_name))
                        put_block_blob(
                            bs,
                            container_name,
                            blob_name,
                            file_path,
                            x_ms_meta_name_values,
                            x_ms_blob_cache_control,
                            x_ms_blob_content_encoding,
                            x_ms_blob_content_language,
                            x_ms_blob_content_type
                        )
                        results['blob'] = get_blob_facts(bs, container_name, blob_name)
                    results['changed'] = True
                    results['msg'] = 'File successfully uploaded.'
                else:
                    results['msg'] = ("WARNING: Checksums do not match. Use overwrite parameter "
                                      "to force upload.")
            return results
        if container is None:
            # container does not exist. create container and upload.
            if not check_mode:
                log('Creating container %s.' % container_name)
                bs.create_container(container_name, x_ms_meta_name_values, x_ms_blob_public_access)
                log('Uploading %s to container %s.' % (file_path, container_name))
                put_block_blob(
                    bs,
                    container_name,
                    blob_name,
                    file_path,
                    x_ms_meta_name_values,
                    x_ms_blob_cache_control,
                    x_ms_blob_content_encoding,
                    x_ms_blob_content_language,
                    x_ms_blob_content_type
                )
                # Fixed results key typo ('conainer').
                results['container'] = get_container_facts(bs, container_name)
                results['blob'] = get_blob_facts(bs, container_name, blob_name)
            results['changed'] = True
            results['msg'] = 'Successfully created container and uploaded file.'
            return results
        if container is not None:
            # container exists. just upload.
            if not check_mode:
                log('Uploading %s to container %s.' % (file_path, container_name))
                put_block_blob(
                    bs,
                    container_name,
                    blob_name,
                    file_path,
                    x_ms_meta_name_values,
                    x_ms_blob_cache_control,
                    x_ms_blob_content_encoding,
                    x_ms_blob_content_language,
                    x_ms_blob_content_type
                )
                results['blob'] = get_blob_facts(bs, container_name, blob_name)
            results['changed'] = True
            # Fixed message typo ("updloaded").
            results['msg'] = 'Successfully uploaded file.'
            return results

    if mode == 'list':
        container = container_check(bs, container_name)
        response = bs.list_blobs(
            container_name,
            prefix,
            marker,
            max_results
        )
        results['blobs'] = []
        for blob in response.blobs:
            b = dict(
                name=blob.name,
                snapshot=blob.snapshot,
                last_modified=blob.properties.last_modified,
                content_length=blob.properties.content_length,
                blob_type=blob.properties.blob_type,
            )
            results['blobs'].append(b)
        return results

    if mode == 'get':
        if file_path is None:
            raise Exception("Parameter error: file_path cannot be None.")
        container = container_check(bs, container_name)
        blob = blob_check(bs, container_name, blob_name)
        path_exists = path_check(file_path)
        if not path_exists or overwrite == 'always':
            if not check_mode:
                bs.get_blob_to_path(container_name, blob_name, file_path)
            results['changed'] = True
            results['msg'] = "Blob %s successfully downloaded to %s." % (blob_name, file_path)
            return results
        # Local file already exists: compare checksums before overwriting.
        if path_exists:
            md5_remote = blob['content-md5']
            md5_local = get_md5(file_path)
            if md5_local == md5_remote:
                sum_matches = True
                if overwrite == 'always':
                    if not check_mode:
                        bs.get_blob_to_path(container_name, blob_name, file_path)
                    results['changed'] = True
                    results['msg'] = "Blob %s successfully downloaded to %s." % (blob_name,
                                                                                file_path)
                else:
                    results['msg'] = ("Local and remote object are identical, ignoring. "
                                      "Use overwrite parameter to force.")
            else:
                sum_matches = False
                if overwrite in ('always', 'different'):
                    if not check_mode:
                        bs.get_blob_to_path(container_name, blob_name, file_path)
                    results['changed'] = True
                    results['msg'] = "Blob %s successfully downloaded to %s." % (blob_name,
                                                                                file_path)
                else:
                    results['msg'] = ("WARNING: Checksums do not match. Use overwrite parameter "
                                      "to force download.")
        if sum_matches is True and overwrite == 'never':
            results['msg'] = ("Local and remote object are identical, ignoring. "
                              "Use overwrite parameter to force.")
        return results

    if mode == 'get_url':
        if not blob_name:
            raise Exception("Parameter error: blob_name cannot be None.")
        container = container_check(bs, container_name)
        blob = blob_check(bs, container_name, blob_name)
        url = bs.make_blob_url(
            container_name=container_name,
            blob_name=blob_name,
            sas_token=access_token)
        results['url'] = url
        results['msg'] = "Url: %s" % url
        return results

    if mode == 'get_token':
        if hours == 0 and days == 0:
            raise Exception("Parameter error: expecting hours > 0 or days > 0")
        container = container_check(bs, container_name)
        blob = blob_check(bs, container_name, blob_name)
        results['blob_name'] = blob_name
        sap = get_shared_access_policy(permissions, hours=hours, days=days)
        token = bs.generate_shared_access_signature(container_name, blob_name, sap)
        results['access_token'] = token
        return results
# Walk-through of the legacy BlobService API: create a container, make it
# public, upload a file, list blobs (following continuation markers),
# download, then delete the blob.
service = BlobService(account_name="<account_name>", account_key="<account_key>")

service.create_container("datacontainer")
# Re-create with full public (container-level) read access.
service.create_container("datacontainer", x_ms_blob_public_access="container")
service.set_container_acl("datacontainer", x_ms_blob_public_access="container")

service.put_block_blob_from_path(
    "datacontainer",
    "datablob",
    "StorageClientPy.py",
    x_ms_blob_content_type="text/x-script.phyton"
)

# Page through the listing until the service stops returning a marker.
all_blobs = []
continuation = None
while True:
    page = service.list_blobs("datacontainer", marker=continuation)
    all_blobs.extend(page)
    if not page.next_marker:
        break
    continuation = page.next_marker
for blob in all_blobs:
    print(blob.name)

service.get_blob_to_path("datacontainer", "datablob", "out-StorageClientPy.py")
service.delete_blob("datacontainer", "datablob")
def clearAll(): blob_service = BlobService(account_name, account_key) blob_list = getBlobList(blob_service) print blob_list for blob in blob_list: blob_service.delete_blob(container_name, blob)
def delete_object(container, name):
    """Delete blob *name* from *container* in the configured storage account."""
    service = BlobService(AZURE_ACCOUNT_NAME, AZURE_ACCOUNT_KEY)
    service.delete_blob(container, name)
move(fullFilePath, stagingDir) # the super secret location accountKeyFile = open(azureKeyLocation, 'r') accountKey = accountKeyFile.read().strip() accountKeyFile.close() # Get a handle on the Azure Blob Storage account azureStorage = BlobService(account_name=azureAccount, account_key=accountKey) checksumFilename = md5FullFilePath + ".md5" # Ensure a clean slate for pushing the new data set try: azureStorage.delete_blob(ingestContainer, filename) theLog.write("Existing ingest data blob found, deleting it\n\n") theLog.flush() if not isClaims: azureStorage.delete_blob(ingestContainer, filename.split(".")[0] + ".md5") theLog.write("Existing ingest checksum blob found, deleting it\n\n") theLog.flush() except AzureMissingResourceHttpError: pass # Try to put the blob out in the wild, provide MD5 for error # checking since M$ didn't feel the need to implement a return # code for this function # On further testing, the "content_md5" is only for header rather
class AzureFS(LoggingMixIn, Operations):
    """Azure Blob Storage filesystem"""
    # Class-level caches; __init__ replaces blobs, rebuild_container_list
    # replaces containers, fds/fd track open file handles.
    blobs = None
    containers = dict()  # <cname, dict(stat:dict, #files:None|dict<fname, stat>)
    fds = dict()  # <fd, (path, bytes, dirty)>
    fd = 0  # last file-descriptor number handed out

    def __init__(self, account, key):
        # Connect to the storage account and prime the container cache.
        self.blobs = BlobService(account, key)
        self.rebuild_container_list()

    def convert_to_epoch(self, date):
        """Converts Tue, 31 Jul 2012 07:17:34 GMT format to epoch"""
        return int(time.mktime(time.strptime(date, TIME_FORMAT)))

    def rebuild_container_list(self):
        # Rebuild the '/'-rooted container cache from the service listing.
        # Each entry maps '/<container>' -> dict(stat=..., files=None);
        # files=None means "contents not fetched yet".
        cmap = dict()
        cnames = set()
        for c in self.blobs.list_containers():
            date = c.properties.last_modified
            cstat = dict(st_mode=(S_IFDIR | 0755), st_uid=getuid(), st_size=0,
                         st_mtime=self.convert_to_epoch(date))
            cname = c.name
            cmap['/' + cname] = dict(stat=cstat, files=None)
            cnames.add(cname)
        # Synthetic root directory entry.
        cmap['/'] = dict(files={},
                         stat=dict(st_mode=(S_IFDIR | 0755),
                                   st_uid=getuid(), st_size=0,
                                   st_mtime=int(time.time())))
        self.containers = cmap  # destroys fs tree cache resistant to misses

    def _parse_path(self, path):  # returns </dir, file(=None)>
        # Split an absolute path into (directory, filename-or-None).
        if path.count('/') > 1:  # file
            return str(path[:path.rfind('/')]), str(path[path.rfind('/') + 1:])
        else:  # dir
            pos = path.rfind('/', 1)
            if pos == -1:
                return path, None
            else:
                return str(path[:pos]), None

    def parse_container(self, path):
        # First path component is the container name.
        base_container = path[1:]  # /abc/def/g --> abc
        if base_container.find('/') > -1:
            base_container = base_container[:base_container.find('/')]
        return str(base_container)

    def _get_dir(self, path, contents_required=False):
        # Return the cached directory entry for *path*, lazily fetching the
        # container's blob list when contents are required but not cached.
        if not self.containers:
            self.rebuild_container_list()
        if path in self.containers and not (contents_required and
                                            self.containers[path]['files'] is None):
            return self.containers[path]
        cname = self.parse_container(path)
        if '/' + cname not in self.containers:
            raise FuseOSError(ENOENT)
        else:
            if self.containers['/' + cname]['files'] is None:
                # fetch contents of container
                log.info("------> CONTENTS NOT FOUND: %s" % cname)
                blobs = self.blobs.list_blobs(cname)
                # NOTE(review): dirstat is built but never used — dead local.
                dirstat = dict(st_mode=(S_IFDIR | 0755), st_size=0,
                               st_uid=getuid(), st_mtime=time.time())
                if self.containers['/' + cname]['files'] is None:
                    self.containers['/' + cname]['files'] = dict()
                for f in blobs:
                    blob_name = f.name
                    blob_date = f.properties.last_modified
                    blob_size = long(f.properties.content_length)
                    node = dict(st_mode=(S_IFREG | 0644), st_size=blob_size,
                                st_mtime=self.convert_to_epoch(blob_date),
                                st_uid=getuid())
                    if blob_name.find('/') == -1:  # file just under container
                        self.containers['/' + cname]['files'][blob_name] = node
            return self.containers['/' + cname]
        return None  # NOTE(review): unreachable — both branches above return/raise

    def _get_file(self, path):
        # Return the cached stat node for a file path, or None if unknown.
        d, f = self._parse_path(path)
        dir = self._get_dir(d, True)
        if dir is not None and f in dir['files']:
            return dir['files'][f]

    def getattr(self, path, fh=None):
        d, f = self._parse_path(path)
        if f is None:
            dir = self._get_dir(d)
            return dir['stat']
        else:
            file = self._get_file(path)
            if file:
                return file
        raise FuseOSError(ENOENT)

    # FUSE
    def mkdir(self, path, mode):
        if path.count('/') <= 1:  # create on root
            name = path[1:]
            # Azure container-name constraints, validated client-side.
            if not 3 <= len(name) <= 63:
                log.error("Container names can be 3 through 63 chars long.")
                raise FuseOSError(ENAMETOOLONG)
            # NOTE(review): `is not` compares identity, not equality —
            # `name != name.lower()` was probably intended; confirm.
            if name is not name.lower():
                log.error("Container names cannot contain uppercase \
characters.")
                raise FuseOSError(EACCES)
            if name.count('--') > 0:
                log.error('Container names cannot contain consecutive \
dashes (-).')
                raise FuseOSError(EAGAIN)
            #TODO handle all "-"s must be preceded by letter or numbers
            #TODO starts with only letter or number, can contain letter, nr,'-'
            resp = self.blobs.create_container(name)
            if resp:
                self.rebuild_container_list()
                log.info("CONTAINER %s CREATED" % name)
            else:
                raise FuseOSError(EACCES)
                # NOTE(review): unreachable — follows a raise.
                log.error("Invalid container name or container already \
exists.")
        else:
            raise FuseOSError(ENOSYS)  # TODO support 2nd+ level mkdirs

    def rmdir(self, path):
        if path.count('/') == 1:
            c_name = path[1:]
            resp = self.blobs.delete_container(c_name)
            if resp:
                if path in self.containers:
                    del self.containers[path]
            else:
                raise FuseOSError(EACCES)
        else:
            raise FuseOSError(ENOSYS)  # TODO support 2nd+ level mkdirs

    def create(self, path, mode):
        # Register an empty in-memory node, then open it with empty data.
        node = dict(st_mode=(S_IFREG | mode), st_size=0, st_nlink=1,
                    st_uid=getuid(), st_mtime=time.time())
        d, f = self._parse_path(path)
        if not f:
            log.error("Cannot create files on root level: /")
            raise FuseOSError(ENOSYS)
        dir = self._get_dir(d, True)
        if not dir:
            raise FuseOSError(EIO)
        dir['files'][f] = node
        return self.open(path, data='')  # reusing handler provider

    def open(self, path, flags=0, data=None):
        # data=None means "fetch from the service"; a string means the caller
        # already has the contents (used by create()).
        if data == None:  # download contents
            c_name = self.parse_container(path)
            f_name = path[path.find('/', 1) + 1:]
            try:
                data = self.blobs.get_blob(c_name, f_name)
            except AzureMissingResourceHttpError:
                dir = self._get_dir('/' + c_name, True)
                if f_name in dir['files']:
                    del dir['files'][f_name]
                raise FuseOSError(ENOENT)
            except AzureException as e:
                log.error("Read blob failed HTTP %d" % e.code)
                raise FuseOSError(EAGAIN)
        self.fd += 1
        self.fds[self.fd] = (path, data, False)
        return self.fd

    def flush(self, path, fh=None):
        # Upload the handle's buffered contents if it was written to.
        if not fh:
            raise FuseOSError(EIO)
        else:
            if fh not in self.fds:
                raise FuseOSError(EIO)
            path = self.fds[fh][0]
            data = self.fds[fh][1]
            dirty = self.fds[fh][2]
            if not dirty:
                return 0  # avoid redundant write
            d, f = self._parse_path(path)
            c_name = self.parse_container(path)
            if data is None:
                data = ''
            try:
                if len(data) < 64 * 1024 * 1024:  # 64 mb
                    self.blobs.put_blob(c_name, f, data, 'BlockBlob')
                else:
                    # divide file by blocks and upload
                    block_size = 8 * 1024 * 1024
                    num_blocks = int(math.ceil(len(data) * 1.0 / block_size))
                    rd = str(random.randint(1, 1e8))
                    block_ids = list()
                    for i in range(num_blocks):
                        part = data[i * block_size:min((i + 1) * block_size,
                                                       len(data))]
                        block_id = base64.encodestring('%s_%s' % (rd,
                                                       (8 - len(str(i))) * '0' + str(i)))
                        self.blobs.put_block(c_name, f, part, block_id)
                        block_ids.append(block_id)
                    self.blobs.put_block_list(c_name, f, block_ids)
            except AzureException:
                raise FuseOSError(EAGAIN)
            dir = self._get_dir(d, True)
            if not dir or f not in dir['files']:
                raise FuseOSError(EIO)
            # update local data
            dir['files'][f]['st_size'] = len(data)
            dir['files'][f]['st_mtime'] = time.time()
            self.fds[fh] = (path, data, False)  # mark as not dirty
            return 0

    def release(self, path, fh=None):
        if fh is not None and fh in self.fds:
            del self.fds[fh]

    def truncate(self, path, length, fh=None):
        return 0  # assume done, no need

    def write(self, path, data, offset, fh=None):
        # Buffer the write in memory and mark the handle dirty; flush()
        # performs the actual upload.
        if not fh or fh not in self.fds:
            raise FuseOSError(ENOENT)
        else:
            d = self.fds[fh][1]
            if d is None:
                d = ""
            self.fds[fh] = (self.fds[fh][0], d[:offset] + data, True)
            return len(data)

    def unlink(self, path):
        c_name = self.parse_container(path)
        d, f = self._parse_path(path)
        try:
            self.blobs.delete_blob(c_name, f)
            _dir = self._get_dir(path, True)
            if _dir and f in _dir['files']:
                del _dir['files'][f]
            return 0
        except AzureMissingResourceHttpError:
            raise FuseOSError(ENOENT)
        except Exception as e:
            raise FuseOSError(EAGAIN)

    def readdir(self, path, fh):
        if path == '/':
            # NOTE(review): `is not '/'` is an identity comparison; works only
            # by CPython string-interning accident — should be `!= '/'`.
            return ['.', '..'] + [x[1:] for x in self.containers.keys()
                                  if x is not '/']
        dir = self._get_dir(path, True)
        if not dir:
            raise FuseOSError(ENOENT)
        return ['.', '..'] + dir['files'].keys()

    def read(self, path, size, offset, fh):
        if not fh or fh not in self.fds:
            raise FuseOSError(ENOENT)
        f_name = path[path.find('/', 1) + 1:]
        c_name = path[1:path.find('/', 1)]
        try:
            data = self.blobs.get_blob(c_name, f_name)
            self.fds[fh] = (self.fds[fh][0], data, False)
            return data[offset:offset + size]
        except URLError, e:
            if e.code == 404:
                raise FuseOSError(ENOENT)
            elif e.code == 403:
                # NOTE(review): typo — FUSEOSError is undefined; should be
                # FuseOSError. As written this raises NameError instead.
                raise FUSEOSError(EPERM)
            else:
                log.error("Read blob failed HTTP %d" % e.code)
                raise FuseOSError(EAGAIN)
        # NOTE(review): unreachable — the try returns and every except raises.
        data = self.fds[fh][1]
        if data is None:
            data = ""
        return data[offset:offset + size]
class BlobSource(DataSource): def __init__(self): self.storage_account = getenv('STORAGE_ACCOUNT') self.blob_service = BlobService(self.storage_account, getenv('STORAGE_KEY')) def load(self, sparkContext, container, path): path = ('/' if path[0] != '/' else '') + path uri = 'wasb://%s@%s.blob.core.windows.net%s' % ( container, self.storage_account, path) print 'Loading from %s' % uri return sparkContext.textFile(uri) def download(self, container, path): print 'Downloading blob from %s/%s' % (container, path) self.blob_service.get_blob_to_path(container, path, path) print 'Downloaded blob to ' + path def saveAsJson(self, payload, container, path): path = path.lstrip('/') print path print 'Saving to %s/%s' % (container, path) json_string = json.dumps(payload, ensure_ascii=False).encode('utf-8') try: self.blob_service.put_blob( container, path, json_string, 'BlockBlob', x_ms_blob_cache_control='max-age=3600', x_ms_blob_content_type='application/json') except Exception as e: print 'Failed to save %s/%s: %s' % (container, path, str(e)) raise def saveAsText(self, rdd, container, path): path = path.lstrip('/') path = '/' + path print 'Saving rdd to %s%s' % (container, path) uri = 'wasb://%s@%s.blob.core.windows.net%s' % ( container, self.storage_account, path) try: rdd.saveAsTextFile(uri) except Exception as e: print 'Failed to save %s%s: %s' % (container, path, str(e)) raise def deleteAllBut(self, container, exceptFolderName): print 'deleteAllBut called' try: bloblistingresult = self.blob_service.list_blobs(container) for i in bloblistingresult: print i.name if not exceptFolderName in i.name: try: print 'deleting' self.blob_service.delete_blob(container, i.name) print 'deleted' except Exception as e: print 'Failed to delete %s/%s: %s' % (container, i.name, str(e)) raise except Exception as e: print 'Failed to list things in %s: %s' % (container, str(e)) raise
# NOTE(review): this class duplicates the AzureFS defined earlier in this file
# (same tokens, trivially different wrapping) — likely two copies of the same
# snippet were concatenated. Confirm and keep only one.
class AzureFS(LoggingMixIn, Operations):
    """Azure Blob Storage filesystem"""
    # Class-level caches; __init__ replaces blobs, rebuild_container_list
    # replaces containers, fds/fd track open file handles.
    blobs = None
    containers = dict()  # <cname, dict(stat:dict, #files:None|dict<fname, stat>)
    fds = dict()  # <fd, (path, bytes, dirty)>
    fd = 0  # last file-descriptor number handed out

    def __init__(self, account, key):
        # Connect to the storage account and prime the container cache.
        self.blobs = BlobService(account, key)
        self.rebuild_container_list()

    def convert_to_epoch(self, date):
        """Converts Tue, 31 Jul 2012 07:17:34 GMT format to epoch"""
        return int(time.mktime(time.strptime(date, TIME_FORMAT)))

    def rebuild_container_list(self):
        # Rebuild the '/'-rooted container cache from the service listing.
        cmap = dict()
        cnames = set()
        for c in self.blobs.list_containers():
            date = c.properties.last_modified
            cstat = dict(st_mode=(S_IFDIR | 0755), st_uid=getuid(), st_size=0,
                         st_mtime=self.convert_to_epoch(date))
            cname = c.name
            cmap['/' + cname] = dict(stat=cstat, files=None)
            cnames.add(cname)
        # Synthetic root directory entry.
        cmap['/'] = dict(files={},
                         stat=dict(st_mode=(S_IFDIR | 0755),
                                   st_uid=getuid(), st_size=0,
                                   st_mtime=int(time.time())))
        self.containers = cmap  # destroys fs tree cache resistant to misses

    def _parse_path(self, path):  # returns </dir, file(=None)>
        # Split an absolute path into (directory, filename-or-None).
        if path.count('/') > 1:  # file
            return str(path[:path.rfind('/')]), str(path[path.rfind('/') + 1:])
        else:  # dir
            pos = path.rfind('/', 1)
            if pos == -1:
                return path, None
            else:
                return str(path[:pos]), None

    def parse_container(self, path):
        # First path component is the container name.
        base_container = path[1:]  # /abc/def/g --> abc
        if base_container.find('/') > -1:
            base_container = base_container[:base_container.find('/')]
        return str(base_container)

    def _get_dir(self, path, contents_required=False):
        # Return the cached directory entry for *path*, lazily fetching the
        # container's blob list when contents are required but not cached.
        if not self.containers:
            self.rebuild_container_list()
        if path in self.containers and not (contents_required and
                                            self.containers[path]['files'] is None):
            return self.containers[path]
        cname = self.parse_container(path)
        if '/' + cname not in self.containers:
            raise FuseOSError(ENOENT)
        else:
            if self.containers['/' + cname]['files'] is None:
                # fetch contents of container
                log.info("------> CONTENTS NOT FOUND: %s" % cname)
                blobs = self.blobs.list_blobs(cname)
                # NOTE(review): dirstat is built but never used — dead local.
                dirstat = dict(st_mode=(S_IFDIR | 0755), st_size=0,
                               st_uid=getuid(), st_mtime=time.time())
                if self.containers['/' + cname]['files'] is None:
                    self.containers['/' + cname]['files'] = dict()
                for f in blobs:
                    blob_name = f.name
                    blob_date = f.properties.last_modified
                    blob_size = long(f.properties.content_length)
                    node = dict(st_mode=(S_IFREG | 0644), st_size=blob_size,
                                st_mtime=self.convert_to_epoch(blob_date),
                                st_uid=getuid())
                    if blob_name.find('/') == -1:  # file just under container
                        self.containers['/' + cname]['files'][blob_name] = node
            return self.containers['/' + cname]
        return None  # NOTE(review): unreachable — both branches above return/raise

    def _get_file(self, path):
        # Return the cached stat node for a file path, or None if unknown.
        d, f = self._parse_path(path)
        dir = self._get_dir(d, True)
        if dir is not None and f in dir['files']:
            return dir['files'][f]

    def getattr(self, path, fh=None):
        d, f = self._parse_path(path)
        if f is None:
            dir = self._get_dir(d)
            return dir['stat']
        else:
            file = self._get_file(path)
            if file:
                return file
        raise FuseOSError(ENOENT)

    # FUSE
    def mkdir(self, path, mode):
        if path.count('/') <= 1:  # create on root
            name = path[1:]
            # Azure container-name constraints, validated client-side.
            if not 3 <= len(name) <= 63:
                log.error("Container names can be 3 through 63 chars long.")
                raise FuseOSError(ENAMETOOLONG)
            # NOTE(review): `is not` compares identity, not equality —
            # `name != name.lower()` was probably intended; confirm.
            if name is not name.lower():
                log.error("Container names cannot contain uppercase \
characters.")
                raise FuseOSError(EACCES)
            if name.count('--') > 0:
                log.error('Container names cannot contain consecutive \
dashes (-).')
                raise FuseOSError(EAGAIN)
            #TODO handle all "-"s must be preceded by letter or numbers
            #TODO starts with only letter or number, can contain letter, nr,'-'
            resp = self.blobs.create_container(name)
            if resp:
                self.rebuild_container_list()
                log.info("CONTAINER %s CREATED" % name)
            else:
                raise FuseOSError(EACCES)
                # NOTE(review): unreachable — follows a raise.
                log.error("Invalid container name or container already \
exists.")
        else:
            raise FuseOSError(ENOSYS)  # TODO support 2nd+ level mkdirs

    def rmdir(self, path):
        if path.count('/') == 1:
            c_name = path[1:]
            resp = self.blobs.delete_container(c_name)
            if resp:
                if path in self.containers:
                    del self.containers[path]
            else:
                raise FuseOSError(EACCES)
        else:
            raise FuseOSError(ENOSYS)  # TODO support 2nd+ level mkdirs

    def create(self, path, mode):
        # Register an empty in-memory node, then open it with empty data.
        node = dict(st_mode=(S_IFREG | mode), st_size=0, st_nlink=1,
                    st_uid=getuid(), st_mtime=time.time())
        d, f = self._parse_path(path)
        if not f:
            log.error("Cannot create files on root level: /")
            raise FuseOSError(ENOSYS)
        dir = self._get_dir(d, True)
        if not dir:
            raise FuseOSError(EIO)
        dir['files'][f] = node
        return self.open(path, data='')  # reusing handler provider

    def open(self, path, flags=0, data=None):
        # data=None means "fetch from the service"; a string means the caller
        # already has the contents (used by create()).
        if data == None:  # download contents
            c_name = self.parse_container(path)
            f_name = path[path.find('/', 1) + 1:]
            try:
                data = self.blobs.get_blob(c_name, f_name)
            except AzureMissingResourceHttpError:
                dir = self._get_dir('/' + c_name, True)
                if f_name in dir['files']:
                    del dir['files'][f_name]
                raise FuseOSError(ENOENT)
            except AzureException as e:
                log.error("Read blob failed HTTP %d" % e.code)
                raise FuseOSError(EAGAIN)
        self.fd += 1
        self.fds[self.fd] = (path, data, False)
        return self.fd

    def flush(self, path, fh=None):
        # Upload the handle's buffered contents if it was written to.
        if not fh:
            raise FuseOSError(EIO)
        else:
            if fh not in self.fds:
                raise FuseOSError(EIO)
            path = self.fds[fh][0]
            data = self.fds[fh][1]
            dirty = self.fds[fh][2]
            if not dirty:
                return 0  # avoid redundant write
            d, f = self._parse_path(path)
            c_name = self.parse_container(path)
            if data is None:
                data = ''
            try:
                if len(data) < 64 * 1024 * 1024:  # 64 mb
                    self.blobs.put_blob(c_name, f, data, 'BlockBlob')
                else:
                    # divide file by blocks and upload
                    block_size = 8 * 1024 * 1024
                    num_blocks = int(math.ceil(len(data) * 1.0 / block_size))
                    rd = str(random.randint(1, 1e8))
                    block_ids = list()
                    for i in range(num_blocks):
                        part = data[i * block_size:min((i + 1) * block_size,
                                                       len(data))]
                        block_id = base64.encodestring(
                            '%s_%s' % (rd, (8 - len(str(i))) * '0' + str(i)))
                        self.blobs.put_block(c_name, f, part, block_id)
                        block_ids.append(block_id)
                    self.blobs.put_block_list(c_name, f, block_ids)
            except AzureException:
                raise FuseOSError(EAGAIN)
            dir = self._get_dir(d, True)
            if not dir or f not in dir['files']:
                raise FuseOSError(EIO)
            # update local data
            dir['files'][f]['st_size'] = len(data)
            dir['files'][f]['st_mtime'] = time.time()
            self.fds[fh] = (path, data, False)  # mark as not dirty
            return 0

    def release(self, path, fh=None):
        if fh is not None and fh in self.fds:
            del self.fds[fh]

    def truncate(self, path, length, fh=None):
        return 0  # assume done, no need

    def write(self, path, data, offset, fh=None):
        # Buffer the write in memory and mark the handle dirty; flush()
        # performs the actual upload.
        if not fh or fh not in self.fds:
            raise FuseOSError(ENOENT)
        else:
            d = self.fds[fh][1]
            if d is None:
                d = ""
            self.fds[fh] = (self.fds[fh][0], d[:offset] + data, True)
            return len(data)

    def unlink(self, path):
        c_name = self.parse_container(path)
        d, f = self._parse_path(path)
        try:
            self.blobs.delete_blob(c_name, f)
            _dir = self._get_dir(path, True)
            if _dir and f in _dir['files']:
                del _dir['files'][f]
            return 0
        except AzureMissingResourceHttpError:
            raise FuseOSError(ENOENT)
        except Exception as e:
            raise FuseOSError(EAGAIN)

    def readdir(self, path, fh):
        if path == '/':
            # NOTE(review): `is not '/'` is an identity comparison; works only
            # by CPython string-interning accident — should be `!= '/'`.
            return ['.', '..'] + [x[1:] for x in self.containers.keys()
                                  if x is not '/']
        dir = self._get_dir(path, True)
        if not dir:
            raise FuseOSError(ENOENT)
        return ['.', '..'] + dir['files'].keys()

    def read(self, path, size, offset, fh):
        if not fh or fh not in self.fds:
            raise FuseOSError(ENOENT)
        f_name = path[path.find('/', 1) + 1:]
        c_name = path[1:path.find('/', 1)]
        try:
            data = self.blobs.get_blob(c_name, f_name)
            self.fds[fh] = (self.fds[fh][0], data, False)
            return data[offset:offset + size]
        except URLError, e:
            if e.code == 404:
                raise FuseOSError(ENOENT)
            elif e.code == 403:
                # NOTE(review): typo — FUSEOSError is undefined; should be
                # FuseOSError. As written this raises NameError instead.
                raise FUSEOSError(EPERM)
            else:
                log.error("Read blob failed HTTP %d" % e.code)
                raise FuseOSError(EAGAIN)
        # NOTE(review): unreachable — the try returns and every except raises.
        data = self.fds[fh][1]
        if data is None:
            data = ""
        return data[offset:offset + size]
class AzureFS(LoggingMixIn, Operations):
    """Azure Blob Storage filesystem.

    Containers map to top-level directories; blobs map to files one level
    below.  Directory listings are cached in ``self.containers`` and filled
    asynchronously by a worker ``Process`` per container.
    """

    blobs = None
    containers = dict()  # {cname: {stat:dict, files:None|{fname: stat}}
    fd = 0  # monotonically increasing file-handle counter

    def __init__(self, account, key):
        self.blobs = BlobService(account, key)
        self._rebuild_container_list()

    def _rebuild_container_list(self):
        """Re-fetch the container list and reset the whole directory cache."""
        cmap = dict()
        cnames = set()
        for c in self.blobs.list_containers():
            # 0o755 == 0755: Py2.6+/Py3-compatible octal spelling.
            cstat = make_stat(stat.S_IFDIR | 0o755, c.properties)
            cname = c.name
            cmap['/' + cname] = dict(stat=cstat, files=None)
            cnames.add(cname)
        cmap['/'] = dict(files={}, stat=make_stat(stat.S_IFDIR | 0o755))
        self.containers = cmap  # destroys fs tree cache resistant to misses

    @staticmethod
    def _parse_path(path):
        """Split *path* into (directory, file); file is None for dirs."""
        if path.count('/') > 1:  # file
            return str(path[:path.rfind('/')]), str(path[path.rfind('/') + 1:])
        else:  # dir
            pos = path.rfind('/', 1)
            if pos == -1:
                return path, None
            else:
                return str(path[:pos]), None

    @staticmethod
    def _parse_container(path):
        """Return the container component of *path*: /abc/def/g --> abc."""
        base_container = path[1:]
        if base_container.find('/') > -1:
            base_container = base_container[:base_container.find('/')]
        return str(base_container)

    def _get_dir(self, path, contents_required=False, force=False):
        """Return the cached dir entry for *path*, fetching blobs if needed.

        When *contents_required* is set, a background listing Process is
        started (or joined if finished).  Raises FuseOSError(ENOENT) for an
        unknown container.
        """
        log.debug("get_dir: contents_required=%s, force=%s,"
                  " has_container=%s, path=%s",
                  "t" if contents_required else "f",
                  "t" if force else "f",
                  "t" if path in self.containers else "f", path)
        cname = self._parse_container(path)
        # BUG FIX: original indexed self.containers['/' + cname] directly,
        # so an unknown container raised KeyError out of a FUSE callback
        # instead of falling through to the ENOENT handling below.
        centry = self.containers.get('/' + cname)
        if centry is not None and centry.get('process') is not None:
            p = centry['process']
            if not p.is_alive():
                # reap the finished listing worker
                p.join()
                centry['process'] = None
        if not self.containers:
            log.info("get_dir: rebuilding container list")
            self._rebuild_container_list()
        if path in self.containers:
            container = self.containers[path]
            if not contents_required:
                return container
            if not force and container['files'] is not None:
                return container
        if '/' + cname not in self.containers:
            log.info("get_dir: no such container: /%s", cname)
            raise FuseOSError(errno.ENOENT)
        else:
            container = self.containers['/' + cname]
        try:
            log.info(">>>> %s - %s ", cname, container['process'])
        except KeyError:
            log.info(">>>> no process: %s " % cname)
        if container['files'] is None or force is True:
            # fetch contents of container
            log.info("Contents not found in the cache index: %s", cname)
            process = container.get('process', None)
            if process is not None and process.is_alive():
                # We do nothing. Some thread is still working,
                # getting list of blobs from the container.
                log.info("Fetching blob list for '%s' is already"
                         " handled by %s", cname, process)
            else:
                # No thread running for this container, launch a new one
                m = Manager()
                files = m.dict()
                process = Process(target=get_files_from_blob_service,
                                  args=(self.blobs, cname, files),
                                  name="list-blobs/%s" % cname)
                process.daemon = True
                process.start()
                container['process'] = process
                log.info("Started blob list retrieval for '%s': %s",
                         cname, process)
                container['files'] = files
        return container

    def _get_file(self, path):
        """Return the stat node for *path*, querying remote on cache miss.

        Negative results are remembered for 30 seconds to avoid hammering
        the service with repeated lookups of missing files.
        """
        d, f = self._parse_path(path)
        log.debug("get_file: requested path=%s (d=%s, f=%s)", path, d, f)
        directory = self._get_dir(d, True)
        files = None
        if directory is not None:
            files = directory['files']
            # BUG FIX: membership test moved under the None-guard; the
            # original could evaluate "f in None" -> TypeError.
            if f in files:
                return files[f]
        if not hasattr(self, "_get_file_noent"):
            self._get_file_noent = {}
        last_check = self._get_file_noent.get(path, 0)
        if time.time() - last_check <= 30:
            # Negative TTL is 30 seconds (hardcoded for now)
            log.info("get_file: cache says to reply negative for %s", path)
            return None
        # Check if file now exists and our caches are just stale.
        try:
            c = self._parse_container(d)
            p = path[path.find('/', 1) + 1:]
            props = self.blobs.get_blob_properties(c, p)
            log.info("get_file: found locally unknown remote file %s: %s",
                     path, repr(props))
            node = make_stat(stat.S_IFREG | 0o644, props)
            if node['st_size'] > 0:
                log.info("get_file: properties for %s: %s", path, repr(node))
                # Remember this, so we won't have to re-query it.
                # BUG FIX: guard against files being None (no cached dir).
                if files is not None:
                    files[f] = node
                if path in self._get_file_noent:
                    del self._get_file_noent[path]
                return node
            else:
                # TODO: FIXME: HACK: We currently ignore empty files.
                # Sometimes the file is not yet here and is still uploading.
                # Such files have "content-length: 0". Ignore those for now.
                log.warning("get_file: the file %s is not yet here (size=%s)",
                            path, node['st_size'])
                self._get_file_noent[path] = time.time()
                return None
        except AzureMissingResourceHttpError:
            log.info("get_file: remote confirms non-existence of %s", path)
            self._get_file_noent[path] = time.time()
            return None
        except AzureException as e:
            log.error("get_file: exception while querying remote for %s: %s",
                      path, repr(e))
            self._get_file_noent[path] = time.time()
            return None

    def getattr(self, path, fh=None):
        """Return the stat dict for a directory or file; ENOENT otherwise."""
        log.debug("getattr: path=%s", path)
        d, f = self._parse_path(path)
        if f is None:
            return self._get_dir(d)['stat']
        else:
            file_obj = self._get_file(path)
            if file_obj:
                return file_obj
            log.warning("getattr: no such file: %s", path)
            raise FuseOSError(errno.ENOENT)

    def mkdir(self, path, mode):
        """Create a container for a top-level dir; deeper dirs unsupported."""
        if path.count('/') <= 1:  # create on root
            name = path[1:]
            if not 3 <= len(name) <= 63:
                log.error("Container names can be 3 through 63 chars long")
                raise FuseOSError(errno.ENAMETOOLONG)
            if not re.match(RE_CONTAINER_NAME, name):
                log.error("Invalid container name: '%s'", name)
                raise FuseOSError(errno.EACCES)
            resp = self.blobs.create_container(name)
            if resp:
                self._rebuild_container_list()
                log.info("CONTAINER %s CREATED", name)
            else:
                log.error("Invalid container name or container already exists")
                raise FuseOSError(errno.EACCES)
        else:
            # TODO: Support 2nd+ level directory creation
            raise FuseOSError(errno.ENOSYS)

    def rmdir(self, path):
        """Delete a top-level container; deeper dirs unsupported."""
        if path.count('/') == 1:
            c_name = path[1:]
            resp = self.blobs.delete_container(c_name)
            if resp:
                if path in self.containers:
                    del self.containers[path]
            else:
                raise FuseOSError(errno.EACCES)
        else:
            # TODO: Support 2nd+ level directories
            raise FuseOSError(errno.ENOSYS)

    def create(self, path, mode, fi=None):
        """Register an empty file node in the cache and open it.

        The magic name ".__refresh_cache__" forces a re-listing of the
        containing directory instead of creating a file.
        """
        node = make_stat(stat.S_IFREG | mode)
        d, f = self._parse_path(path)
        if not f:
            log.error("Cannot create files on root level: /")
            raise FuseOSError(errno.ENOSYS)
        if f == ".__refresh_cache__":
            log.info("Refresh cache forced (%s)" % f)
            self._get_dir(path, True, True)
            return self.open(path, data='')
        directory = self._get_dir(d, True)
        if not directory:
            raise FuseOSError(errno.EIO)
        directory['files'][f] = node
        return self.open(path, data='')  # reusing handler provider

    def open(self, path, flags=0, data=None):
        """Validate remote existence (unless *data* given); return a handle."""
        log.info("open: path=%s; flags=%s", path, flags)
        if data is None:
            # Download contents
            c_name = self._parse_container(path)
            f_name = path[path.find('/', 1) + 1:]
            try:
                self.blobs.get_blob_metadata(c_name, f_name)
            except AzureMissingResourceHttpError:
                directory = self._get_dir('/' + c_name, True)
                if f_name in directory['files']:
                    del directory['files'][f_name]
                log.info("open: remote says there is no such file: c=%s f=%s",
                         c_name, f_name)
                raise FuseOSError(errno.ENOENT)
            except AzureHttpError as e:
                log.error("Read blob failed with HTTP %d", e.status_code)
                raise FuseOSError(errno.EAGAIN)
            except AzureException as e:
                log.exception("Read blob failed with exception: %s", repr(e))
                raise FuseOSError(errno.EAGAIN)
        self.fd += 1
        return self.fd

    def truncate(self, path, length, fh=None):
        """No-op: report success without touching remote data."""
        return 0  # assume done, no need

    def write(self, path, data, offset, fh=None):
        # TODO: Re-implement writing
        raise FuseOSError(errno.EPERM)

    def unlink(self, path):
        """Delete the blob remotely and evict it from the dir cache."""
        c_name = self._parse_container(path)
        d, f = self._parse_path(path)
        try:
            self.blobs.delete_blob(c_name, f)
            _dir = self._get_dir(path, True)
            if _dir and f in _dir['files']:
                del _dir['files'][f]
            return 0
        except AzureMissingResourceHttpError:
            raise FuseOSError(errno.ENOENT)
        except Exception:
            # BUG FIX: was a bare "except:", which also swallowed
            # SystemExit/KeyboardInterrupt.
            raise FuseOSError(errno.EAGAIN)

    def readdir(self, path, fh):
        """List container names at root, or cached file names otherwise."""
        if path == '/':
            return ['.', '..'] + [x[1:] for x in self.containers.keys()
                                  if x != '/']
        directory = self._get_dir(path, True)
        if not directory:
            log.info("readdir: no such file: %s", path)
            raise FuseOSError(errno.ENOENT)
        return ['.', '..'] + directory['files'].keys()

    def read(self, path, size, offset, fh):
        """Fetch *size* bytes at *offset* via a ranged blob download."""
        f_name = path[path.find('/', 1) + 1:]
        c_name = path[1:path.find('/', 1)]
        try:
            byte_range = "bytes=%s-%s" % (offset, offset + size - 1)
            log.debug("read range: %s", byte_range)
            data = self.blobs.get_blob(c_name, f_name, snapshot=None,
                                       x_ms_range=byte_range)
            return data
        except AzureHttpError as e:
            if e.status_code == 404:
                raise FuseOSError(errno.ENOENT)
            elif e.status_code == 403:
                raise FuseOSError(errno.EPERM)
            else:
                log.error("Read blob failed with HTTP %d", e.status_code)
                raise FuseOSError(errno.EAGAIN)

    def statfs(self, path):
        # sys.maxint is Python-2-only; kept since this module targets Py2.
        return dict(f_bsize=4096, f_blocks=1, f_bavail=sys.maxint)

    def rename(self, old, new):
        # TODO: Implement renaming
        raise FuseOSError(errno.ENOSYS)

    def symlink(self, target, source):
        raise FuseOSError(errno.ENOSYS)

    def getxattr(self, path, name, position=0):
        return ''

    def chmod(self, path, mode):
        pass

    def chown(self, path, uid, gid):
        pass
# Get a handle on the Azure Blob Storage account azureStorage = BlobService(account_name=azureAccount, account_key=accountKey) # Create a datestring for the filenames dateString = date.today().strftime("%Y%m%d") # Create a filename string (e.g. Allergies20151103) targetFile = "{0}{1}".format(dataSetType, dateString) # Create full paths for the location targetIngestFullPath = "{0}/{1}.txt".format(targetIngestPath, targetFile) # Ensure a clean slate for pushing the new data set try: azureStorage.delete_blob(ingestContainer, targetIngestFullPath) theLog.write("Existing ingest blob found, deleting it\n\n") theLog.flush() except AzureMissingResourceHttpError: pass # Try to put the blob out in the wild, provide MD5 for error # checking since M$ didn't feel the need to implement a return # code for this function # On further testing, the "content_md5" is only for header rather # than the actual blob content - have to wait for these APIs to mature try: azureStorage.put_block_blob_from_path(ingestContainer, targetIngestFullPath, fullFilePath,
class Command(BaseCommand): help = "Synchronizes static media to cloud files." option_list = BaseCommand.option_list + ( optparse.make_option('-w', '--wipe', action='store_true', dest='wipe', default=False, help="Wipes out entire contents of container first."), optparse.make_option('-t', '--test-run', action='store_true', dest='test_run', default=False, help="Performs a test run of the sync."), optparse.make_option('-c', '--container', dest='container', help="Override STATIC_CONTAINER."), ) # settings from azurite.settings ACCOUNT_NAME = AZURITE['ACCOUNT_NAME'] ACCOUNT_KEY = AZURITE['ACCOUNT_KEY'] STATIC_CONTAINER = AZURITE['STATIC_CONTAINER'] # paths DIRECTORY = os.path.abspath(settings.STATIC_ROOT) STATIC_URL = settings.STATIC_URL if not DIRECTORY.endswith('/'): DIRECTORY = DIRECTORY + '/' if STATIC_URL.startswith('/'): STATIC_URL = STATIC_URL[1:] local_object_names = [] create_count = 0 upload_count = 0 update_count = 0 skip_count = 0 delete_count = 0 service = None def handle(self, *args, **options): self.wipe = options.get('wipe') self.test_run = options.get('test_run') self.verbosity = int(options.get('verbosity')) if hasattr(options, 'container'): self.STATIC_CONTAINER = options.get('container') self.sync_files() def sync_files(self): self.service = BlobService(account_name=self.ACCOUNT_NAME, account_key=self.ACCOUNT_KEY) try: self.service.get_container_properties(self.STATIC_CONTAINER) except AzureMissingResourceHttpError: self.service.create_container(self.STATIC_CONTAINER, x_ms_blob_public_access='blob') self.service.set_container_acl(self.STATIC_CONTAINER, x_ms_blob_public_access='blob') # if -w option is provided, wipe out the contents of the container if self.wipe: blob_count = len(self.service.list_blobs(self.STATIC_CONTAINER)) if self.test_run: print "Wipe would delete %d objects." % blob_count else: print "Deleting %d objects..." 
% blob_count for blob in self.service.list_blobs(self.STATIC_CONTAINER): self.service.delete_blob(self.STATIC_CONTAINER, blob.name) # walk through the directory, creating or updating files on the cloud os.path.walk(self.DIRECTORY, self.upload_files, "foo") # remove any files on remote that don't exist locally self.delete_files() # print out the final tally to the cmd line self.update_count = self.upload_count - self.create_count print if self.test_run: print "Test run complete with the following results:" print "Skipped %d. Created %d. Updated %d. Deleted %d." % ( self.skip_count, self.create_count, self.update_count, self.delete_count) def upload_files(self, arg, dirname, names): # upload or skip items for item in names: file_path = os.path.join(dirname, item) if os.path.isdir(file_path): continue # Don't try to upload directories object_name = self.STATIC_URL + file_path.split(self.DIRECTORY)[1] self.local_object_names.append(object_name) try: properties = self.service.get_blob_properties(self.STATIC_CONTAINER, object_name) except AzureMissingResourceHttpError: properties = {} self.create_count += 1 cloud_datetime = None if 'last-modified' in properties: cloud_datetime = (properties['last-modified'] and datetime.datetime.strptime( properties['last-modified'], "%a, %d %b %Y %H:%M:%S %Z" ) or None) local_datetime = datetime.datetime.utcfromtimestamp( os.stat(file_path).st_mtime) if cloud_datetime and local_datetime < cloud_datetime: self.skip_count += 1 if self.verbosity > 1: print "Skipped %s: not modified." 
% object_name continue if not self.test_run: file_contents = open(file_path, 'r').read() content_type, encoding = mimetypes.guess_type(file_path) self.service.put_blob(self.STATIC_CONTAINER, object_name, file_contents, x_ms_blob_type='BlockBlob', x_ms_blob_content_type=content_type, content_encoding=encoding) # sync_headers(cloud_obj) self.upload_count += 1 if self.verbosity > 1: print "Uploaded", object_name def delete_files(self): # remove any objects in the container that don't exist locally for blob in self.service.list_blobs(self.STATIC_CONTAINER): if blob.name not in self.local_object_names: self.delete_count += 1 if self.verbosity > 1: print "Deleted %s" % blob.name if not self.test_run: self.service.delete_blob(self.STATIC_CONTAINER, blob.name)
class Azure(object):
    '''
    A class used to connect to the Azure storage and
    upload/download files using blob storage
    '''

    def __init__(self, params=None):
        '''
        Constructor for the Azure object

        @param params - optional dict that may carry "user" and "key"
        '''
        # BUG FIX: the original used a mutable default argument
        # (params={}); None-with-fallback is the safe equivalent.
        if params is None:
            params = {}
        # .get() returns None when the key is absent, matching the
        # original if/else assignment.
        self.user = params.get("user")
        self.key = params.get("key")

    def connect(self, host, port, user, password, secure):
        '''
        Connect to the Azure service with given user and key
        @param host - host base to append (port is accepted but unused here)
        @param port - unused, kept for interface compatibility
        @param user - username to use to connect to
        @param password - account key to use to connect
        @param secure - use https when truthy, http otherwise
        '''
        kwargs = {}
        err = None
        if host is not None:
            kwargs["host_base"] = "." + host
        if user is not None:
            kwargs["account_name"] = user
        elif self.user is not None:
            kwargs["account_name"] = self.user
        if password is not None:
            kwargs["account_key"] = password
        elif self.key is not None:
            kwargs["account_key"] = self.key
        kwargs["protocol"] = "https" if secure else "http"
        try:
            self.service = BlobService(**kwargs)
        except Exception as e:
            err = e.message
            self.service = None
        if self.service is None:
            raise OsakaException("Failed to connect to Azure:" +
                                 ("" if err is None else err))

    @classmethod
    def getSchemes(clazz):
        '''
        Returns a list of schemes this handler handles
        Note: handling the scheme of another handler produces
        unknown results
        @returns list of handled schemes
        '''
        return ["azure", "azures"]

    def close(self):
        '''
        Close this service
        '''
        pass

    def put(self, path, url):
        '''
        Put a file up to the cloud
        @param path - path to upload
        @param url - path in cloud to upload too
        '''
        if os.path.isdir(path):
            return walk(self.put, path, url)
        cont, blob = get_container_and_path(urlparse.urlparse(url).path)
        self.service.create_container(cont)
        self.service.put_block_blob_from_path(cont, blob, path)
        return True

    def get(self, url, dest):
        '''
        Get file(s) from the cloud
        @param url - url on cloud to pull down (on cloud)
        @param dest - dest to download too
        '''
        cont, blob = get_container_and_path(urlparse.urlparse(url).path)
        for b in self.service.list_blobs(cont, prefix=blob):
            destination = os.path.join(dest, os.path.relpath(
                b.name, blob)) if blob != b.name else dest
            dirpath = os.path.dirname(destination)
            # BUG FIX: os.mkdir fails for nested missing directories and
            # for an empty dirname; makedirs with a non-empty guard
            # handles both.
            if dirpath and not os.path.exists(dirpath):
                os.makedirs(dirpath)
            self.service.get_blob_to_path(cont, b.name, destination)
        return True

    def rm(self, url):
        '''
        Remove this url and all children urls
        @param url - url to remove
        '''
        cont, blob = get_container_and_path(urlparse.urlparse(url).path)
        for b in self.service.list_blobs(cont, prefix=blob):
            self.service.delete_blob(cont, b.name)
        return True