def copy_output(step_id: str, env: Env):
    """Copy a step's ``parallel_run_step.txt`` output to a timestamped blob.

    The source blob is
    ``azureml/<step_id>/<storage_name>_out/parallel_run_step.txt`` in the
    scoring output container.  The destination blob lives in the same
    container under ``<date>/<stem>_<time><ext>``, where date and time come
    from the source blob's ``last_modified`` property and stem/extension come
    from ``env.scoring_datastore_output_filename``.

    Args:
        step_id: Identifier used to build the source blob path.
        env: Settings object providing ``scoring_datastore_storage_name``,
            ``scoring_datastore_output_container``,
            ``scoring_datastore_access_key`` and
            ``scoring_datastore_output_filename``.
    """
    # Local import keeps this block self-contained.
    import os

    accounturl = "https://{}.blob.core.windows.net".format(
        env.scoring_datastore_storage_name)
    srcblobname = "azureml/{}/{}_out/parallel_run_step.txt".format(
        step_id, env.scoring_datastore_storage_name)
    srcbloburl = "{}/{}/{}".format(
        accounturl, env.scoring_datastore_output_container, srcblobname)

    containerclient = ContainerClient(
        accounturl,
        env.scoring_datastore_output_container,
        env.scoring_datastore_access_key,
    )
    last_modified = containerclient.get_blob_client(
        srcblobname).get_blob_properties().last_modified

    destfolder = last_modified.date().isoformat()
    # ":" and "." are replaced so the time can be embedded in a blob name.
    filetime = (last_modified.time().isoformat("milliseconds")
                .replace(":", "_").replace(".", "_"))

    # splitext handles names without an extension (previously an IndexError)
    # and names with several dots (previously the middle parts were dropped).
    stem, extension = os.path.splitext(env.scoring_datastore_output_filename)
    destblobname = "{}/{}_{}{}".format(destfolder, stem, filetime, extension)

    destblobclient = containerclient.get_blob_client(destblobname)
    destblobclient.start_copy_from_url(srcbloburl)
class ConsolidateServicoOperator(BaseOperator):
    """Download pickled frames from a container, consolidate and re-upload.

    ``execute`` only does work when at least one file reported by the
    previous task (via XCom) is also present in ``listfiles``.
    """

    @apply_defaults
    def __init__(self, account_name, account_key, container, listfiles,
                 dir_out, nm_out, extension, previous_task, *args, **kwargs):
        """Store the configuration and build the container client.

        Args:
            account_name: Storage account name used to build the account URL.
            account_key: Credential passed to ``ContainerClient``.
            container: Name of the blob container.
            listfiles: Blob names (``<folder>/<file>``) to download and read.
            dir_out: Destination folder (blob name prefix) of the result.
            nm_out: Base name of the result file.
            extension: Extension of the result file.
            previous_task: Task id whose XCom value lists the files it ran.
        """
        super(ConsolidateServicoOperator, self).__init__(*args, **kwargs)
        self.client = ContainerClient(
            account_url=f"https://{account_name}.blob.core.windows.net/",
            credential=account_key,
            container_name=container)
        self.listfiles = listfiles
        self.extension = extension
        self.dir_out = dir_out
        self.nm_out = nm_out
        self.previous_task = previous_task
        self.temp_dir = _TEMP_FILE

    def execute(self, context):
        """Consolidate the input files when any of them was just produced."""
        listrun = context['ti'].xcom_pull(task_ids=self.previous_task)
        # The XCom value is indexed as three lists that are concatenated.
        listrun = listrun[0] + listrun[1] + listrun[2]
        intersect = [file for file in listrun if file in self.listfiles]
        self.logger.info(f'list intersect {intersect}')
        if not intersect:
            return

        self._download_local()
        ldf = self._read_file()
        localfile = f'{self.temp_dir}/{self.nm_out}.{self.extension}'
        outfile = f'{self.dir_out}/{self.nm_out}.{self.extension}'
        processor.consolidateservico(ldf).to_pickle(localfile)
        try:
            with open(localfile, "rb") as data_to:
                self.client.get_blob_client(outfile).upload_blob(
                    data_to, overwrite=True)
        finally:
            # Remove the temporary file even when the upload fails.
            os.remove(localfile)

    def _download_local(self):
        """Download every blob in ``listfiles`` into the temp directory."""
        for nm_file in self.listfiles:
            # Blob names are expected to look like "<folder>/<file>".
            file = nm_file.split('/')[1]
            self.logger.info(f'run {nm_file}')
            with open(f'{self.temp_dir}/{file}', 'wb') as data_from:
                data_from.write(
                    self.client.get_blob_client(nm_file)
                    .download_blob().readall())
            self.logger.info(f'write local file {nm_file}')

    def _read_file(self):
        """Load each downloaded pickle, delete it, and return the frames."""
        ldf = []
        for nm_file in self.listfiles:
            file = f"{self.temp_dir}/{nm_file.split('/')[1]}"
            try:
                # NOTE(security): unpickling executes arbitrary code; this is
                # only safe while the container holds trusted blobs.
                df = pd.read_pickle(file)
            finally:
                # Do not leave the temp file behind when reading fails.
                os.remove(file)
            ldf.append(df)
        return ldf
def copy_output(args):
    """Upload the local ``parallel_run_step.txt`` to a timestamped blob.

    The file is read from ``args.output_path`` and uploaded as a block blob
    named ``<date>/<stem>_<time><ext>`` in ``args.score_container``, where
    stem/extension come from ``args.scoring_output_filename``.

    Args:
        args: Object providing ``output_path``, ``scoring_datastore``,
            ``score_container``, ``scoring_datastore_key`` and
            ``scoring_output_filename``.
    """
    print("Output : {}".format(args.output_path))

    accounturl = "https://{}.blob.core.windows.net".format(
        args.scoring_datastore
    )
    containerclient = ContainerClient(
        accounturl, args.score_container, args.scoring_datastore_key
    )

    # Take one UTC instant for both the folder and the time suffix.  The
    # folder previously used the local date while the time used UTC, so the
    # two could disagree around midnight.
    now = datetime.now(timezone.utc)
    destfolder = now.date().isoformat()
    filetime = (
        now.time()
        .isoformat("milliseconds")
        .replace(":", "_")
        .replace(".", "_")
    )

    # splitext handles names without an extension (previously an IndexError)
    # and names with several dots (previously the middle parts were dropped).
    stem, extension = os.path.splitext(args.scoring_output_filename)
    destblobname = "{}/{}_{}{}".format(destfolder, stem, filetime, extension)

    destblobclient = containerclient.get_blob_client(destblobname)
    with open(
        os.path.join(args.output_path, "parallel_run_step.txt"), "rb"
    ) as scorefile:
        destblobclient.upload_blob(scorefile, blob_type="BlockBlob")
def copy_output(step_id, env):
    """Copy a step's ``parallel_run_step.txt`` output to a timestamped blob.

    The destination blob is created in the same container as the source,
    under ``<date>/<stem>_<time><ext>``; date and time are taken from the
    source blob's ``last_modified`` property and stem/extension from
    ``env.scoring_datastore_output_filename``.

    Args:
        step_id: Identifier used to build the source blob path.
        env: Settings object providing ``scoring_datastore_storage_name``,
            ``scoring_datastore_output_container``,
            ``scoring_datastore_access_key`` and
            ``scoring_datastore_output_filename``.
    """
    # Local import keeps this block self-contained.
    import os

    account_url = f'https://{env.scoring_datastore_storage_name}.blob.core.windows.net'
    src_blob_name = (
        f'azureml/{step_id}/{env.scoring_datastore_storage_name}_out/'
        'parallel_run_step.txt'
    )
    src_blob_url = (
        f'{account_url}/{env.scoring_datastore_output_container}/{src_blob_name}'
    )

    container_client = ContainerClient(
        account_url=account_url,
        container_name=env.scoring_datastore_output_container,
        credential=env.scoring_datastore_access_key,
    )
    last_modified = container_client.get_blob_client(
        src_blob_name).get_blob_properties().last_modified

    destfolder = last_modified.date().isoformat()
    # ":" and "." are replaced so the time can be embedded in a blob name.
    file_time = (
        last_modified.time().isoformat('milliseconds')
        .replace(':', '_').replace('.', '_')
    )

    # splitext handles names without an extension (previously an IndexError)
    # and names with several dots (previously the middle parts were dropped).
    stem, extension = os.path.splitext(env.scoring_datastore_output_filename)
    dest_blob_name = f'{destfolder}/{stem}_{file_time}{extension}'

    dest_client = container_client.get_blob_client(dest_blob_name)
    dest_client.start_copy_from_url(src_blob_url)
def load_data():
    """Download every blob of the facts container below ``./data/raw_in``.

    Each blob is written to ``./data/raw_in/<blob name>``; missing parent
    directories are created first.  Progress is printed to stdout.

    Returns:
        Always ``0``.
    """
    banner = '````````````````````````````````````'
    container = ContainerClient(
        account_url="https://hecdf.blob.core.windows.net",
        container_name=facts_container,
        credential=facts_sas_token,
    )

    print(banner)
    print(' Begin loding data...')
    print(banner)

    entries = list(container.list_blobs())
    for entry in entries:
        name = entry.name
        print(name)
        stream = container.get_blob_client(name).download_blob()
        target = f'./data/raw_in/{name}'
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        with open(target, "wb") as handle:
            handle.write(stream.readall())

    print(banner)
    print(' Finished loading data!')
    print(banner)
    return 0
def upload_blob(
    container: ContainerClient,
    blob_name: str,
    content_type: str,
    content_encoding: str,
    data: Any,
    return_sas_token: bool = True,
) -> str:
    """Upload ``data`` to a blob and return a URI for it.

    ``upload_blob`` is called without ``overwrite``, so the storage client
    decides what happens when the blob already exists.

    Args:
        container: Client of the target container;
            ``create_container_using_client`` is called on it first.
        blob_name: Name of the blob to write.
        content_type: Content type stored in the blob's content settings.
        content_encoding: Content encoding stored in the content settings.
        data: Payload handed to ``BlobClient.upload_blob``.
        return_sas_token: When true, return a URI carrying a SAS token;
            otherwise return the blob URL with any SAS token removed.

    Returns:
        The URI of the uploaded blob.
    """
    create_container_using_client(container)
    # The original fragments were joined without spaces, producing
    # "...'name'to container 'c'on account...".
    logger.info(
        f"Uploading blob '{blob_name}' "
        f"to container '{container.container_name}' "
        f"on account: '{container.account_name}'"
    )

    content_settings = ContentSettings(
        content_type=content_type, content_encoding=content_encoding
    )
    blob = container.get_blob_client(blob_name)
    blob.upload_blob(data, content_settings=content_settings)
    logger.debug(f" - blob '{blob_name}' uploaded. generating sas token.")

    if return_sas_token:
        uri = get_blob_uri_with_sas_token(blob)
    else:
        uri = remove_sas_token(blob.url)
    logger.debug(f" - blob access url: '{uri}'.")
    return uri
def combine_azure(self):
    """Merge the most recent per-spider feeds into two combined blobs.

    Looks for blobs under today's feed prefix, falling back one day at a
    time up to three days.  The selected feeds are parsed as JSON lines,
    sorted by ``self.start_key`` and written to ``latest.json`` (all
    meetings) and ``upcoming.json`` (meetings whose start is later than 24
    hours ago).
    """
    from azure.storage.blob import ContainerClient, ContentSettings

    def as_json_lines(records):
        return "\n".join([json.dumps(record) for record in records])

    feed_uri = self.settings.get("FEED_URI")
    feed_prefix = self.settings.get("CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d")
    # The URI is split as <8-char scheme prefix><account>:<key>@<container>/...
    account_name, account_key = feed_uri[8:].split("@")[0].split(":")
    container = feed_uri.split("@")[1].split("/")[0]
    container_client = ContainerClient(
        "{}.blob.core.windows.net".format(account_name),
        container,
        credential=account_key,
    )

    max_days_previous = 3
    prefix_blobs = []
    for days_back in range(max_days_previous + 1):
        day_prefix = (datetime.now() - timedelta(days=days_back)).strftime(
            feed_prefix
        )
        prefix_blobs = list(container_client.list_blobs(name_starts_with=day_prefix))
        if prefix_blobs:
            break

    spider_blob_names = self.get_spider_paths([blob.name for blob in prefix_blobs])

    meetings = []
    for blob_name in spider_blob_names:
        feed_text = (
            container_client.get_blob_client(blob_name)
            .download_blob()
            .content_as_text()
        )
        for line in feed_text.split("\n"):
            if line:
                meetings.append(json.loads(line))
    meetings.sort(key=itemgetter(self.start_key))

    # Timestamps are compared as strings, truncated to whole seconds.
    yesterday_iso = (datetime.now() - timedelta(days=1)).isoformat()[:19]
    upcoming = [
        meeting for meeting in meetings
        if meeting[self.start_key][:19] > yesterday_iso
    ]

    container_client.upload_blob(
        "latest.json",
        as_json_lines(meetings),
        content_settings=ContentSettings(cache_control="no-cache"),
        overwrite=True,
    )
    container_client.upload_blob(
        "upcoming.json",
        as_json_lines(upcoming),
        content_settings=ContentSettings(cache_control="no-cache"),
        overwrite=True,
    )
def __init__(
    self,
    container: ContainerClient,
    blob_name: str,
    content_type: str,
    content_encoding: str,
):
    """Prepare the state needed to write ``blob_name`` block by block.

    Args:
        container: Client of the container that holds the blob.
        blob_name: Name of the blob to write.
        content_type: Content type stored in the content settings.
        content_encoding: Content encoding stored in the content settings.
    """
    self.container = container
    self.blob_name = blob_name
    settings = ContentSettings(
        content_type=content_type,
        content_encoding=content_encoding,
    )
    self.content_settings = settings
    # No blocks exist yet, so the state starts as "not initialized".
    self.state = StreamedBlobState.not_initialized
    self.blob = container.get_blob_client(blob_name)
    self.blocks = []
def from_blobs_to_excel(
    blobs: Iterator[BlobProperties], container_client: ContainerClient
) -> Dict[str, Sheet]:
    """Download every ``.xlsx`` blob and convert it into a sheet.

    Args:
        blobs: Blob descriptions to inspect; anything whose name does not
            end in ``.xlsx`` is ignored.
        container_client: Client used to download the matching blobs.

    Returns:
        A mapping from the sanitized blob stem to the converted sheet.
    """
    converted = {}
    for entry in blobs:
        entry_path = Path(entry.name)
        if entry_path.suffix == ".xlsx":
            content = (
                container_client.get_blob_client(entry).download_blob().readall()
            )
            key = sanitize_row_key(entry_path.stem)
            converted[key] = excel_raw_file_to_sheet(content)
    return converted
class DeleteFilesAzure2AzureOperator(BaseOperator):
    """Operator that deletes every blob found in one container."""

    @apply_defaults
    def __init__(self, account_name, account_key, container, *args, **kwargs):
        """Build the container client from the account name, key and container."""
        super(DeleteFilesAzure2AzureOperator, self).__init__(*args, **kwargs)
        account_url = f"https://{account_name}.blob.core.windows.net/"
        self.client = ContainerClient(
            account_url=account_url,
            credential=account_key,
            container_name=container,
        )

    def execute(self, context):
        """Delete the blobs of the container one by one."""
        for entry in self.client.list_blobs():
            self.client.get_blob_client(entry).delete_blob()
def upload_blob(container_client: ContainerClient,
                filename: str,
                azure_filename: str,
                tier: StandardBlobTier,
                update=False,
                overwrite=False,
                retries=0,
                debug=False
                ) -> dict:
    """Upload a local file as a blob, tagging its MD5 in the blob metadata.

    With ``update`` set, the blob's properties are fetched and the ``md5``
    metadata entry is compared with the local file's MD5.  On a mismatch the
    blob is deleted and re-sent only when ``overwrite`` is also set; on a
    match nothing is sent.  Without ``update`` the file is sent directly.

    Args:
        container_client: Client of the target container.
        filename: Path of the local file.
        azure_filename: Name of the blob.
        tier: Blob tier passed as ``standard_blob_tier``.
        update: Compare against the existing blob before sending.
        overwrite: Allow replacing a blob whose MD5 differs.
        retries: Unused by this function.
        debug: Unused by this function.

    Returns:
        The result of ``BlobClient.upload_blob`` when a file was sent,
        otherwise ``{'operation': 'no-op'}``.
    """
    file_md5 = get_md5sum(filename)
    operation = {'operation': 'no-op'}  # Default return
    blob_client = container_client.get_blob_client(azure_filename)

    def _send() -> dict:
        """Send the local file and log the request id of the upload."""
        with open(filename, 'rb') as data:
            result = blob_client.upload_blob(
                data,
                standard_blob_tier=tier,
                metadata={'md5': file_md5}
            )
        # Logged only after a real upload: the no-op default has no
        # 'request_id' key, so logging it unconditionally raised KeyError.
        log.info(
            f"Uploaded: {filename}, request_id: {result.get('request_id')}"
        )
        return result

    if not update:
        log.info(f'{filename} not found in container, sending local file.')
        return _send()

    blob_properties = blob_client.get_blob_properties()
    try:
        blob_md5 = blob_properties['metadata']['md5']
    except KeyError:
        # Blob was uploaded without an md5 tag; treat it as a mismatch.
        blob_md5 = ''
    log.info(f"Already in container. {azure_filename} cloud md5: {blob_md5}, {filename} local md5: {file_md5}")

    if file_md5 == blob_md5:
        log.info('MD5Sums Match - no-op')
    elif overwrite:
        log.info(f'MD5sum Mismatch - Sending local copy of {filename}')
        blob_client.delete_blob()
        operation = _send()
    else:
        log.info(f'MD5Sum Mismatch - Set not to overwrite. Will not send {filename}')
    return operation
def name_changer(
    container_client: ContainerClient,
    old_name: str,
    new_name: str,
    remove_old_blobs: bool,
) -> None:
    """Copy a blob to a new name and optionally delete the old one.

    The content of ``old_name`` is downloaded and uploaded as ``new_name``,
    overwriting any blob already stored under that name.

    Args:
        container_client: Client of the container holding the blob.
        old_name: Current blob name.
        new_name: Blob name to write the content to.
        remove_old_blobs: Delete ``old_name`` after the copy when true.
    """
    logger.info(f"Start changing blobname from {old_name} to {new_name}.")

    source = container_client.get_blob_client(blob=old_name)
    payload = source.download_blob().readall()

    # Write the downloaded content under the new name.
    target = container_client.get_blob_client(blob=new_name)
    target.upload_blob(payload, overwrite=True)

    # Drop the original only when the caller asked for it.
    if remove_old_blobs:
        source.delete_blob()

    logger.info(f"Succesfully changed blobname from {old_name} to {new_name}.")
def init_blob_for_streaming_upload(
    container: ContainerClient,
    blob_name: str,
    content_type: str,
    content_encoding: str,
    data: Any,
    return_sas_token: bool = True,
) -> str:
    """Upload ``data`` to a blob and return a URI for it.

    ``upload_blob`` is called without ``overwrite``, so the storage client
    decides what happens when the blob already exists.

    Args:
        container: Client of the target container;
            ``create_container_using_client`` is called on it first.
        blob_name: Name of the blob to write.
        content_type: Content type stored in the blob's content settings.
        content_encoding: Content encoding stored in the content settings.
        data: Payload handed to ``BlobClient.upload_blob``.
        return_sas_token: When true, append a read-only SAS token that
            expires 14 days from now; otherwise return the blob URL with
            any SAS token removed.

    Returns:
        The URI of the uploaded blob.
    """
    create_container_using_client(container)
    # The original fragments were joined without spaces between them.
    logger.info(
        f"Streaming blob '{blob_name}' "
        f"to container '{container.container_name}' on account: "
        f"'{container.account_name}'"
    )

    content_settings = ContentSettings(
        content_type=content_type, content_encoding=content_encoding
    )
    blob = container.get_blob_client(blob_name)
    # The former argument-less stage_block()/commit_block_list() calls were
    # removed: both methods have required positional parameters, so the
    # function raised TypeError before uploading anything.
    blob.upload_blob(data, content_settings=content_settings)
    logger.debug(f" - blob '{blob_name}' uploaded. generating sas token.")

    if return_sas_token:
        sas_token = generate_blob_sas(
            blob.account_name,
            blob.container_name,
            blob.blob_name,
            account_key=blob.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=datetime.utcnow() + timedelta(days=14),
        )
        uri = blob.url + "?" + sas_token
    else:
        uri = remove_sas_token(blob.url)
    logger.debug(f" - blob access url: '{uri}'.")
    return uri
def append_blob(
    container: ContainerClient,
    blob_name: str,
    content_type: str,
    content_encoding: str,
    data: Any,
    return_sas_token: bool = True,
    metadata: Dict[str, str] = None,
) -> str:
    """Append ``data`` to an append blob, creating the blob when missing.

    Args:
        container: Client of the target container;
            ``create_container_using_client`` is called on it first.
        blob_name: Name of the append blob.
        content_type: Content type used when the blob has to be created.
        content_encoding: Content encoding used when the blob is created.
        data: Block to append; ``len(data)`` is passed as its length.
        return_sas_token: When true, return a URI carrying a SAS token;
            otherwise return the blob URL with any SAS token removed.
        metadata: Metadata used when the blob has to be created.

    Returns:
        The URI of the blob.

    Raises:
        Exception: If the blob exists but is not an append blob.
    """
    create_container_using_client(container)
    # The original fragments were joined without spaces between them.
    logger.info(
        f"Appending data to blob '{blob_name}' "
        f"in container '{container.container_name}' "
        f"on account: '{container.account_name}'"
    )

    content_settings = ContentSettings(
        content_type=content_type, content_encoding=content_encoding
    )
    blob = container.get_blob_client(blob_name)
    try:
        props = blob.get_blob_properties()
        if props.blob_type != BlobType.AppendBlob:
            raise Exception("blob must be an append blob")
    except exceptions.ResourceNotFoundError:
        # First write to this name: create the append blob.
        blob.create_append_blob(
            content_settings=content_settings, metadata=metadata
        )

    blob.append_block(data, len(data))
    logger.debug(f" - blob '{blob_name}' appended. generating sas token.")

    if return_sas_token:
        uri = get_blob_uri_with_sas_token(blob)
    else:
        uri = remove_sas_token(blob.url)
    logger.debug(f" - blob access url: '{uri}'.")
    return uri
class AzureDiffPipeline(DiffPipeline):
    """Azure Blob Storage backend for comparing previously scraped JSCalendar
    outputs"""

    def __init__(self, crawler, output_format):
        """Build the container client from the crawler's ``FEED_URI``."""
        from azure.storage.blob import ContainerClient

        feed_uri = crawler.settings.get("FEED_URI")
        # The URI is split as <8-char scheme prefix><account>:<key>@<container>/...
        credentials = feed_uri[8:].split("@")[0]
        account_name, account_key = credentials.split(":")
        self.spider = crawler.spider
        self.container = feed_uri.split("@")[1].split("/")[0]
        self.container_client = ContainerClient(
            "{}.blob.core.windows.net".format(account_name),
            self.container,
            credential=account_key,
        )
        self.feed_prefix = crawler.settings.get(
            "CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d"
        )
        super().__init__(crawler, output_format)

    def load_previous_results(self):
        """Return the parsed lines of this spider's most recent feed blob.

        Searches today's feed prefix first, then up to three earlier days,
        and picks the blob with the greatest name among the matches.

        Returns:
            The decoded JSON lines of that blob, or ``[]`` when no blob of
            this spider was found.
        """
        max_days_previous = 3
        tz = timezone(self.spider.timezone)
        name_marker = "{}.".format(self.spider.name)

        for days_back in range(max_days_previous + 1):
            day = tz.localize(datetime.now()) - timedelta(days=days_back)
            candidates = self.container_client.list_blobs(
                name_starts_with=day.strftime(self.feed_prefix)
            )
            spider_blobs = [
                candidate for candidate in candidates
                if name_marker in candidate.name
            ]
            if spider_blobs:
                break

        if not spider_blobs:
            return []

        latest = sorted(spider_blobs, key=attrgetter("name"))[-1]
        feed_text = (
            self.container_client.get_blob_client(latest.name)
            .download_blob()
            .content_as_text()
        )
        records = []
        for line in feed_text.split("\n"):
            if line.strip():
                records.append(json.loads(line))
        return records
def upload_data(filename: str, data: dict, cc: ContainerClient) -> None:
    """Publish the software maturity level report to Azure blob storage.

    The data is serialized to YAML and written to ``filename``, replacing
    any blob already stored under that name.

    Args:
        filename: Full path of uploaded file.
        data: Data to publish to Azure.
        cc: Azure container client.

    Raises:
        azure.core.exceptions.ClientAuthenticationError: Invalid access keys.
    """
    serialized = yaml.safe_dump(data)
    blob_client = cc.get_blob_client(filename)

    logger.info("Uploading software maturity level information...")
    blob_client.upload_blob(serialized, overwrite=True)
    logger.info("Upload complete")
def test_container_client_api_version_property(self):
    """Check the API version on container clients and derived blob clients.

    A client built without ``api_version`` must report ``api_version_2``;
    one built with ``api_version_1`` must report it, and so must a blob
    client obtained from it.
    """
    account_url = "https://foo.blob.core.windows.net/account"

    default_client = ContainerClient(
        account_url, self.container_name, credential="fake_key"
    )
    self.assertEqual(default_client.api_version, self.api_version_2)
    self.assertEqual(
        default_client._client._config.version, self.api_version_2
    )

    pinned_client = ContainerClient(
        account_url,
        self.container_name,
        credential="fake_key",
        api_version=self.api_version_1,
    )
    self.assertEqual(pinned_client.api_version, self.api_version_1)
    self.assertEqual(
        pinned_client._client._config.version, self.api_version_1
    )

    derived_blob = pinned_client.get_blob_client("foo")
    self.assertEqual(derived_blob.api_version, self.api_version_1)
    self.assertEqual(
        derived_blob._client._config.version, self.api_version_1
    )
def startProcessing():
    """Download, convert and delete every non-empty blob of the container.

    Each blob larger than 508 bytes is written to the current working
    directory (``/`` in the blob name becomes ``_``), handed to
    ``processBlob2``, removed locally and then deleted from the container.
    A failure inside ``processBlob2`` is reported and does not stop the run.
    """
    print('Processor started using path: ' + os.getcwd())

    # Create a blob container client.
    container = ContainerClient.from_connection_string(
        STORAGE_CONNECTION_STRING, container_name=STORAGE_CONTAINER_NAME)

    # List all the blobs in the container.
    for blob in container.list_blobs():
        # A size of 508 is an empty file, so only larger blobs are processed.
        if blob.size > 508:
            print('Downloaded a non empty blob: ' + blob.name)
            blob_client = container.get_blob_client(blob=blob.name)

            # Construct a local file name based on the blob name.
            # os.path.join replaces the hard-coded '\\' separator, which
            # only produced a file inside the working directory on Windows.
            cleanName = os.path.join(os.getcwd(), blob.name.replace('/', '_'))

            # Write blob contents into the file, creating it if needed.
            with open(cleanName, "wb") as my_file:
                my_file.write(blob_client.download_blob().readall())

            try:
                processBlob2(cleanName)  # Convert the file into a CSV file.
            except Exception as exc:
                # Was a bare `except:` printing 'test': it also swallowed
                # KeyboardInterrupt/SystemExit and hid the actual error.
                print('Failed to process {}: {}'.format(cleanName, exc))
            finally:
                os.remove(cleanName)  # Remove the original downloaded file.

            # Delete the blob from the container after it's read.
            container.delete_blob(blob.name)
class BlobStorageUploader(Uploader):
    """Uploader that stores clips as blobs in an Azure storage container."""

    def __init__(self, cfg: Configuration):
        """Build the container client from the blob storage configuration.

        The client is created with ``retry_total=1`` and
        ``connection_timeout=5``.
        """
        super().__init__(cfg)
        blob_cfg = cfg[BLOB_STORAGE_CONFIG_KEY]
        account_name = blob_cfg["accountName"]
        account_key = blob_cfg["accountKey"]
        container_name = blob_cfg["containerName"]
        self.__container_client = ContainerClient(
            f"https://{account_name}.blob.core.windows.net/",
            container_name,
            account_key,
            retry_total=1,
            connection_timeout=5)

    def upload(self, clip: Clip) -> bool:
        """Upload ``clip`` unless a blob with its name already exists.

        Clips with an event are stored under ``<year>/<month>/<day>``, all
        others under ``recent``.

        Returns:
            ``True`` when the blob already exists or was uploaded, ``False``
            when the service could not be reached or the upload failed.
        """
        if clip.event is not None:
            folder = f"{clip.date.year}/{clip.date.month}/{clip.date.day}"
        else:
            folder = "recent"
        blob_name = f"{folder}/{clip.name}"
        blob = self.__container_client.get_blob_client(blob_name)
        try:
            # Fetching the properties doubles as an existence check.
            blob.get_blob_properties()
            return True
        except ResourceNotFoundError:
            return self.__perform_upload(clip, blob)
        except ServiceRequestError:
            return False

    def __perform_upload(self, clip: Clip, blob: BlobClient) -> bool:
        """Send the clip file to ``blob``; return whether that succeeded."""
        try:
            with open(clip.path, "rb") as data:
                blob.upload_blob(data)
            return True
        except Exception:
            # Best effort: any failure is reported as False.  Narrowed from
            # a bare `except:` so KeyboardInterrupt/SystemExit propagate.
            return False
class TartanAir(object):
    """Block that publishes one TartanAir trajectory read from blob storage.

    ``on_start`` selects an environment, difficulty and trajectory from the
    block properties and lists the blobs of each modality; ``run`` then
    publishes one frame of every modality at the configured rate.
    """

    def on_start(self):
        """This function is called once the Block is started.

        Creates the container client, resolves the trajectory folder from
        the ``diff_level``, ``env_ind`` and ``traj_id`` properties and loads
        the blob lists and both pose tables.
        """
        account_url = 'https://tartanair.blob.core.windows.net/'
        container_name = 'tartanair-release1'
        self.container_client = ContainerClient(account_url=account_url,
                                                container_name=container_name,
                                                credential=None)
        self.envlist = [
            'abandonedfactory/', 'abandonedfactory_night/', 'amusement/',
            'carwelding/', 'endofworld/', 'gascola/', 'hospital/',
            'japanesealley/', 'neighborhood/', 'ocean/', 'office/',
            'office2/', 'oldtown/', 'seasidetown/', 'seasonsforest/',
            'seasonsforest_winter/', 'soulcity/', 'westerndesert/'
        ]
        self.diff_level = ["Easy", "Hard"][int(self.get_property("diff_level"))]
        self.env_ind = self.get_property("env_ind")
        self.trajlist = self.get_trajectory_list(self.envlist[self.env_ind],
                                                 easy_hard=self.diff_level)
        self.trajs_len = len(self.trajlist)
        self.traj_id = self.get_property("traj_id")

        self.alert(
            "Selected Environment: {}".format(self.envlist[self.env_ind]),
            "INFO")
        self.alert("Difficulty Level: {}".format(self.diff_level), "INFO")
        self.alert(
            "Number of available trajectories: {}".format(self.trajs_len),
            "INFO")
        if self.traj_id >= self.trajs_len:
            self.alert(
                "Trajectory id out of range[0, {}]".format(self.trajs_len - 1),
                "ERROR")

        self.frequency = self.get_property("fps")
        self.traj_dir = self.trajlist[self.traj_id]

        # Load Images List
        self.left_img_list = self.get_image_list(self.traj_dir,
                                                 left_right='left')
        print('Find {} left images in {}'.format(len(self.left_img_list),
                                                 self.traj_dir))
        self.right_img_list = self.get_image_list(self.traj_dir,
                                                  left_right='right')
        self.left_depth_list = self.get_depth_list(self.traj_dir,
                                                   left_right='left')
        self.right_depth_list = self.get_depth_list(self.traj_dir,
                                                    left_right='right')
        self.left_seg_list = self.get_seg_list(self.traj_dir,
                                               left_right='left')
        # Was left_right='left': the right segmentation topic re-published
        # the left camera's segmentation.
        self.right_seg_list = self.get_seg_list(self.traj_dir,
                                                left_right='right')
        self.flow_list = self.get_flow_list(self.traj_dir)
        self.flow_mask_list = self.get_flow_mask_list(self.traj_dir)
        self.left_pose_file = self.get_posefile(self.traj_dir,
                                                left_right='left')
        self.right_pose_file = self.get_posefile(self.traj_dir,
                                                 left_right='right')

        # Load poses
        self.pose_l = self._load_pose(self.left_pose_file, "OutputL.txt")
        self.pose_r = self._load_pose(self.right_pose_file, "OutputR.txt")

    def _load_pose(self, pose_file, local_name):
        """Download a pose blob to ``local_name`` and parse it with numpy."""
        bc = self.container_client.get_blob_client(blob=pose_file)
        data = bc.download_blob()
        # Context manager closes the file even if the write fails.
        with open(local_name, "w") as text_file:
            text_file.write(data.content_as_text())
        return np.loadtxt(local_name)

    def _publish_image(self, topic, image, header):
        """Publish ``image`` on ``topic`` using the topic name as frame id."""
        header.frame_id = topic
        self.publish(topic, from_ndarray(image, header))

    def _publish_pose(self, topic, frame_id, header, row):
        """Publish one pose row on ``topic``.

        Columns 0-2 of ``row`` are assigned to position x/y/z and columns
        3-6 to orientation x/y/z/w.
        """
        pose_stamped = PoseStamped()
        pose_stamped.header = header
        pose_stamped.header.frame_id = frame_id
        pose = Pose()
        pose.position.x = row[0]
        pose.position.y = row[1]
        pose.position.z = row[2]
        pose.orientation.x = row[3]
        pose.orientation.y = row[4]
        pose.orientation.z = row[5]
        pose.orientation.w = row[6]
        pose_stamped.pose = pose
        self.publish(topic, pose_stamped)

    def run(self):
        """Publish the frames of the trajectory forever, looping at the end.

        One frame is published whenever at least ``1 / self.frequency``
        seconds have passed since the previous one.
        """
        ltime = time.time()
        idx = 0
        while True:
            if time.time() - ltime >= 1 / self.frequency:
                if idx == len(self.left_img_list):
                    idx = 0  # wrap around to the first frame

                # RGB Images
                left_img = self.read_image_file(self.left_img_list[idx])
                right_img = self.read_image_file(self.right_img_list[idx])
                header = Header()
                set_timestamp(header, time.time())
                self._publish_image("left_img", left_img, header)
                self._publish_image("right_img", right_img, header)

                # Depth Images
                left_depth = self.read_numpy_file(self.left_depth_list[idx])
                self._publish_image("left_depth", depth2vis(left_depth),
                                    header)
                right_depth = self.read_numpy_file(self.right_depth_list[idx])
                self._publish_image("right_depth", depth2vis(right_depth),
                                    header)

                # Semantic Segmentation
                left_seg = self.read_numpy_file(self.left_seg_list[idx])
                self._publish_image("left_segmentation", seg2vis(left_seg),
                                    header)
                right_seg = self.read_numpy_file(self.right_seg_list[idx])
                self._publish_image("right_segmentation", seg2vis(right_seg),
                                    header)

                # Camera poses
                self._publish_pose("left_pose", "left_camera", header,
                                   self.pose_l[idx])
                self._publish_pose("right_pose", "right_camera", header,
                                   self.pose_r[idx])

                # Flow entries are read at idx - 1, so frame 0 has none.
                if idx > 0:
                    flow = self.read_numpy_file(self.flow_list[idx - 1])
                    self._publish_image("optical_flow", flow2vis(flow),
                                        header)
                    flow_mask = self.read_numpy_file(
                        self.flow_mask_list[idx - 1])
                    self._publish_image("optical_flow_mask",
                                        flow2vis(flow, mask=flow_mask),
                                        header)

                ltime = time.time()
                idx += 1

    def on_properties_changed(self, affected_properties):
        """Reload everything by running ``on_start`` again."""
        self.on_start()

    def get_environment_list(self):
        '''
        List all the environments shown in the root directory
        '''
        return [env.name for env in self.container_client.walk_blobs()]

    def get_trajectory_list(self, envname, easy_hard='Easy'):
        '''
        List all the trajectory folders, which is named as 'P0XX'
        '''
        assert (easy_hard == 'Easy' or easy_hard == 'Hard')
        traj_gen = self.container_client.walk_blobs(
            name_starts_with=envname + '/' + easy_hard + '/')
        trajlist = []
        for traj in traj_gen:
            trajname = traj.name
            # Drop empty components left by leading/trailing/double slashes.
            parts = [tt for tt in trajname.split('/') if len(tt) > 0]
            if parts and parts[-1].startswith('P'):
                trajlist.append(trajname)
        return trajlist

    def _list_blobs_in_folder(self, folder_name):
        """
        List all blobs in a virtual folder in an Azure blob container
        """
        generator = self.container_client.list_blobs(
            name_starts_with=folder_name)
        return [blob.name for blob in generator]

    def get_image_list(self, trajdir, left_right='left'):
        """Return the ``.png`` blob names of one camera's image folder."""
        assert (left_right == 'left' or left_right == 'right')
        files = self._list_blobs_in_folder(trajdir + '/image_' + left_right +
                                           '/')
        return [fn for fn in files if fn.endswith('.png')]

    def get_depth_list(self, trajdir, left_right='left'):
        """Return the ``.npy`` blob names of one camera's depth folder."""
        assert (left_right == 'left' or left_right == 'right')
        files = self._list_blobs_in_folder(trajdir + '/depth_' + left_right +
                                           '/')
        return [fn for fn in files if fn.endswith('.npy')]

    def get_flow_list(
        self,
        trajdir,
    ):
        """Return the blob names in the flow folder ending in ``flow.npy``."""
        files = self._list_blobs_in_folder(trajdir + '/flow/')
        return [fn for fn in files if fn.endswith('flow.npy')]

    def get_flow_mask_list(
        self,
        trajdir,
    ):
        """Return the blob names in the flow folder ending in ``mask.npy``."""
        files = self._list_blobs_in_folder(trajdir + '/flow/')
        return [fn for fn in files if fn.endswith('mask.npy')]

    def get_posefile(self, trajdir, left_right='left'):
        """Return the blob name of one camera's pose file."""
        assert (left_right == 'left' or left_right == 'right')
        return trajdir + '/pose_' + left_right + '.txt'

    def get_seg_list(self, trajdir, left_right='left'):
        """Return the ``.npy`` blob names of one camera's seg folder."""
        assert (left_right == 'left' or left_right == 'right')
        files = self._list_blobs_in_folder(trajdir + '/seg_' + left_right +
                                           '/')
        return [fn for fn in files if fn.endswith('.npy')]

    def read_numpy_file(
        self,
        numpy_file,
    ):
        '''
        return a numpy array given the file path
        '''
        bc = self.container_client.get_blob_client(blob=numpy_file)
        data = bc.download_blob()
        return np.load(io.BytesIO(data.content_as_bytes()))

    def read_image_file(
        self,
        image_file,
    ):
        '''
        return a uint8 numpy array given the file path
        '''
        bc = self.container_client.get_blob_client(blob=image_file)
        data = bc.download_blob()
        ee = io.BytesIO(data.content_as_bytes())
        # The decoded image is returned as produced by cv2.IMREAD_COLOR;
        # no channel reordering is applied here.
        img = cv2.imdecode(np.asarray(bytearray(ee.read()), dtype=np.uint8),
                           cv2.IMREAD_COLOR)
        return img
class AzureCloudInterface(CloudInterface):
    """
    Cloud interface backed by an Azure Blob Storage container, accessed
    through ``azure.storage.blob.ContainerClient``.
    """

    # Azure block blob limitations
    # https://docs.microsoft.com/en-us/rest/api/storageservices/understanding-block-blobs--append-blobs--and-page-blobs
    MAX_CHUNKS_PER_FILE = 50000
    # Minimum block size allowed in Azure Blob Storage is 64KB
    MIN_CHUNK_SIZE = 64 << 10

    # Azure Blob Storage permit a maximum of 4.75TB per file
    # This is a hard limit, while our upload procedure can go over the specified
    # MAX_ARCHIVE_SIZE - so we set a maximum of 1TB per file
    MAX_ARCHIVE_SIZE = 1 << 40

    # The size of each chunk in a single object upload when the size of the
    # object exceeds max_single_put_size. We default to 2MB in order to
    # allow the default max_concurrency of 8 to be achieved when uploading
    # uncompressed WAL segments of the default 16MB size.
    DEFAULT_MAX_BLOCK_SIZE = 2 << 20

    # The maximum amount of concurrent chunks allowed in a single object upload
    # where the size exceeds max_single_put_size. We default to 8 based on
    # experiments with in-region and inter-region transfers within Azure.
    DEFAULT_MAX_CONCURRENCY = 8

    # The largest file size which will be uploaded in a single PUT request. This
    # should be lower than the size of the compressed WAL segment in order to
    # force the Azure client to use concurrent chunk upload for archiving WAL files.
    DEFAULT_MAX_SINGLE_PUT_SIZE = 4 << 20

    # The maximum size of the requests connection pool used by the Azure client
    # to upload objects.
    REQUESTS_POOL_MAXSIZE = 32

    def __init__(
        self,
        url,
        jobs=2,
        encryption_scope=None,
        credential=None,
        tags=None,
        max_block_size=DEFAULT_MAX_BLOCK_SIZE,
        max_concurrency=DEFAULT_MAX_CONCURRENCY,
        max_single_put_size=DEFAULT_MAX_SINGLE_PUT_SIZE,
    ):
        """
        Create a new Azure Blob Storage interface given the supplied account url

        :param str url: Full URL of the cloud destination/source
        :param int jobs: How many sub-processes to use for asynchronous
          uploading, defaults to 2.
        :param str|None encryption_scope: If set, passed as the
          ``encryption_scope`` argument of every upload call
        :param credential: Optional credential handed to the ContainerClient;
          when supplied it takes precedence over the environment
        :param tags: Forwarded to the parent class constructor
        :param int max_block_size: Forwarded to the ContainerClient as
          ``max_block_size``
        :param int max_concurrency: Passed as ``max_concurrency`` when
          uploading a file-like object
        :param int max_single_put_size: Forwarded to the ContainerClient as
          ``max_single_put_size``
        :raises ValueError: If the URL has no container component, or if an
          emulated storage URL is used without a connection string
        """
        super(AzureCloudInterface, self).__init__(
            url=url,
            jobs=jobs,
            tags=tags,
        )
        self.encryption_scope = encryption_scope
        self.credential = credential
        self.max_block_size = max_block_size
        self.max_concurrency = max_concurrency
        self.max_single_put_size = max_single_put_size

        parsed_url = urlparse(url)
        if parsed_url.netloc.endswith(AZURE_BLOB_STORAGE_DOMAIN):
            # We have an Azure Storage URI so we use the following form:
            # <http|https>://<account-name>.<service-name>.core.windows.net/<resource-path>
            # where <resource-path> is <container>/<blob>.
            # Note that although Azure supports an implicit root container, we require
            # that the container is always included.
            self.account_url = parsed_url.netloc
            try:
                self.bucket_name = parsed_url.path.split("/")[1]
            except IndexError:
                raise ValueError("azure blob storage URL %s is malformed" % url)
            path = parsed_url.path.split("/")[2:]
        else:
            # We are dealing with emulated storage so we use the following form:
            # http://<local-machine-address>:<port>/<account-name>/<resource-path>
            logging.info("Using emulated storage URL: %s ", url)
            if "AZURE_STORAGE_CONNECTION_STRING" not in os.environ:
                raise ValueError(
                    "A connection string must be provided when using emulated storage"
                )
            # NOTE(review): account_url is not set on this path. That is fine
            # while _reinit_session takes the connection-string branch, but an
            # explicit `credential` would reach ContainerClient(...) without
            # an account_url attribute - confirm whether that combination is
            # meant to be supported.
            try:
                self.bucket_name = parsed_url.path.split("/")[2]
            except IndexError:
                raise ValueError("emulated storage URL %s is malformed" % url)
            path = parsed_url.path.split("/")[3:]

        self.path = "/".join(path)

        self.bucket_exists = None
        self._reinit_session()

    def _reinit_session(self):
        """
        Create a new session

        The credential is chosen in this order: the credential supplied to
        the constructor, the AZURE_STORAGE_CONNECTION_STRING environment
        variable, AZURE_STORAGE_SAS_TOKEN, AZURE_STORAGE_KEY and finally
        ``DefaultAzureCredential`` from azure-identity.

        :raises SystemExit: If default credentials are required but the
          azure-identity module cannot be imported
        """
        if self.credential:
            # Any supplied credential takes precedence over the environment
            credential = self.credential
        elif "AZURE_STORAGE_CONNECTION_STRING" in os.environ:
            logging.info("Authenticating to Azure with connection string")
            self.container_client = ContainerClient.from_connection_string(
                conn_str=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
                container_name=self.bucket_name,
            )
            return
        else:
            if "AZURE_STORAGE_SAS_TOKEN" in os.environ:
                logging.info("Authenticating to Azure with SAS token")
                credential = os.getenv("AZURE_STORAGE_SAS_TOKEN")
            elif "AZURE_STORAGE_KEY" in os.environ:
                logging.info("Authenticating to Azure with shared key")
                credential = os.getenv("AZURE_STORAGE_KEY")
            else:
                logging.info("Authenticating to Azure with default credentials")
                # azure-identity is not part of azure-storage-blob so only import
                # it if needed
                try:
                    from azure.identity import DefaultAzureCredential
                except ImportError:
                    raise SystemExit("Missing required python module: azure-identity")
                credential = DefaultAzureCredential()
        # Use a dedicated requests session so the connection pool can be
        # sized for concurrent uploads
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(
            pool_maxsize=self.REQUESTS_POOL_MAXSIZE)
        session.mount("https://", adapter)
        self.container_client = ContainerClient(
            account_url=self.account_url,
            container_name=self.bucket_name,
            credential=credential,
            max_single_put_size=self.max_single_put_size,
            max_block_size=self.max_block_size,
            session=session,
        )

    @property
    def _extra_upload_args(self):
        """
        Optional keyword arguments added to every upload call.

        :return: A new dict holding ``encryption_scope`` when one is set
        :rtype: dict
        """
        optional_args = {}
        if self.encryption_scope:
            optional_args["encryption_scope"] = self.encryption_scope
        return optional_args

    def test_connectivity(self):
        """
        Test Azure connectivity by trying to access a container

        :return: True if the service answered, False on a request error
        :rtype: bool
        """
        try:
            # We are not even interested in the existence of the bucket,
            # we just want to see if Azure blob service is reachable.
            self.bucket_exists = self._check_bucket_existence()
            return True
        except (HttpResponseError, ServiceRequestError) as exc:
            logging.error("Can't connect to cloud provider: %s", exc)
            return False

    def _check_bucket_existence(self):
        """
        Check Azure Blob Storage for the target container

        Although there is an `exists` function it cannot be called by
        container-level shared access tokens. We therefore check for existence
        by calling list_blobs on the container.

        :return: True if the container exists, False otherwise
        :rtype: bool
        """
        try:
            # Fetching the first item is enough to trigger the request
            next(iter(self.container_client.list_blobs()))
        except ResourceNotFoundError:
            return False
        except StopIteration:
            # The bucket is empty but it does exist
            pass
        return True

    def _create_bucket(self):
        """
        Create the container in cloud storage
        """
        # By default public access is disabled for newly created containers.
        # Unlike S3 there is no concept of regions for containers (this is at
        # the storage account level in Azure)
        self.container_client.create_container()

    def _walk_blob_tree(self, obj, ignore=None):
        """
        Walk a blob tree in a directory manner and yield the names of the
        directories and files found.

        :param ItemPaged[BlobProperties] obj: Iterable response of BlobProperties
          obtained from ContainerClient.walk_blobs
        :param str|None ignore: An entry to be excluded from the returned list,
          typically the top level prefix
        :return: Generator of the object and directory names in the tree
        :rtype: Iterator[str]
        """
        if obj.name != ignore:
            yield obj.name
        if isinstance(obj, BlobPrefix):
            # We are a prefix and not a leaf so iterate children
            for child in obj:
                for v in self._walk_blob_tree(child):
                    yield v

    def list_bucket(self, prefix="", delimiter=DEFAULT_DELIMITER):
        """
        List bucket content in a directory manner

        :param str prefix: Only names starting with this prefix are listed
        :param str delimiter: Delimiter handed to ``walk_blobs``
        :return: Generator of objects and dirs right under the prefix
        :rtype: Iterator[str]
        """
        res = self.container_client.walk_blobs(
            name_starts_with=prefix, delimiter=delimiter)
        return self._walk_blob_tree(res, ignore=prefix)

    def download_file(self, key, dest_path, decompress=None):
        """
        Download a file from Azure Blob Storage

        :param str key: The key to download
        :param str dest_path: Where to put the destination file
        :param str|None decompress: Compression scheme to use for decompression
        """
        obj = self.container_client.download_blob(key)
        with open(dest_path, "wb") as dest_file:
            if decompress is None:
                # readinto() replaces the deprecated download_to_stream()
                obj.readinto(dest_file)
                return
            blob = StreamingBlobIO(obj)
            decompress_to_file(blob, dest_file, decompress)

    def remote_open(self, key, decompressor=None):
        """
        Open a remote Azure Blob Storage object and return a readable stream

        :param str key: The key identifying the object to open
        :param barman.clients.cloud_compression.ChunkedCompressor decompressor:
          A ChunkedCompressor object which will be used to decompress chunks of bytes
          as they are read from the stream
        :return: A file-like object from which the stream can be read or None if
          the key does not exist
        """
        try:
            obj = self.container_client.download_blob(key)
            resp = StreamingBlobIO(obj)
            if decompressor:
                return DecompressingStreamingIO(resp, decompressor)
            else:
                return resp
        except ResourceNotFoundError:
            return None

    def upload_fileobj(
        self,
        fileobj,
        key,
        override_tags=None,
    ):
        """
        Synchronously upload the content of a file-like object to a cloud key

        :param fileobj IOBase: File-like object to upload
        :param str key: The key to identify the uploaded object
        :param List[tuple] override_tags: List of tags as k,v tuples to be added to the
          uploaded object
        """
        # Find length of the file so we can pass it to the Azure client
        fileobj.seek(0, SEEK_END)
        length = fileobj.tell()
        fileobj.seek(0)

        extra_args = self._extra_upload_args.copy()
        # A falsy override_tags (None or empty) falls back to self.tags
        tags = override_tags or self.tags
        if tags is not None:
            extra_args["tags"] = dict(tags)
        self.container_client.upload_blob(
            name=key,
            data=fileobj,
            overwrite=True,
            length=length,
            max_concurrency=self.max_concurrency,
            **extra_args)

    def create_multipart_upload(self, key):
        """No-op method because Azure has no concept of multipart uploads

        Instead of multipart upload, blob blocks are staged and then committed.
        However this does not require anything to be created up front.
        This method therefore does nothing.
        """
        pass

    def _upload_part(self, upload_metadata, key, body, part_number):
        """
        Upload a single block of this block blob.

        Uses the supplied part number to generate the block ID and returns it
        as the "PartNumber" in the part metadata.

        :param dict upload_metadata: Provider-specific metadata about the upload
          (not used in Azure)
        :param str key: The key to use in the cloud service
        :param object body: A stream-like object to upload
        :param int part_number: Part number, starting from 1
        :return: The part metadata
        :rtype: dict[str, None|str]
        """
        # Block IDs must be the same length for all blocks in the blob
        # and no greater than 64 characters. Given there is a limit of
        # 50000 blocks per blob we zero-pad the part_number to five
        # places.
        block_id = str(part_number).zfill(5)
        blob_client = self.container_client.get_blob_client(key)
        blob_client.stage_block(block_id, body, **self._extra_upload_args)
        return {"PartNumber": block_id}

    def _complete_multipart_upload(self, upload_metadata, key, parts):
        """
        Finish a "multipart upload" by committing all blocks in the blob.

        :param dict upload_metadata: Provider-specific metadata about the upload
          (not used in Azure)
        :param str key: The key to use in the cloud service
        :param parts: The list of block IDs for the blocks which compose this blob
        """
        blob_client = self.container_client.get_blob_client(key)
        block_list = [part["PartNumber"] for part in parts]
        extra_args = self._extra_upload_args.copy()
        if self.tags is not None:
            extra_args["tags"] = dict(self.tags)
        blob_client.commit_block_list(block_list, **extra_args)

    def _abort_multipart_upload(self, upload_metadata, key):
        """
        Abort the upload of a block blob

        The objective of this method is to clean up any dangling resources - in
        this case those resources are uncommitted blocks.

        :param dict upload_metadata: Provider-specific metadata about the upload
          (not used in Azure)
        :param str key: The key to use in the cloud service
        """
        # Ideally we would clean up uncommitted blocks at this point
        # however there is no way of doing that.
        # Uncommitted blocks will be discarded after 7 days or when
        # the blob is committed (if they're not included in the commit).
        # We therefore create an empty blob (thereby discarding all uploaded
        # blocks for that blob) and then delete it.
        blob_client = self.container_client.get_blob_client(key)
        blob_client.commit_block_list([], **self._extra_upload_args)
        blob_client.delete_blob()

    def delete_objects(self, paths):
        """
        Delete the objects at the specified paths

        :param List[str] paths: Names of the blobs to delete
        :raises CloudProviderError: If any deletion fails with a status other
          than 202 or 404
        """
        try:
            # If paths is empty because the files have already been deleted then
            # delete_blobs will return successfully so we just call it with whatever
            # we were given
            responses = self.container_client.delete_blobs(*paths)
        except PartialBatchErrorException as exc:
            # Although the docs imply any errors will be returned in the response
            # object, in practice a PartialBatchErrorException is raised which contains
            # the response objects in its `parts` attribute.
            # We therefore set responses to reference the response in the exception and
            # treat it the same way we would a regular response.
            logging.warning(
                "PartialBatchErrorException received from Azure: %s", exc.message)
            responses = exc.parts

        # resp is an iterator of HttpResponse objects so we check the status codes
        # which should all be 202 if successful
        errors = False
        for resp in responses:
            if resp.status_code == 404:
                # A missing object is only worth a warning, not a failure
                logging.warning(
                    "Deletion of object %s failed because it could not be found",
                    resp.request.url)
            elif resp.status_code != 202:
                errors = True
                logging.error(
                    'Deletion of object %s failed with error code: "%s"',
                    resp.request.url, resp.status_code)

        if errors:
            raise CloudProviderError(
                "Error from cloud provider while deleting objects - "
                "please check the Barman logs")
class AzureBlobStorageConnector:
    """Connector class that contains Azure's Container client, and works as
    an interface to read/write from/to files on AzureBlobStorage.
    """

    def __init__(self,
                 account_url: str,
                 container_name: str,
                 credential: Optional[object] = None) -> None:
        """
        Setting up the `AzureBlobStorageConnector` by initializing a
        containerclient and credential.

        Parameters
        ----------
        account_url: str
            Account url used in the `azure.storage.blob.ContainerClient`.
        container_name: str
            Name of the container used in the
            `azure.storage.blob.ContainerClient`.
        credential: Optional[object]
            Credential instance of some credential class provided by the
            `azure.identity` package. When omitted (or falsy), the
            module-level `DEFAULT_CREDENTIAL` is used.
        """
        self.account_url = account_url
        self.container_name = container_name
        # Will use InteractiveBrowserCredential if all others fail
        self.credential = credential or DEFAULT_CREDENTIAL
        self.container_client = ContainerClient(self.account_url,
                                                self.container_name,
                                                credential=self.credential)

    def download_blob(self, blob_path: str, dest_path: str) -> None:
        """Downloads a blob from AzureBlobStorage to a file on your local
        filesystem.

        Parameters
        ----------
        blob_path: str
            Filepath (relative to container root) of the file you want to
            download.
        dest_path: str
            (local) path to the file you want to store it.
        """
        blob_client = self.container_client.get_blob_client(blob_path)
        with open(dest_path, "wb") as f:
            blob_client.download_blob().readinto(f)

    @contextmanager
    def download_stream(self,
                        blob_path: str) -> Generator[BytesIO, None, None]:
        """Downloads a blob as BytesIO stream. This stream can then be used
        to load a DataFrame directly into (pandas) memory, without storing
        locally first.

        Usage:
            with connector.download_stream('path/to/file.ext') as stream:
                df = pd.read_parquet(stream)

        The stream is closed when the `with` block exits, also when the
        block raises.

        Parameters
        ----------
        blob_path: str
            filepath (relative to container root) from the file to download.

        Yields
        ------
        : BytesIO
            Stream which allows to load directly into memory.
        """
        stream = BytesIO()
        try:
            blob_client = self.container_client.get_blob_client(blob_path)
            blob_client.download_blob().readinto(stream)
            stream.seek(0)  # set pointer back to start of stream
            yield stream
        finally:
            # Runs on normal exit and when the caller's block raised
            logger.debug('Closing stream...')
            stream.close()

    def upload_blob(self,
                    source_path: str,
                    blob_path: str,
                    overwrite: bool = False) -> None:
        """Uploads a local file to Azure Blob Storage

        Parameters
        ----------
        source_path: str
            (local) path to the file you want to upload.
        blob_path: str
            Filepath (relative to container root) of the file to be created.
        overwrite: bool (optional)
            Passed through to the blob client's `upload_blob`. Defaults to
            False.
        """
        blob_client = self.container_client.get_blob_client(blob_path)
        with open(source_path, 'rb') as f:
            blob_client.upload_blob(f,
                                    blob_type="BlockBlob",
                                    overwrite=overwrite)

    @contextmanager
    def upload_stream(self,
                      blob_path: str,
                      overwrite: bool = False
                      ) -> Generator[BytesIO, None, None]:
        """Opens a byte-stream to write to using a context manager. The
        stream will write to a file on Azure Blob Storage specified by the
        `blob_path`.

        Usage:
            with connector.upload_stream('path/to/file.ext') as stream:
                df.to_parquet(stream)

        The upload happens when the `with` block exits normally. If the
        block raises, nothing is uploaded and the exception propagates. The
        stream is closed in both cases.

        Parameters
        ----------
        blob_path: str
            Filepath (relative to container root) of the file to be created.
        overwrite: bool (optional)
            Whether or not to overwrite an existing file. If set to False
            and the blob already exists, will raise a ResourceExistsError.
            Defaults to False

        Yields
        ------
        : BytesIO
            Stream which allows to load directly into memory.
        """
        stream = BytesIO()
        try:
            yield stream
            # Only reached when the caller's block finished without raising
            stream.seek(0)  # set pointer back to start of stream
            blob_client = self.container_client.get_blob_client(blob_path)
            blob_client.upload_blob(stream,
                                    blob_type="BlockBlob",
                                    overwrite=overwrite)
            logger.debug(f'Uploaded stream to {blob_path}. Closing stream...')
        finally:
            stream.close()

    @contextmanager
    def open(self, blob_path: str,
             mode: str) -> Generator[BytesIO, None, None]:
        """Create a Python-like `open()` interface for downloading and
        uploading files from and to AzureBlobStorage.

        Usage:
            with connector.open('path/to/file.ext', 'r') as stream:
                df = pd.read_parquet(stream)

        Parameters
        ----------
        blob_path: str
            Filepath (relative to container root) of the file to be created.
        mode: str
            Like Python's `open()` function, the mode you want to open the
            file. Matching is case-insensitive.

            Character Meaning
            --------- --------------------------------------------------
            'r'       open for reading, or downloading the stream.
            'w'       open for writing, or uploading the stream.
            'o'       open for overwriting, or delete file, and upload
                      stream.

        Raises
        ------
        ValueError
            If `mode` is not one of the characters above. It is raised when
            the `with` block is entered.
        """
        mode_key = mode.lower()
        if mode_key == 'r':
            manager = self.download_stream(blob_path)
        elif mode_key in ('w', 'o'):
            # 'o' differs from 'w' only in allowing an existing blob to be
            # replaced
            manager = self.upload_stream(blob_path,
                                         overwrite=(mode_key == 'o'))
        else:
            raise ValueError(f'Unknown mode: {mode}.')
        with manager as s:
            yield s
.config(f"fs.azure.sas.{workspace_container}.hecdf.blob.core.windows.net", workspace_sas_token) \ .getOrCreate() ## Define your blob services to access files on Azure Blob Storage from azure.storage.blob import ContainerClient testname = "koalas-tutorial/datasets/loan_preprocessed.csv" account_url = "https://hecdf.blob.core.windows.net" facts_blob_service = ContainerClient(account_url=account_url, container_name=facts_container, credential=facts_sas_token) workspace_blob_service = ContainerClient(account_url=account_url, container_name=workspace_container, credential=workspace_sas_token) # Create the parent folder blobs = list(facts_blob_service.list_blobs()) for blob in blobs: from pathlib import Path Path(f'../../data/raw/{blob.name}').parent.mkdir(parents=True, exist_ok=True) # From facts to your home directory with open(f"../../data/raw/{blob.name}", "wb") as data: download_stream = facts_blob_service.get_blob_client( blob.name).download_blob() data.write(download_stream.readall()) spark.stop()