def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    # Get HTTP parameters from URL, setting defaults if empty
    folder = req.params.get('folder')

    # Look up the folder and return a list of blobs
    container = ContainerClient.from_connection_string(
        conn_str=os.environ["AzureWebJobsStorage"], container_name="photos")
    blob_list = list(container.list_blobs(name_starts_with=folder))
    if len(blob_list) == 0:
        return func.HttpResponse(status_code=404)

    # Create our response object
    photo_response = {"files": []}

    # Iterate the list to populate it
    for blob in blob_list:
        photo_response["files"].append(blob.name)

    return func.HttpResponse(
        json.dumps(photo_response),
        status_code=200,
        mimetype="application/json"
    )
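# A minimal local-invocation sketch for the handler above, assuming the
# azure-functions package is installed and AzureWebJobsStorage points at a
# reachable storage account; the folder name "2021" is hypothetical.
import azure.functions as func

req = func.HttpRequest(
    method="GET",
    url="/api/photos",
    params={"folder": "2021"},
    body=b"",
)
resp = main(req)
print(resp.status_code, resp.get_body())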
def __init__(self, storage_account, storage_container, key,
             protocol='https', endpoint_suffix='core.windows.net'):
    self.AZURE_STORAGE_ACCOUNT = storage_account
    self.AZURE_STORAGE_CONTAINER = storage_container
    self.PROTOCOL = protocol
    self.ENDPOINT_SUFFIX = endpoint_suffix
    self.AZURE_STORAGE_KEY = key
    self.AZURE_STORAGE_CONNECTION_STRING = (
        'DefaultEndpointsProtocol={0};AccountName={1};AccountKey={2};EndpointSuffix={3}'.format(
            self.PROTOCOL, self.AZURE_STORAGE_ACCOUNT,
            self.AZURE_STORAGE_KEY, self.ENDPOINT_SUFFIX))
    self.container_url = _make_url(
        f'https://{self.AZURE_STORAGE_ACCOUNT}.blob.core.windows.net',
        self.AZURE_STORAGE_CONTAINER)
    self.container_client = ContainerClient.from_connection_string(
        self.AZURE_STORAGE_CONNECTION_STRING, self.AZURE_STORAGE_CONTAINER)
    self.blob_service_client = BlobServiceClient.from_connection_string(
        self.AZURE_STORAGE_CONNECTION_STRING)
    self.blob_client = None
def update_status_svg(self, spider, svg):
    from azure.storage.blob import ContainerClient, ContentSettings

    container_client = ContainerClient(
        "{}.blob.core.windows.net".format(
            self.crawler.settings.get("AZURE_ACCOUNT_NAME")),
        self.crawler.settings.get("CITY_SCRAPERS_STATUS_CONTAINER"),
        credential=self.crawler.settings.get("AZURE_ACCOUNT_KEY"),
    )
    container_client.upload_blob(
        "{}.svg".format(spider.name),
        svg,
        content_settings=ContentSettings(
            content_type="image/svg+xml", cache_control="no-cache"),
        overwrite=True,
    )
def main(args) -> None:
    """Entry point.

    Args:
        args: CLI arguments.
    """
    cc = ContainerClient.from_connection_string(
        AZ_CONN_STR.format(key=args.key), AZ_CONTAINER)
    cache_files = [b.name for b in cc.list_blobs()]
    for cache_file in Path(args.input).iterdir():
        m = re.match(r"([a-z]+)-([a-f0-9]+)\.zip", cache_file.name)
        if not m:
            log.info(f"Skipping {cache_file} (not a cache file)")
            continue
        docset = m.group(1)
        if args.only and docset not in args.only:
            continue
        if not args.force and cache_file.name in cache_files:
            log.info(f"Skipping upload of {cache_file.name} (already exists)")
            continue
        with open(cache_file, "rb") as f:
            log.info(f"Uploading {cache_file.name}...")
            bc = cc.get_blob_client(cache_file.name)
            bc.upload_blob(f, overwrite=True)
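# Hypothetical invocation sketch for the uploader above. Cache files are
# expected to match "<docset>-<hex digest>.zip", e.g. "python-3f2a9c.zip";
# AZ_CONN_STR, AZ_CONTAINER, and log come from the surrounding module, and
# every value below is made up.
from argparse import Namespace

args = Namespace(
    key="<storage-account-key>",  # assumption: AZ_CONN_STR interpolates the key
    input="./caches",             # directory containing the docset zips
    only=["python"],              # optionally restrict to one docset
    force=False,                  # skip blobs that already exist remotely
)
main(args)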
def __init__(self, blob_account_url, container_name, credential=None, **kwargs):
    # type: (str, str, Optional[Any], Any) -> None
    container_client = kwargs.pop('container_client', None)
    self._container_client = container_client or ContainerClient(
        blob_account_url, container_name, credential=credential, **kwargs
    )
    self._cached_blob_clients = defaultdict()  # type: Dict[str, BlobClient]
def __init__(self, uri):
    from azure.storage.blob import ContainerClient

    container = uri.split("@")[1].split("/")[0]
    filename = "/".join(uri.split("@")[1].split("/")[1::])
    account_name, account_key = uri[8::].split("@")[0].split(":")
    self.account_name = account_name
    self.account_key = account_key
    self.container = container
    self.filename = filename
    self.container_client = ContainerClient(
        "{}.blob.core.windows.net".format(self.account_name),
        self.container,
        credential=self.account_key,
    )
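# Hedged example of the URI shape this constructor appears to parse: the
# slicing uri[8::] strips an 8-character scheme prefix, presumably "azure://",
# and the account/key/container values below are hypothetical.
uri = "azure://myaccount:bXlrZXk=@mycontainer/path/to/blob.csv"
# -> account_name = "myaccount", account_key = "bXlrZXk="
# -> container    = "mycontainer", filename = "path/to/blob.csv"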
def download_blobs_as_one_json(dateDir, outputDir):
    # generate json
    viaDict = []
    try:
        with open('../packages/aerialnet/aerialnet/data/AZURE_STORAGE') as version_file:
            AZURE_STORAGE_CONNECTION_STRING = version_file.read()

        CONTAINER_NAME = "aihistory"
        container = ContainerClient.from_connection_string(
            AZURE_STORAGE_CONNECTION_STRING, container_name=CONTAINER_NAME)
        blob_list = container.list_blobs(name_starts_with=dateDir + '/')
        for idx, blob in enumerate(blob_list):
            print('Downloading blob #{}: {}\n'.format(idx + 1, blob.name), flush=True)
            if '.json' in blob.name:
                blob_client = container.get_blob_client(blob.name)
                download_stream = blob_client.download_blob()
                jsonContent = json.loads(download_stream.readall())
                viaDict.append(jsonContent)
        print('Total blobs downloaded: {}'.format(idx + 1))

        with open(os.path.join(outputDir, 'viaJsonFile_ORIGINAL.json'), 'w') as f:
            json.dump(viaDict, f)
    except Exception as ex:
        print('Exception:')
        print(ex)
def container_sample(self):
    # [START create_container_client_from_service]
    # Instantiate a BlobServiceClient using a connection string
    from azure.storage.blob import BlobServiceClient
    blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)

    # Instantiate a ContainerClient
    container_client = blob_service_client.get_container_client("mynewcontainer")
    # [END create_container_client_from_service]

    # [START create_container_client_sasurl]
    from azure.storage.blob import ContainerClient
    sas_url = "https://account.blob.core.windows.net/mycontainer?sv=2015-04-05&st=2015-04-29T22%3A18%3A26Z&se=2015-04-30T02%3A23%3A26Z&sr=b&sp=rw&sip=168.1.5.60-168.1.5.70&spr=https&sig=Z%2FRHIX5Xcg0Mq2rqI3OlWTjEg2tYkboXr1P9ZUXDtkk%3D"
    container = ContainerClient.from_container_url(sas_url)
    # [END create_container_client_sasurl]

    try:
        # [START create_container]
        container_client.create_container()
        # [END create_container]

        # [START get_container_properties]
        properties = container_client.get_container_properties()
        # [END get_container_properties]
    finally:
        # [START delete_container]
        container_client.delete_container()
def test_cache_correctness(self):
    with self._setup_test() as az_info:
        for suffix in ('.jsonl.gz', '.msgpack.l.gz'):
            random_elements = list(range(100))
            remote_path = RichPath.create(
                "azure://devstoreaccount1/test1/compressed/data" + suffix, az_info)
            remote_path.save_as_compressed_file(random_elements)

            # Read once
            read_nums = list(remote_path.read_by_file_suffix())
            self.assertListEqual(read_nums, random_elements)

            # Hit Cache
            read_nums = list(remote_path.read_by_file_suffix())
            self.assertListEqual(read_nums, random_elements)

            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())

            # Update file through other means, and ensure that cache is
            # appropriately invalidated.
            new_elements = list(range(500))
            with TemporaryDirectory() as tmp:
                path = os.path.join(tmp, 'tst' + suffix)
                if suffix == '.jsonl.gz':
                    save_jsonl_gz(new_elements, path)
                else:
                    save_msgpack_l_gz(new_elements, path)
                container_client = ContainerClient.from_connection_string(
                    self.AZURITE_DEVELOPMENT_CONNECTION_STRING, "test1")
                blob_client = container_client.get_blob_client("compressed/data" + suffix)
                with open(path, 'rb') as f:
                    blob_client.upload_blob(f, overwrite=True)

            read_nums = list(remote_path.read_by_file_suffix())
            self.assertListEqual(read_nums, new_elements)
            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())
def _create_test_container(self):
    client: ContainerClient = ContainerClient.from_connection_string(
        self.AZURITE_DEVELOPMENT_CONNECTION_STRING, container_name="test1")
    try:
        client.create_container()
    except ResourceExistsError:
        pass
def container_access_policy(self):
    # SAS URL is calculated from storage key, so this test runs live only
    if TestMode.need_recording_file(self.test_mode):
        return

    # Instantiate a BlobServiceClient using a connection string
    from azure.storage.blob import BlobServiceClient
    blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)

    # Instantiate a ContainerClient
    container_client = blob_service_client.get_container_client("myaccesscontainer")

    try:
        # Create new Container
        container_client.create_container()

        # [START set_container_access_policy]
        # Create access policy
        from azure.storage.blob import AccessPolicy, ContainerSasPermissions
        access_policy = AccessPolicy(
            permission=ContainerSasPermissions(read=True),
            expiry=datetime.utcnow() + timedelta(hours=1),
            start=datetime.utcnow() - timedelta(minutes=1))
        identifiers = {'test': access_policy}

        # Set the access policy on the container
        container_client.set_container_access_policy(signed_identifiers=identifiers)
        # [END set_container_access_policy]

        # [START get_container_access_policy]
        policy = container_client.get_container_access_policy()
        # [END get_container_access_policy]

        # [START generate_sas_token]
        # Use access policy to generate a sas token
        from azure.storage.blob import generate_container_sas
        sas_token = generate_container_sas(
            container_client.account_name,
            container_client.container_name,
            account_key=container_client.credential.account_key,
            policy_id='my-access-policy-id')
        # [END generate_sas_token]

        # Use the sas token to authenticate a new client
        # [START create_container_client_sastoken]
        from azure.storage.blob import ContainerClient
        container = ContainerClient.from_container_url(
            container_url="https://account.blob.core.windows.net/mycontainer",
            credential=sas_token)
        # [END create_container_client_sastoken]
    finally:
        # Delete container
        container_client.delete_container()
def get_storage_client(datasets_table: Mapping[str, Any],
                       dataset_name: str) -> ContainerClient:
    """Gets a ContainerClient for the Azure Blob Storage Container
    corresponding to the given dataset.

    Adds a 'storage_container_client' key to datasets_table (in-place update)
    if a new ContainerClient is created for the dataset.

    Args:
        datasets_table: dict, the return value of get_datasets_table()
        dataset_name: str, key in datasets_table

    Returns:
        azure.storage.blob.ContainerClient, corresponds to the requested dataset
    """
    if dataset_name not in datasets_table:
        raise KeyError(f'Dataset {dataset_name} is not in datasets table.')

    entry = datasets_table[dataset_name]
    if 'storage_container_client' not in entry:
        # create a new storage container client for this dataset, and cache it
        if 'container_sas_key' not in entry:
            raise KeyError(
                f'Dataset {dataset_name} does not have the '
                'container_sas_key field in the datasets table.')
        entry['storage_container_client'] = ContainerClient(
            account_url=f'{entry["storage_account"]}.blob.core.windows.net',
            container_name=entry['container'],
            credential=entry['container_sas_key'])

    return entry['storage_container_client']
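# Hypothetical usage sketch for get_storage_client: a minimal datasets_table
# entry with the field names the checks above require; all values are made up.
datasets_table = {
    "birds": {
        "storage_account": "myaccount",
        "container": "birds-images",
        "container_sas_key": "?sv=2020-08-04&sig=...",
    }
}
client = get_storage_client(datasets_table, "birds")
# The client is cached on the entry, so a second call returns the same instance.
assert client is get_storage_client(datasets_table, "birds")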
def list_blob_in_container(connection_s: str, container_n: str) -> list:
    """List the blobs within a given container of an Azure storage account.

    Helper function for debugging in case of no access to Azure.

    Arguments:
        connection_s {str} -- an azure storage account connection string
        container_n {str} -- a container within a storage account

    Returns:
        blob_names_list -- the list of blobs within container
    """
    try:
        campaign_container = ContainerClient.from_connection_string(
            conn_str=connection_s, container_name=container_n)
        blob_list = campaign_container.list_blobs()
        blob_names_list = []
        for blob in blob_list:
            blob_names_list.append(blob.name)
        return blob_names_list
    except Exception:
        logger.info(
            "The container you are trying to list blobs from probably does not exist.")
        logger.info(
            "Early exit of ETL process as container probably does not exist.")
        exit()
def get_newest_file(container_name, substring):
    newest_filename = ""
    newest_timestamp = None
    i = 0
    try:
        with ContainerClient.from_connection_string(
                storageConnectionString, container_name) as container_client:
            blob_list = container_client.list_blobs()
            for filename in blob_list:
                if substring in filename.name:
                    i += 1
                    try:
                        timestamp = datetime.datetime.strptime(
                            filename.name[len(substring):], '%Y-%m-%d_%H-%M')
                    except ValueError:
                        # Name does not match the expected timestamp format.
                        continue
                    if newest_timestamp is None or newest_timestamp < timestamp:
                        newest_timestamp = timestamp
                        newest_filename = filename.name
    except Exception as ex:
        i = 0
        print(ex)
    return newest_filename, i > 0
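# Hypothetical example of the naming convention this helper expects: blobs
# named "<substring><YYYY-mm-dd_HH-MM>", e.g. with substring "report-" a blob
# might be "report-2021-06-30_14-05". Container name is made up.
name, found = get_newest_file("backups", "report-")
if found:
    print("newest:", name)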
def __init__(self, connection_string: str, storage_name: str) -> None:
    self.__client = ContainerClient.from_connection_string(
        conn_str=connection_string, container_name=storage_name)
    self.__remote_files_cache: Optional[List[dict]] = None
    super().__init__(
        remote_root_dir=Path(""),
        local_root_dir=Path(DEFAULT_ROOT_DIR, storage_name),
    )
def test_sas_signature_is_scrubbed_off(self, storage_account_name, storage_account_key):
    # SAS URL is calculated from storage key, so this test runs live only
    bsc = BlobServiceClient(self.account_url(storage_account_name, "blob"), storage_account_key)
    self._setup(bsc)

    # Arrange
    container = bsc.get_container_client(self.container_name)
    token = generate_container_sas(
        container.account_name,
        container.container_name,
        account_key=container.credential.account_key,
        permission=ContainerSasPermissions(read=True),
        expiry=datetime.utcnow() + timedelta(hours=1),
    )

    # parse out the signed signature
    token_components = parse_qs(token)
    signed_signature = quote(token_components[QueryStringConstants.SIGNED_SIGNATURE][0])

    sas_service = ContainerClient.from_container_url(container.url, credential=token)

    # Act
    with LogCaptured(self) as log_captured:
        sas_service.get_account_information(logging_enable=True)
        log_as_str = log_captured.getvalue()

    # Assert
    # make sure the query parameter 'sig' is logged, but its value is not
    self.assertTrue(QueryStringConstants.SIGNED_SIGNATURE in log_as_str)
    self.assertFalse(signed_signature in log_as_str)
def upload_json(self, rawdata, fname):
    # NOTE: never hard-code real account keys; the connection string below is
    # a placeholder and should come from configuration or the environment.
    CONNECT_STR = "<your-storage-connection-string>"
    CONTAINER_NAME = "test"

    # Instantiate a ContainerClient. This is used when uploading a blob from
    # your local file.
    container_client = ContainerClient.from_connection_string(
        conn_str=CONNECT_STR, container_name=CONTAINER_NAME
    )

    data = rawdata
    output_blob_name = fname

    # This is an optional setting for guaranteeing the MIME type to be always json.
    content_setting = ContentSettings(
        content_type='application/json',
        content_encoding=None,
        content_language=None,
        content_disposition=None,
        cache_control=None,
        content_md5=None
    )

    # Upload file
    container_client.upload_blob(
        name=output_blob_name, data=data, content_settings=content_setting)

    # Check the result
    all_blobs = container_client.list_blobs(name_starts_with="BLOB", include=None)
    for each in all_blobs:
        print("RES: ", each)
def __init__(
    self,
    container=None,
    prefix='',
    account_name=None,
    account_key=None,
    blob_service_kwargs=None,
    dimension_separator=None,
    client=None,
):
    self._dimension_separator = dimension_separator
    self.prefix = normalize_storage_path(prefix)
    if client is None:
        # deprecated option, try to construct the client for them
        msg = (
            "Providing 'container', 'account_name', 'account_key', and "
            "'blob_service_kwargs' is deprecated. Provide an instance of "
            "'azure.storage.blob.ContainerClient' as 'client' instead."
        )
        warnings.warn(msg, FutureWarning, stacklevel=2)
        from azure.storage.blob import ContainerClient
        blob_service_kwargs = blob_service_kwargs or {}
        client = ContainerClient(
            "https://{}.blob.core.windows.net/".format(account_name),
            container,
            credential=account_key,
            **blob_service_kwargs
        )

    self.client = client
    self._container = container
    self._account_name = account_name
    self._account_key = account_key
def copy_output(step_id, env):
    account_url = f'https://{env.scoring_datastore_storage_name}.blob.core.windows.net'
    src_blob_name = f'azureml/{step_id}/{env.scoring_datastore_storage_name}_out/parallel_run_step.txt'
    src_blob_url = f'{account_url}/{env.scoring_datastore_output_container}/{src_blob_name}'

    container_client = ContainerClient(
        account_url=account_url,
        container_name=env.scoring_datastore_output_container,
        credential=env.scoring_datastore_access_key)

    src_blob_properties = container_client.get_blob_client(src_blob_name).get_blob_properties()

    destfolder = src_blob_properties.last_modified.date().isoformat()
    file_time = (src_blob_properties.last_modified.time()) \
        .isoformat('milliseconds').replace(':', '_').replace('.', '_')
    filename_parts = env.scoring_datastore_output_filename.split('.')
    dest_blob_name = f'{destfolder}/{filename_parts[0]}_{file_time}.{filename_parts[1]}'

    dest_client = container_client.get_blob_client(dest_blob_name)
    dest_client.start_copy_from_url(src_blob_url)
def download_file(save_path, cloud_file_name, container_name):
    blobnames = []
    try:
        with ContainerClient.from_connection_string(
                storageConnectionString, container_name) as container_client:
            # First check if the file actually exists. Materialize the paged
            # listing so it can be iterated again in the error branch below.
            blob_list = list(container_client.list_blobs())
            for blob in blob_list:
                if blob.name == cloud_file_name:
                    blobnames.append(blob.name)

            # Download files to specified download folder
            if len(blobnames) == 1:
                for filename in blobnames:
                    with container_client.get_blob_client(filename) as blob_client:
                        with open(os.path.join(save_path, filename), "wb") as file_path:
                            file_path.write(blob_client.download_blob().readall())
                        print("Downloaded file: " + str(cloud_file_name))
                return os.path.join(save_path, filename), True
            else:
                print("Could not find requested blob ", str(cloud_file_name),
                      " in the following list:")
                for blob in blob_list:
                    print(blob.name)
                return " ", False
    except Exception as ex:
        print('Azure Blob Storage Exception:')
        print(ex)
        return " ", False
def upload_blob(
    container: ContainerClient,
    blob_name: str,
    content_type: str,
    content_encoding: str,
    data: Any,
    return_sas_token: bool = True,
) -> str:
    """
    Uploads the given data to a blob record. If a blob with the given name
    already exists, it throws an error.

    Returns a uri with a SAS token to access the newly created blob.
    """
    create_container_using_client(container)
    logger.info(
        f"Uploading blob '{blob_name}' "
        f"to container '{container.container_name}' "
        f"on account: '{container.account_name}'")

    content_settings = ContentSettings(
        content_type=content_type, content_encoding=content_encoding)
    blob = container.get_blob_client(blob_name)
    blob.upload_blob(data, content_settings=content_settings)
    logger.debug(f" - blob '{blob_name}' uploaded. generating sas token.")

    if return_sas_token:
        uri = get_blob_uri_with_sas_token(blob)
    else:
        uri = remove_sas_token(blob.url)

    logger.debug(f" - blob access url: '{uri}'.")
    return uri
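# Hypothetical usage sketch for upload_blob, assuming the helpers referenced
# above (create_container_using_client, get_blob_uri_with_sas_token, logger)
# are importable from the surrounding module; all names below are made up.
import json
from azure.storage.blob import ContainerClient

container = ContainerClient.from_connection_string(
    "<connection-string>", "results")
url = upload_blob(
    container,
    blob_name="run-42/output.json",
    content_type="application/json",
    content_encoding="utf-8",
    data=json.dumps({"status": "ok"}),
)
print(url)  # SAS-tokenized URL for sharing the new blob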
def ProcessAllImages():
    print('Processing Images.')
    container = ContainerClient.from_connection_string(
        connection_string, container_name=container_name_images)
    blobs_list = container.list_blobs()
    token = GetSASToken()

    for blob in blobs_list:
        blob_client = container.get_blob_client(blob.name)

        # Generate filename for image metadata file.
        fileName = GetFilePathFromImageURL(blob_client.url)
        nycImageUrl = f'{blob_client.url}?{token}'
        try:
            jsonImageMetadata = GetNYCImageMetadata(nycImageUrl)
            SaveImageMetadata(jsonImageMetadata, fileName)
            print(f'Completed processing {fileName}.')
        except Exception as e:
            print(f"Error {e} - {fileName}")
def __init__(self, crawler, output_format):
    from azure.storage.blob import ContainerClient

    feed_uri = crawler.settings.get("FEED_URI")
    account_name, account_key = feed_uri[8::].split("@")[0].split(":")
    self.spider = crawler.spider
    self.container = feed_uri.split("@")[1].split("/")[0]
    self.container_client = ContainerClient(
        "{}.blob.core.windows.net".format(account_name),
        self.container,
        credential=account_key,
    )
    self.feed_prefix = crawler.settings.get(
        "CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d"
    )
    super().__init__(crawler, output_format)
def __init__(
    self,
    blob_storage_conn_str,
    container_base,
    container_processed,
    container_quarantined,
):
    """Creates a Blob Storage object ...

    Parameters
    ----------
    blob_storage_conn_str: str
        Connection string with Blob Storage.
    container_base: str
        Container where files will be read.
    container_processed: str
        Container where files will be sent after being processed
        (processed with success).
    container_quarantined: str
        Container that stores quarantined files (processed with failure).
    """
    self.blob_storage_conn_str = blob_storage_conn_str
    self.container_base = container_base
    self.container_processed = container_processed
    self.container_quarantined = container_quarantined

    # init blob service & container connectivity
    # instantiate the blob storage class to perform operations on it
    self.blob_service_client = BlobServiceClient.from_connection_string(
        conn_str=self.blob_storage_conn_str)
    self.get_container_base_info = ContainerClient.from_connection_string(
        conn_str=self.blob_storage_conn_str, container_name=self.container_base)
    log.info(
        f"successfully established connection with container base: {self.container_base}")
    print(
        f"successfully established connection with container base: {self.container_base}")

    # get sku of the blob storage account
    account_info = self.get_container_base_info.get_account_information()
    log.info("storage sku: {}".format(account_info["sku_name"].lower()))

    # get stats of blob storage & container service info
    stats_blob_storage = self.blob_service_client.get_service_stats()
    log.info("blob storage replication status: {}".format(
        stats_blob_storage["geo_replication"]["status"]))
    log.info("last blob storage sync replication time: {}".format(
        self.utc_to_local(stats_blob_storage["geo_replication"]["last_sync_time"])))
    stats_container_base = self.get_container_base_info.get_container_properties()
    log.info("last container modified time: {}".format(
        self.utc_to_local(stats_container_base.last_modified)))
def find_azure_storage_blob_file_names(conn_str, container_name, prefix=''):
    """Fetches all the blobs in the container whose names start with the given
    prefix, returned as a list of azure.storage.blob BlobProperties objects."""
    container = ContainerClient.from_connection_string(
        conn_str=conn_str, container_name=container_name)
    return list(container.list_blobs(name_starts_with=prefix))
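# Hypothetical usage sketch: list blob names under a virtual folder; the
# connection string, container, and prefix are made up.
names = [b.name for b in find_azure_storage_blob_file_names(
    "<connection-string>", "photos", prefix="2021/")]
print(names)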
def getBlobUrl(imagename, connectionString):
    try:
        container_client = ContainerClient.from_connection_string(
            conn_str=connectionString, container_name="droneimages")
        blob_client = container_client.get_blob_client(imagename)
        return blob_client.url
    except Exception:
        xlog('getBlobUrl: error:', sys.exc_info()[0])
def __init__(self, account_name, account_key, container_name, source, *args, **kwargs):
    super(ConnLocal2AzureOperator, self).__init__(*args, **kwargs)
    self.client = ContainerClient(
        account_url=f"https://{account_name}.blob.core.windows.net/",
        credential=account_key,
        container_name=container_name)
    self.source = source
def bucket_exists(self):
    container = ContainerClient.from_connection_string(
        self._account, self._bucket, connection_timeout=300)
    try:
        container.get_container_properties()
        log.debug(output_messages['DEBUG_CONTAINER_ALREADY_EXISTS'] % self._bucket,
                  class_name=AZURE_STORAGE_NAME)
        return True
    except Exception:
        return False
class AzureDiffPipeline(DiffPipeline):
    """Azure Blob Storage backend for comparing previously scraped JSCalendar outputs"""

    def __init__(self, crawler, output_format):
        from azure.storage.blob import ContainerClient

        feed_uri = crawler.settings.get("FEED_URI")
        account_name, account_key = feed_uri[8::].split("@")[0].split(":")
        self.spider = crawler.spider
        self.container = feed_uri.split("@")[1].split("/")[0]
        self.container_client = ContainerClient(
            "{}.blob.core.windows.net".format(account_name),
            self.container,
            credential=account_key,
        )
        self.feed_prefix = crawler.settings.get(
            "CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d"
        )
        super().__init__(crawler, output_format)

    def load_previous_results(self):
        max_days_previous = 3
        days_previous = 0
        tz = timezone(self.spider.timezone)
        while days_previous <= max_days_previous:
            matching_blobs = self.container_client.list_blobs(
                name_starts_with=(
                    tz.localize(datetime.now()) - timedelta(days=days_previous)
                ).strftime(self.feed_prefix)
            )
            spider_blobs = [
                blob
                for blob in matching_blobs
                if "{}.".format(self.spider.name) in blob.name
            ]
            if len(spider_blobs) > 0:
                break
            days_previous += 1

        if len(spider_blobs) == 0:
            return []

        blob = sorted(spider_blobs, key=attrgetter("name"))[-1]
        feed_blob = self.container_client.get_blob_client(blob.name)
        feed_text = feed_blob.download_blob().content_as_text()
        return [json.loads(line) for line in feed_text.split("\n") if line.strip()]
def test_translation_with_glossary(self, client):
    doc = Document(data=b'testing')
    source_container_sas_url = self.create_source_container(data=[doc])
    target_container_sas_url = self.create_target_container()

    container_client = ContainerClient(
        self.storage_endpoint, self.source_container_name, self.storage_key)
    with open(GLOSSARY_FILE_NAME, "rb") as fd:
        container_client.upload_blob(name=GLOSSARY_FILE_NAME, data=fd.read())

    prefix, suffix = source_container_sas_url.split("?")
    glossary_file_sas_url = prefix + "/" + GLOSSARY_FILE_NAME + "?" + suffix

    poller = client.begin_translation(
        source_container_sas_url,
        target_container_sas_url,
        "es",
        glossaries=[TranslationGlossary(glossary_url=glossary_file_sas_url, file_format="csv")]
    )
    result = poller.result()

    container_client = ContainerClient(
        self.storage_endpoint, self.target_container_name, self.storage_key)

    # download translated file and assert that translation reflects glossary changes
    document = doc.name + doc.suffix
    with open(document, "wb") as my_blob:
        download_stream = container_client.download_blob(document)
        my_blob.write(download_stream.readall())

    with open(document, "rb") as fd:
        translated = fd.readline()

    assert b'essai' in translated  # glossary worked
    os.remove(document)