import json
import logging
import os

import azure.functions as func
from azure.storage.blob import ContainerClient


def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    # Get the 'folder' query parameter from the URL (None if absent)
    folder = req.params.get('folder')

    # Lookup the folder and return a list of blobs
    container = ContainerClient.from_connection_string(conn_str=os.environ["AzureWebJobsStorage"], container_name="photos")
    blob_list = list(container.list_blobs(name_starts_with=folder))

    if len(blob_list) == 0:
        return func.HttpResponse(
            status_code=404
        )

    # Create our response object
    photo_response = {
        "files": []
    }

    # Iterate the list to populate it
    for blob in blob_list:
        photo_response["files"].append(blob.name)
        
    
    return func.HttpResponse(
        json.dumps(photo_response),
        status_code=200,
        mimetype="application/json"
    )
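For local testing, a request like the following exercises the function above (a minimal sketch: the port, route, and function name GetPhotos are assumptions about the hosting setup, not shown in the snippet):

import requests

# Hypothetical local invocation; host, port, and route are assumed defaults.
resp = requests.get(
    "http://localhost:7071/api/GetPhotos",
    params={"folder": "vacation/2021"},
)
if resp.status_code == 200:
    print(resp.json()["files"])
else:
    print("No blobs found under that prefix")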
Example 2
    def __init__(self,
                 storage_account,
                 storage_container,
                 key,
                 protocol='https',
                 endpoint_suffix='core.windows.net'):
        self.AZURE_STORAGE_ACCOUNT = storage_account
        self.AZURE_STORAGE_CONTAINER = storage_container
        self.PROTOCOL = protocol
        self.ENDPOINT_SUFFIX = endpoint_suffix

        self.AZURE_STORAGE_KEY = key
        self.AZURE_STORAGE_CONNECTION_STRING = 'DefaultEndpointsProtocol={0};AccountName={1};AccountKey={2};EndpointSuffix={3}'.format(
            self.PROTOCOL, self.AZURE_STORAGE_ACCOUNT, self.AZURE_STORAGE_KEY,
            self.ENDPOINT_SUFFIX)

        self.container_url = _make_url(
            f'{self.PROTOCOL}://{self.AZURE_STORAGE_ACCOUNT}.blob.{self.ENDPOINT_SUFFIX}',
            self.AZURE_STORAGE_CONTAINER)

        self.container_client = ContainerClient.from_connection_string(
            self.AZURE_STORAGE_CONNECTION_STRING, self.AZURE_STORAGE_CONTAINER)

        self.blob_service_client = BlobServiceClient.from_connection_string(
            self.AZURE_STORAGE_CONNECTION_STRING)
        self.blob_client = None
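_make_url is defined elsewhere in this codebase; a minimal sketch of what it plausibly does (joining URL segments without doubling slashes) might be:

def _make_url(base, *parts):
    # Hypothetical reconstruction of the helper referenced above:
    # join URL segments while normalizing the slashes between them.
    return "/".join([base.rstrip("/")] + [p.strip("/") for p in parts])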
Example 3
    def update_status_svg(self, spider, svg):
        from azure.storage.blob import ContainerClient, ContentSettings

        container_client = ContainerClient(
            "https://{}.blob.core.windows.net".format(
                self.crawler.settings.get("AZURE_ACCOUNT_NAME")),
            self.crawler.settings.get("CITY_SCRAPERS_STATUS_CONTAINER"),
            credential=self.crawler.settings.get("AZURE_ACCOUNT_KEY"),
        )
        container_client.upload_blob(
            "{}.svg".format(spider.name),
            svg,
            content_settings=ContentSettings(content_type="image/svg+xml",
                                             cache_control="no-cache"),
            overwrite=True,
        )
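The pipeline above reads its configuration from Scrapy settings; a hedged sketch of the corresponding entries (names follow the settings.get calls above, values are placeholders):

# Hypothetical settings.py entries; values are placeholders.
AZURE_ACCOUNT_NAME = "mystorageaccount"
AZURE_ACCOUNT_KEY = "<account-key>"
CITY_SCRAPERS_STATUS_CONTAINER = "status"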
Example 4
def main(args) -> None:
    """Entry point.

    Args:
        args: CLI arguments.
    """

    cc = ContainerClient.from_connection_string(
        AZ_CONN_STR.format(key=args.key), AZ_CONTAINER)

    cache_files = [b.name for b in cc.list_blobs()]
    for cache_file in Path(args.input).iterdir():
        m = re.match(r"([a-z]+)-([a-f0-9]+)\.zip", cache_file.name)
        if not m:
            log.info(f"Skipping {cache_file} (not a cache file)")
            continue

        docset = m.group(1)
        if args.only and docset not in args.only:
            continue

        if not args.force and cache_file.name in cache_files:
            log.info(f"Skipping upload of {cache_file.name} (already exists)")
            continue

        with open(cache_file, "rb") as f:
            log.info(f"Uploading {cache_file.name}...")
            bc = cc.get_blob_client(cache_file.name)
            bc.upload_blob(f, overwrite=True)
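AZ_CONN_STR and AZ_CONTAINER are module-level constants not shown here; given the .format(key=args.key) call, they presumably look something like this (an assumption, with placeholder names):

# Hypothetical constants; account and container names are placeholders.
AZ_CONN_STR = (
    "DefaultEndpointsProtocol=https;"
    "AccountName=mydocsets;"
    "AccountKey={key};"
    "EndpointSuffix=core.windows.net"
)
AZ_CONTAINER = "docset-cache"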
Example 5
    def __init__(self, blob_account_url, container_name, credential=None, **kwargs):
        # type: (str, str, Optional[Any], Any) -> None
        container_client = kwargs.pop('container_client', None)
        self._container_client = container_client or ContainerClient(
            blob_account_url, container_name, credential=credential, **kwargs
        )
        self._cached_blob_clients = defaultdict()  # type: Dict[str, BlobClient]
Example 6
    def __init__(self, uri):
        from azure.storage.blob import ContainerClient

        container = uri.split("@")[1].split("/")[0]
        filename = "/".join(uri.split("@")[1].split("/")[1:])
        account_name, account_key = uri[8:].split("@")[0].split(":")

        self.account_name = account_name
        self.account_key = account_key
        self.container = container
        self.filename = filename
        self.container_client = ContainerClient(
            "https://{}.blob.core.windows.net".format(self.account_name),
            self.container,
            credential=self.account_key,
        )
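Judging by the slicing above (uri[8:] drops an 8-character scheme prefix), the expected URI shape is presumably azure://<account>:<key>@<container>/<path>, for example:

# Hypothetical URI illustrating the parsing logic above.
uri = "azure://myaccount:s3cr3tkey@mycontainer/nested/path/file.bin"
assert uri[8:].split("@")[0].split(":") == ["myaccount", "s3cr3tkey"]
assert uri.split("@")[1].split("/")[0] == "mycontainer"
assert "/".join(uri.split("@")[1].split("/")[1:]) == "nested/path/file.bin"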
Example 7
def download_blobs_as_one_json(dateDir, outputDir):
    # generate json
    viaDict = []
    try:
        with open('../packages/aerialnet/aerialnet/data/AZURE_STORAGE'
                  ) as version_file:
            AZURE_STORAGE_CONNECTION_STRING = version_file.read().strip()
            CONTAINER_NAME = "aihistory"

        container = ContainerClient.from_connection_string(
            AZURE_STORAGE_CONNECTION_STRING, container_name=CONTAINER_NAME)

        blob_list = container.list_blobs(name_starts_with=dateDir + '/')

        total = 0
        for idx, blob in enumerate(blob_list):
            print('Downloading blob #{}: {}\n'.format(idx + 1, blob.name),
                  flush=True)
            total = idx + 1

            if '.json' in blob.name:
                blob_client = container.get_blob_client(blob.name)
                download_stream = blob_client.download_blob()
                jsonContent = json.loads(download_stream.readall())
                viaDict.append(jsonContent)

        # 'total' stays 0 when no blobs match, avoiding an unbound 'idx'.
        print('Total blobs downloaded: {}'.format(total))
        with open(os.path.join(outputDir, 'viaJsonFile_ORIGINAL.json'),
                  'w') as f:
            json.dump(viaDict, f)
    except Exception as ex:
        print('Exception:')
        print(ex)
Example 8
    def container_sample(self):

        # [START create_container_client_from_service]
        # Instantiate a BlobServiceClient using a connection string
        from azure.storage.blob import BlobServiceClient
        blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)

        # Instantiate a ContainerClient
        container_client = blob_service_client.get_container_client("mynewcontainer")
        # [END create_container_client_from_service]

        # [START create_container_client_sasurl]
        from azure.storage.blob import ContainerClient

        sas_url = "https://account.blob.core.windows.net/mycontainer?sv=2015-04-05&st=2015-04-29T22%3A18%3A26Z&se=2015-04-30T02%3A23%3A26Z&sr=b&sp=rw&sip=168.1.5.60-168.1.5.70&spr=https&sig=Z%2FRHIX5Xcg0Mq2rqI3OlWTjEg2tYkboXr1P9ZUXDtkk%3D"
        container = ContainerClient.from_container_url(sas_url)
        # [END create_container_client_sasurl]

        try:
            # [START create_container]
            container_client.create_container()
            # [END create_container]

            # [START get_container_properties]
            properties = container_client.get_container_properties()
            # [END get_container_properties]

        finally:
            # [START delete_container]
            container_client.delete_container()
            # [END delete_container]
Example 9
    def test_cache_correctness(self):
        with self._setup_test() as az_info:
            for suffix in ('.jsonl.gz', '.msgpack.l.gz'):
                random_elements = list(range(100))
                remote_path = RichPath.create("azure://devstoreaccount1/test1/compressed/data" + suffix, az_info)
                remote_path.save_as_compressed_file(random_elements)

                # Read once
                read_nums = list(remote_path.read_by_file_suffix())
                self.assertListEqual(read_nums, random_elements)

                # Hit Cache
                read_nums = list(remote_path.read_by_file_suffix())
                self.assertListEqual(read_nums, random_elements)
                self.assertTrue(remote_path.exists())
                self.assertTrue(remote_path.is_file())

                # Update file through other means, and ensure that cache is appropriately invalidated.
                new_elements = list(range(500))
                with TemporaryDirectory() as tmp:
                    path = os.path.join(tmp, 'tst'+suffix)
                    if suffix == '.jsonl.gz':
                        save_jsonl_gz(new_elements, path)
                    else:
                        save_msgpack_l_gz(new_elements, path)
                    container_client = ContainerClient.from_connection_string(self.AZURITE_DEVELOPMENT_CONNECTION_STRING,
                                                                              "test1")
                    blob_client = container_client.get_blob_client("compressed/data" + suffix)
                    with open(path, 'rb') as f:
                        blob_client.upload_blob(f, overwrite=True)

                read_nums = list(remote_path.read_by_file_suffix())
                self.assertListEqual(read_nums, new_elements)
                self.assertTrue(remote_path.exists())
                self.assertTrue(remote_path.is_file())
Example 10
    def _create_test_container(self):
        client: ContainerClient = ContainerClient.from_connection_string(
            self.AZURITE_DEVELOPMENT_CONNECTION_STRING, container_name="test1")
        try:
            client.create_container()
        except ResourceExistsError:
            pass
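AZURITE_DEVELOPMENT_CONNECTION_STRING is not shown in these snippets; for the Azurite storage emulator it is conventionally the well-known development connection string (public test credentials, not a secret):

AZURITE_DEVELOPMENT_CONNECTION_STRING = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
)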
Example 11
    def container_access_policy(self):
        # SAS URL is calculated from storage key, so this test runs live only
        if TestMode.need_recording_file(self.test_mode):
            return

        # Instantiate a BlobServiceClient using a connection string
        from azure.storage.blob import BlobServiceClient
        blob_service_client = BlobServiceClient.from_connection_string(
            self.connection_string)

        # Instantiate a ContainerClient
        container_client = blob_service_client.get_container_client(
            "myaccesscontainer")

        try:
            # Create new Container
            container_client.create_container()

            # [START set_container_access_policy]
            # Create access policy
            from azure.storage.blob import AccessPolicy, ContainerSasPermissions
            access_policy = AccessPolicy(
                permission=ContainerSasPermissions(read=True),
                expiry=datetime.utcnow() + timedelta(hours=1),
                start=datetime.utcnow() - timedelta(minutes=1))

            identifiers = {'test': access_policy}

            # Set the access policy on the container
            container_client.set_container_access_policy(
                signed_identifiers=identifiers)
            # [END set_container_access_policy]

            # [START get_container_access_policy]
            policy = container_client.get_container_access_policy()
            # [END get_container_access_policy]

            # [START generate_sas_token]
            # Use access policy to generate a sas token
            from azure.storage.blob import generate_container_sas

            sas_token = generate_container_sas(
                container_client.account_name,
                container_client.container_name,
                account_key=container_client.credential.account_key,
                policy_id='my-access-policy-id')
            # [END generate_sas_token]

            # Use the sas token to authenticate a new client
            # [START create_container_client_sastoken]
            from azure.storage.blob import ContainerClient
            container = ContainerClient.from_container_url(
                container_url=
                "https://account.blob.core.windows.net/mycontainer",
                credential=sas_token)
            # [END create_container_client_sastoken]

        finally:
            # Delete container
            container_client.delete_container()
Example 12
    def get_storage_client(datasets_table: Mapping[str, Any],
                           dataset_name: str) -> ContainerClient:
        """Gets a ContainerClient for the Azure Blob Storage Container
        corresponding to the given dataset.

        Adds 'storage_container_client' key to datasets_table (in-place
        update) if a new ContainerClient is created for the dataset.

        Args:
            datasets_table: dict, the return value of get_datasets_table()
            dataset_name: str, key in datasets_table

        Returns: azure.storage.blob.ContainerClient, corresponds to the
            requested dataset
        """
        if dataset_name not in datasets_table:
            raise KeyError(f'Dataset {dataset_name} is not in datasets table.')

        entry = datasets_table[dataset_name]
        if 'storage_container_client' not in entry:
            # create a new storage container client for this dataset,
            # and cache it
            if 'container_sas_key' not in entry:
                raise KeyError(
                    f'Dataset {dataset_name} does not have the '
                    'container_sas_key field in the datasets table.')
            entry['storage_container_client'] = ContainerClient(
                account_url=f'https://{entry["storage_account"]}.blob.core.windows.net',
                container_name=entry['container'],
                credential=entry['container_sas_key'])

        return entry['storage_container_client']
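A hedged usage sketch, calling it as a plain function for illustration (the entry shape follows the keys the function reads; all names and the SAS token are placeholders):

# Hypothetical datasets table; keys match those referenced above.
datasets_table = {
    "birds": {
        "storage_account": "wildlifedata",          # account name only
        "container": "bird-images",
        "container_sas_key": "sv=2020-08-04&sig=<placeholder>",
    }
}
client = get_storage_client(datasets_table, "birds")
# The client is cached on the entry, so a second call reuses it.
assert client is get_storage_client(datasets_table, "birds")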
Example 13
def list_blob_in_container(connection_s: str, container_n: str) -> list:
    """ list the blobs within a given container of an Azure storage account
    Helper function for debugging in case no access to azure

    Arguments:
        connection_s {str} -- an azure storage account connection string
        container_n {str} -- a container within a storage account

    Returns:
        blob_names_list -- the list of blobs within container
    """
    try:
        campaign_container = ContainerClient.from_connection_string(
            conn_str=connection_s, container_name=container_n)
        blob_list = campaign_container.list_blobs()
        blob_names_list = []
        for blob in blob_list:
            blob_names_list.append(blob.name)
        return blob_names_list
    except Exception:
        logger.info(
            "The container you are trying to list blob from probably does not exist."
        )
        logger.info(
            "Early exit of ETL process as container probably does not exist.")
        exit()
Example 14
def get_newest_file(container_name, substring):
    newest_filename = ""
    i = 0
    try:
        with ContainerClient.from_connection_string(
                storageConnectionString, container_name) as container_client:
            blob_list = container_client.list_blobs()
            for filename in blob_list:
                if substring in filename.name:
                    i += 1
                    try:
                        timestamp = datetime.datetime.strptime(
                            filename.name[len(substring):], '%Y-%m-%d_%H-%M')
                    except ValueError:
                        continue
                    if i == 1:
                        newest_timestamp = timestamp
                        newest_filename = filename.name
                    else:
                        if (newest_timestamp < timestamp):
                            newest_timestamp = timestamp
                            newest_filename = filename.name
    except Exception as ex:
        i = 0
        print(ex)

    return newest_filename, i > 0
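For example, with blobs named like backup-2021-06-01_12-30 (the '%Y-%m-%d_%H-%M' format expected above) and storageConnectionString defined at module level, a call might look like this (names are placeholders):

# Hypothetical usage; container and prefix are placeholders.
name, found = get_newest_file("my-container", "backup-")
if found:
    print("Newest matching blob:", name)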
Example 15
    def __init__(self, connection_string: str, storage_name: str) -> None:
        self.__client = ContainerClient.from_connection_string(conn_str=connection_string, container_name=storage_name)
        self.__remote_files_cache: Optional[List[dict]] = None
        super().__init__(
            remote_root_dir=Path(""),
            local_root_dir=Path(DEFAULT_ROOT_DIR, storage_name),
        )
Example 16
    def test_sas_signature_is_scrubbed_off(self, storage_account_name,
                                           storage_account_key):
        # SAS URL is calculated from storage key, so this test runs live only
        bsc = BlobServiceClient(self.account_url(storage_account_name, "blob"),
                                storage_account_key)
        self._setup(bsc)
        # Arrange
        container = bsc.get_container_client(self.container_name)
        token = generate_container_sas(
            container.account_name,
            container.container_name,
            account_key=container.credential.account_key,
            permission=ContainerSasPermissions(read=True),
            expiry=datetime.utcnow() + timedelta(hours=1),
        )
        # parse out the signed signature
        token_components = parse_qs(token)
        signed_signature = quote(
            token_components[QueryStringConstants.SIGNED_SIGNATURE][0])

        sas_service = ContainerClient.from_container_url(container.url,
                                                         credential=token)

        # Act
        with LogCaptured(self) as log_captured:
            sas_service.get_account_information(logging_enable=True)
            log_as_str = log_captured.getvalue()

            # Assert
            # make sure the query parameter 'sig' is logged, but its value is not
            self.assertTrue(
                QueryStringConstants.SIGNED_SIGNATURE in log_as_str)
            self.assertFalse(signed_signature in log_as_str)
Example 17
    def upload_json(self, rawdata, fname):

        # Never hard-code account keys; read the connection string from config.
        CONNECT_STR = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
        CONTAINER_NAME = "test"

        # Instantiate a ContainerClient. This is used when uploading a blob from your local file.
        container_client = ContainerClient.from_connection_string(
            conn_str=CONNECT_STR, 
            container_name=CONTAINER_NAME
        )
        data = rawdata
        output_blob_name = fname

        # Optional: pin the MIME type so the blob is always served as JSON.
        content_setting = ContentSettings(content_type='application/json')

        # Upload file

        container_client.upload_blob(
            name=output_blob_name, 
            data=data, 
            content_settings=content_setting)
                
        # Check the result
        all_blobs = container_client.list_blobs(name_starts_with="BLOB", include=None)
        for each in all_blobs:
            print("RES: ", each)    
Example 18
    def __init__(
        self,
        container=None,
        prefix='',
        account_name=None,
        account_key=None,
        blob_service_kwargs=None,
        dimension_separator=None,
        client=None,
    ):
        self._dimension_separator = dimension_separator
        self.prefix = normalize_storage_path(prefix)
        if client is None:
            # deprecated option, try to construct the client for them
            msg = (
                "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs' "
                "is deprecated. Provide an instance of 'azure.storage.blob.ContainerClient' "
                "as 'client' instead.")
            warnings.warn(msg, FutureWarning, stacklevel=2)
            from azure.storage.blob import ContainerClient
            blob_service_kwargs = blob_service_kwargs or {}
            client = ContainerClient(
                "https://{}.blob.core.windows.net/".format(account_name),
                container,
                credential=account_key,
                **blob_service_kwargs)

        self.client = client
        self._container = container
        self._account_name = account_name
        self._account_key = account_key
Example 19
def copy_output(step_id,
                env):
    account_url = f'https://{env.scoring_datastore_storage_name}.blob.core.windows.net'
    src_blob_name = f'azureml/{step_id}/{env.scoring_datastore_storage_name}_out/parallel_run_step.txt'
    src_blob_url = f'{account_url}/{env.scoring_datastore_output_container}/{src_blob_name}'
    container_client = ContainerClient(account_url=account_url,
                                       container_name=env.scoring_datastore_output_container,
                                       credential=env.scoring_datastore_access_key)
    src_blob_properties = container_client.get_blob_client(src_blob_name).get_blob_properties()
    
    destfolder = src_blob_properties.last_modified.date().isoformat()
    file_time = (src_blob_properties.last_modified.time()).isoformat('milliseconds').replace(':','_').replace('.','_')
    filename_parts = env.scoring_datastore_output_filename.split('.')
    dest_blob_name = f'{destfolder}/{filename_parts[0]}_{file_time}.{filename_parts[1]}'
    dest_client = container_client.get_blob_client(dest_blob_name)
    dest_client.start_copy_from_url(src_blob_url)
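start_copy_from_url only initiates a server-side copy; a caller that needs the copy to finish can poll the destination blob's copy status (a sketch reusing dest_client from above):

import time

# Poll until the server-side copy completes; status is one of
# "pending", "success", "aborted", or "failed".
props = dest_client.get_blob_properties()
while props.copy.status == "pending":
    time.sleep(1)
    props = dest_client.get_blob_properties()
if props.copy.status != "success":
    raise RuntimeError("Copy failed with status: " + props.copy.status)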
Example 20
def download_file(save_path, cloud_file_name, container_name):

    blobnames = []
    try:
        with ContainerClient.from_connection_string(
                storageConnectionString, container_name) as container_client:

            # First check if the file actually exists
            blob_list = container_client.list_blobs()
            for blob in blob_list:
                if blob.name == cloud_file_name:
                    blobnames.append(blob.name)
            # Download files to specified download folder
            if len(blobnames) == 1:
                for filename in blobnames:
                    with container_client.get_blob_client(
                            filename) as blob_client:
                        with open(os.path.join(save_path, filename),
                                  "wb") as file_path:
                            file_path.write(
                                blob_client.download_blob().readall())
                print("Downloaded file: " + str(cloud_file_name))
                return os.path.join(save_path, filename), True
            else:
                print("Could not find requested blob ", str(cloud_file_name),
                      " in the following list:")
                for blob in blob_list:
                    print(blob.name)
                return " ", False
    except Exception as ex:
        print('Azure Blob Storage Exception:')
        print(ex)
        return " ", False
Example 21
def upload_blob(
    container: ContainerClient,
    blob_name: str,
    content_type: str,
    content_encoding: str,
    data: Any,
    return_sas_token: bool = True,
) -> str:
    """
    Uploads the given data to a blob record.
    If a blob with the given name already exist, it throws an error.

    Returns a uri with a SAS token to access the newly created blob.
    """
    create_container_using_client(container)
    logger.info(f"Uploading blob '{blob_name}'" +
                f"to container '{container.container_name}'" +
                f"on account: '{container.account_name}'")

    content_settings = ContentSettings(content_type=content_type,
                                       content_encoding=content_encoding)

    blob = container.get_blob_client(blob_name)

    blob.upload_blob(data, content_settings=content_settings)
    logger.debug(f"  - blob '{blob_name}' uploaded. generating sas token.")

    if return_sas_token:
        uri = get_blob_uri_with_sas_token(blob)
    else:
        uri = remove_sas_token(blob.url)
    logger.debug(f"  - blob access url: '{uri}'.")

    return uri
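create_container_using_client is defined elsewhere; a minimal sketch consistent with how it is called here (create the container, tolerating one that already exists) might be:

from azure.core.exceptions import ResourceExistsError

def create_container_using_client(container):
    # Hypothetical reconstruction: ensure the container exists before upload.
    try:
        container.create_container()
    except ResourceExistsError:
        pass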
Example 22
def ProcessAllImages():

    print('Processing Images.')

    container = ContainerClient.from_connection_string(
        connection_string, container_name=container_name_images)

    blobs_list = container.list_blobs()

    token = GetSASToken()

    for blob in blobs_list:
        blob_client = container.get_blob_client(blob.name)

        # Generate filename for image metadata file.
        fileName = GetFilePathFromImageURL(blob_client.url)
        nycImageUrl = f'{blob_client.url}?{token}'

        try:
            jsonImageMetadata = GetNYCImageMetadata(nycImageUrl)
            SaveImageMetadata(jsonImageMetadata, fileName)

            print(f'Completed processing {fileName}.')
        except Exception as e:
            print(f"Error {e} - {fileName}")
Example 24
    def __init__(
        self,
        blob_storage_conn_str,
        container_base,
        container_processed,
        container_quarantined,
    ):
        """Creates a Blob Storage object

        ...

        Parameters
        ----------
        blob_storage_conn_str: str
            Connection string for Blob Storage.
        container_base: str
            Container from which files will be read.
        container_processed: str
            Container where files are sent after being processed
            successfully.
        container_quarantined: str
            Container that stores quarantined files (processed with failure).

        """

        self.blob_storage_conn_str = blob_storage_conn_str
        self.container_base = container_base
        self.container_processed = container_processed
        self.container_quarantined = container_quarantined

        # init blob service & container connectivity
        # instantiate the blob storage class to perform operations on it
        self.blob_service_client = BlobServiceClient.from_connection_string(
            conn_str=self.blob_storage_conn_str)
        self.get_container_base_info = ContainerClient.from_connection_string(
            conn_str=self.blob_storage_conn_str,
            container_name=self.container_base)
        log.info(
            f"successfully established connection with container base: {self.container_base}"
        )
        print(
            f"successfully established connection with container base: {self.container_base}"
        )

        # get sku of the blob storage account
        account_info = self.get_container_base_info.get_account_information()
        log.info("storage sku: {}".format(account_info["sku_name"].lower()))

        # get stats of blob storage & container service info
        stats_blob_storage = self.blob_service_client.get_service_stats()
        log.info("blob storage replication status: {}".format(
            stats_blob_storage["geo_replication"]["status"]))
        log.info("last blob storage sync replication time: {}".format(
            self.utc_to_local(
                stats_blob_storage["geo_replication"]["last_sync_time"])))
        stats_container_base = self.get_container_base_info.get_container_properties(
        )
        log.info("last container modified time: {}".format(
            self.utc_to_local(stats_container_base.last_modified)))
Example 25
def find_azure_storage_blob_file_names(conn_str, container_name, prefix=''):
    """
    Fetch all the blobs in the container, returned as a list of
    BlobProperties objects.
    """
    container = ContainerClient.from_connection_string(
        conn_str=conn_str, container_name=container_name)
    return list(container.list_blobs(name_starts_with=prefix))
Example 26
def getBlobUrl(imagename, connectionString):
    try:
        container_client = ContainerClient.from_connection_string(
            conn_str=connectionString, container_name="droneimages")
        blob_client = container_client.get_blob_client(imagename)
        return blob_client.url
    except Exception as ex:
        xlog('getBlobUrl: error:', ex)
Example 27
    def __init__(self, account_name, account_key, container_name, source,
                 *args, **kwargs):
        super(ConnLocal2AzureOperator, self).__init__(*args, **kwargs)
        self.client = ContainerClient(
            account_url=f"https://{account_name}.blob.core.windows.net/",
            credential=account_key,
            container_name=container_name)
        self.source = source
Example 28
    def bucket_exists(self):
        container = ContainerClient.from_connection_string(self._account, self._bucket, connection_timeout=300)
        try:
            container.get_container_properties()
            log.debug(output_messages['DEBUG_CONTAINER_ALREADY_EXISTS'] % self._bucket, class_name=AZURE_STORAGE_NAME)
            return True
        except Exception:
            return False
Example 29
class AzureDiffPipeline(DiffPipeline):
    """Azure Blob Storage backend for comparing previously scraped JSCalendar outputs"""

    def __init__(self, crawler, output_format):
        from azure.storage.blob import ContainerClient

        feed_uri = crawler.settings.get("FEED_URI")
        account_name, account_key = feed_uri[8:].split("@")[0].split(":")
        self.spider = crawler.spider
        self.container = feed_uri.split("@")[1].split("/")[0]
        self.container_client = ContainerClient(
            "https://{}.blob.core.windows.net".format(account_name),
            self.container,
            credential=account_key,
        )
        self.feed_prefix = crawler.settings.get(
            "CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d"
        )
        super().__init__(crawler, output_format)

    def load_previous_results(self):
        max_days_previous = 3
        days_previous = 0
        tz = timezone(self.spider.timezone)
        while days_previous <= max_days_previous:
            matching_blobs = self.container_client.list_blobs(
                name_starts_with=(
                    tz.localize(datetime.now()) - timedelta(days=days_previous)
                ).strftime(self.feed_prefix)
            )
            spider_blobs = [
                blob
                for blob in matching_blobs
                if "{}.".format(self.spider.name) in blob.name
            ]
            if len(spider_blobs) > 0:
                break
            days_previous += 1

        if len(spider_blobs) == 0:
            return []

        blob = sorted(spider_blobs, key=attrgetter("name"))[-1]
        feed_blob = self.container_client.get_blob_client(blob.name)
        feed_text = feed_blob.download_blob().content_as_text()
        return [json.loads(line) for line in feed_text.split("\n") if line.strip()]
Example 30
    def test_translation_with_glossary(self, client):
        doc = Document(data=b'testing')
        source_container_sas_url = self.create_source_container(data=[doc])
        target_container_sas_url = self.create_target_container()

        container_client = ContainerClient(self.storage_endpoint, self.source_container_name,
                                           self.storage_key)
        with open(GLOSSARY_FILE_NAME, "rb") as fd:
            container_client.upload_blob(name=GLOSSARY_FILE_NAME, data=fd.read())

        prefix, suffix = source_container_sas_url.split("?")
        glossary_file_sas_url = prefix + "/" + GLOSSARY_FILE_NAME + "?" + suffix

        poller = client.begin_translation(
            source_container_sas_url,
            target_container_sas_url,
            "es",
            glossaries=[TranslationGlossary(glossary_url=glossary_file_sas_url, file_format="csv")]
        )
        result = poller.result()

        container_client = ContainerClient(self.storage_endpoint, self.target_container_name,
                                           self.storage_key)

        # download translated file and assert that translation reflects glossary changes
        document = doc.name + doc.suffix
        with open(document, "wb") as my_blob:
            download_stream = container_client.download_blob(document)
            my_blob.write(download_stream.readall())

        with open(document, "rb") as fd:
            translated = fd.readline()

        assert b'essai' in translated  # glossary worked
        os.remove(document)