예제 #1
0
def load_data():
    """Download every blob of the facts container into ``./data/raw_in``.

    The container is addressed through the module-level ``facts_container``
    and ``facts_sas_token`` values. Each blob is written to
    ``./data/raw_in/<blob name>``; missing parent directories are created
    first.

    Returns:
        int: Always ``0``.
    """

    account_url = "https://hecdf.blob.core.windows.net"

    facts_blob_service = ContainerClient(account_url=account_url,
                                         container_name=facts_container,
                                         credential=facts_sas_token)

    print('````````````````````````````````````')
    print('        Begin loding data...')
    print('````````````````````````````````````')

    # Walk the listing lazily instead of materialising it up front.
    for blob in facts_blob_service.list_blobs():
        file_name = blob.name
        print(file_name)
        download_stream = facts_blob_service.get_blob_client(
            file_name).download_blob()

        target = Path(f'./data/raw_in/{file_name}')
        target.parent.mkdir(parents=True, exist_ok=True)

        # Stream straight into the file so a large blob is never held in
        # memory as a whole.
        with open(target, "wb") as data:
            download_stream.readinto(data)

    print('````````````````````````````````````')
    print('        Finished loading data!')
    print('````````````````````````````````````')

    return 0
예제 #2
0
def get_preprocessed_file_from_azure():
    """Download the last-listed preprocessed transcript blob to ``local_path``.

    Lists the container named by
    ``cfg.azure_details['storage_preprocessed_transcripts']``, keeps the name
    of the final blob in the listing and writes that blob's content to
    ``os.path.join(local_path, <blob name>)``.

    Returns:
        str: Name of the downloaded blob (also used as the local file name).

    Raises:
        FileNotFoundError: If the container holds no blobs.
    """
    container = cfg.azure_details['storage_preprocessed_transcripts']
    service = ContainerClient(
        account_url=cfg.azure_details['account_url'],
        container_name=container,
        credential=cfg.azure_details['azure_storage_account_key'])

    # Only the final entry of the listing is wanted.
    blob_name = ''
    for blob in service.list_blobs():
        blob_name = blob.name

    if not blob_name:
        # Without this guard the code below would try to open the bare
        # ``local_path`` directory for writing.
        raise FileNotFoundError(
            "no blobs found in container {!r}".format(container))

    # The download itself goes through the connection-string client, as before.
    blob_service_client = BlobServiceClient.from_connection_string(
        cfg.azure_details['account_connection_string'])
    blob_client = blob_service_client.get_blob_client(
        container=container,
        blob=blob_name)
    download_file_path = os.path.join(local_path, blob_name)

    with open(download_file_path, "wb") as download_file:
        download_file.write(blob_client.download_blob().readall())

    return blob_name
예제 #3
0
파일: view.py 프로젝트: yasir002/demo
def home_page(request):
    """
    List the orders stored in the Azure 'new-orders' container.

    The first two dot-separated parts of every blob name are taken as the
    order number and the submission date. Orders are sorted in descending
    order and numbered from 1.

    :param request: incoming request; its ``page`` query parameter
        (default 1) is checked against a 12-per-page paginator.
    :return: ``JsonResponse`` with the full list of
        ``[rank, order_no, creation_date]`` entries.
    """
    container_client = ContainerClient(storage_url,
                                       'new-orders',
                                       credential=None)

    order_list = []
    for order in container_client.list_blobs():
        # Split once; a name without a '.' still raises IndexError.
        name_parts = order['name'].split('.')
        order_list.append([name_parts[0], name_parts[1]])

    order_list.sort(reverse=True)
    for rank, order in enumerate(order_list, start=1):
        order.insert(0, rank)

    # NOTE(review): the requested page is still resolved so an invalid
    # ``page`` value fails exactly as before, but the response carries the
    # whole list rather than that page -- confirm whether that is intended.
    paginator = Paginator(order_list, 12)
    paginator.page(request.GET.get('page', 1))

    return JsonResponse(order_list, safe=False)
예제 #4
0
파일: view.py 프로젝트: yasir002/demo
def completed_orders(request):
    """
    List the shipped orders stored in the Azure 'completed-orders' container.

    The first two dot-separated parts of every blob name are taken as the
    order number and the submission date. Orders are sorted in descending
    order and numbered from 1.

    :param request: incoming request; its ``page`` query parameter
        (default 1) is checked against a 6-per-page paginator.
    :return: ``JsonResponse`` with the full list of
        ``[rank, order_no, creation_date]`` entries.
    """
    container_client = ContainerClient(storage_url,
                                       'completed-orders',
                                       credential=None)

    order_list = []
    for order in container_client.list_blobs():
        # Split once; a name without a '.' still raises IndexError.
        name_parts = order['name'].split('.')
        order_list.append([name_parts[0], name_parts[1]])

    order_list.sort(reverse=True)
    for rank, order in enumerate(order_list, start=1):
        order.insert(0, rank)

    # NOTE(review): the requested page is still resolved so an invalid
    # ``page`` value fails exactly as before, but the response carries the
    # whole list rather than that page -- confirm whether that is intended.
    page = request.GET.get('page', 1)
    paginator = Paginator(order_list, 6)
    paginator.page(page)

    return JsonResponse(order_list, safe=False)
예제 #5
0
def list_blobs(container_client=None):
    """Print the name of every blob in a container, one per line.

    Args:
        container_client: Object exposing ``list_blobs()`` whose items have a
            ``name`` attribute. When omitted, a ``ContainerClient()`` is
            built with no arguments, exactly as the function did before this
            parameter existed.
    """
    print("\nListing blobs...")
    if container_client is None:
        # NOTE(review): azure-storage-blob's ContainerClient normally needs
        # an account URL and a container name -- confirm that the
        # ContainerClient in scope here can be built without arguments.
        container_client = ContainerClient()
    # List the blobs in the container
    for blob in container_client.list_blobs():
        print("\t" + blob.name)
예제 #6
0
    def get_all_blobs_by_blob_container_name(self, storage_account_name, account_key, container_name):
        """Return the blob listing of one container.

        Builds a ``ContainerClient`` for
        ``<storage_account_name>.blob.core.windows.net`` authenticated with
        ``account_key`` and returns whatever its ``list_blobs()`` yields.
        """
        endpoint = f'{storage_account_name}.blob.core.windows.net'
        client = ContainerClient(account_url=endpoint,
                                 container_name=container_name,
                                 credential=account_key)
        return client.list_blobs()
예제 #7
0
    def combine_azure(self):
        """Merge the most recent per-spider feeds into two combined blobs.

        Looks for blobs under today's date prefix, falling back one day at a
        time for up to three previous days. The feeds selected by
        ``get_spider_paths`` are parsed as newline-delimited JSON, sorted by
        ``self.start_key`` and uploaded as ``latest.json`` (everything) and
        ``upcoming.json`` (entries whose start is later than this time
        yesterday).
        """
        from azure.storage.blob import ContainerClient, ContentSettings

        feed_uri = self.settings.get("FEED_URI")
        feed_prefix = self.settings.get("CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d")
        # The first eight characters of the URI are skipped before the
        # "<account>:<key>@<container>/..." part is parsed.
        account_name, account_key = feed_uri[8:].split("@")[0].split(":")
        container = feed_uri.split("@")[1].split("/")[0]
        container_client = ContainerClient(
            "{}.blob.core.windows.net".format(account_name),
            container,
            credential=account_key,
        )

        max_days_previous = 3
        prefix_blobs = []
        for days_previous in range(max_days_previous + 1):
            day_prefix = (
                datetime.now() - timedelta(days=days_previous)
            ).strftime(feed_prefix)
            prefix_blobs = list(
                container_client.list_blobs(name_starts_with=day_prefix)
            )
            if prefix_blobs:
                break

        spider_blob_names = self.get_spider_paths([blob.name for blob in prefix_blobs])
        meetings = []
        for blob_name in spider_blob_names:
            downloader = container_client.get_blob_client(blob_name).download_blob()
            for line in downloader.content_as_text().split("\n"):
                if line:
                    meetings.append(json.loads(line))
        meetings.sort(key=itemgetter(self.start_key))

        yesterday_iso = (datetime.now() - timedelta(days=1)).isoformat()[:19]
        upcoming = [
            meeting
            for meeting in meetings
            if meeting[self.start_key][:19] > yesterday_iso
        ]

        for target_name, items in (
            ("latest.json", meetings),
            ("upcoming.json", upcoming),
        ):
            container_client.upload_blob(
                target_name,
                "\n".join([json.dumps(item) for item in items]),
                content_settings=ContentSettings(cache_control="no-cache"),
                overwrite=True,
            )
예제 #8
0
def find_blobs(account_url: str, container_name: str, credential: str,
               prefix: str, suffix: str):
    """Return the names of blobs that start with ``prefix`` and end with ``suffix``.

    The prefix is handed to ``list_blobs`` as ``name_starts_with``; the
    suffix is checked locally on each returned name.
    """
    client = ContainerClient(account_url=account_url,
                             container_name=container_name,
                             credential=credential)
    names = (record['name']
             for record in client.list_blobs(name_starts_with=prefix))
    return [name for name in names if name.endswith(suffix)]
def has_blob(container_client: ContainerClient, blob_name: str) -> bool:
    """Return True if a blob named exactly ``blob_name`` exists in the container.

    The listing is restricted to names starting with ``blob_name`` so only
    candidate blobs are requested, and the scan stops at the first exact
    match instead of loading and logging the whole container.
    """
    candidates = container_client.list_blobs(name_starts_with=blob_name)
    found = any(blob['name'] == blob_name for blob in candidates)
    logging.info('has_blob(%r) -> %s', blob_name, found)
    return found
예제 #10
0
class DeleteFilesAzure2AzureOperator(BaseOperator):
    """Operator that deletes every blob in one Azure storage container."""

    @apply_defaults
    def __init__(self, account_name, account_key, container, *args, **kwargs):
        """Create the container client used by ``execute``.

        ``account_name`` is expanded to
        ``https://<account_name>.blob.core.windows.net/``; remaining
        positional and keyword arguments go to the base operator.
        """
        super().__init__(*args, **kwargs)
        endpoint = f"https://{account_name}.blob.core.windows.net/"
        self.client = ContainerClient(account_url=endpoint,
                                      credential=account_key,
                                      container_name=container)

    def execute(self, context):
        """Delete each blob returned by the container listing, one at a time."""
        for listed_blob in self.client.list_blobs():
            self.client.get_blob_client(listed_blob).delete_blob()
예제 #11
0
파일: ops.py 프로젝트: plang85/azblob
def listblobsapi(container, accountname, accountkey=None, nmax=None):
    """Return name and creation time of up to ``nmax`` blobs in a container.

    Args:
        container: Name of the container to list.
        accountname: Passed to ``ContainerClient`` as the account URL.
        accountkey: Optional credential for the account.
        nmax: Maximum number of blobs to return; ``None`` means no limit and
            values below zero yield an empty list.

    Returns:
        list[dict]: One ``{"name": ..., "date": ...}`` entry per blob, in
        listing order.
    """
    from itertools import islice

    block_blob_service = ContainerClient(accountname,
                                         container,
                                         credential=accountkey)
    logger.info("listing blobs in '{}/{}'".format(accountname, container))

    # islice stops consuming the listing once the limit is reached, so no
    # further blobs are requested after ``nmax`` results.
    limit = None if nmax is None else max(nmax, 0)
    blobs = islice(block_blob_service.list_blobs(), limit)
    # TODO use namedtuple
    return [{
        "name": blob.name,
        "date": blob.creation_time
    } for blob in blobs]
예제 #12
0
class AzureDiffPipeline(DiffPipeline):
    """Azure Blob Storage backend for comparing previously scraped JSCalendar outputs"""

    def __init__(self, crawler, output_format):
        """Parse the ``FEED_URI`` setting and open the matching container client."""
        from azure.storage.blob import ContainerClient

        feed_uri = crawler.settings.get("FEED_URI")
        # The first eight characters of the URI are skipped before the
        # "<account>:<key>@<container>/..." part is parsed.
        account_name, account_key = feed_uri[8:].split("@")[0].split(":")
        self.spider = crawler.spider
        self.container = feed_uri.split("@")[1].split("/")[0]
        self.container_client = ContainerClient(
            f"{account_name}.blob.core.windows.net",
            self.container,
            credential=account_key,
        )
        self.feed_prefix = crawler.settings.get(
            "CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d"
        )
        super().__init__(crawler, output_format)

    def load_previous_results(self):
        """Return the parsed lines of this spider's most recent feed blob.

        Searches the date prefix for today and then up to three previous
        days, stopping at the first day that has a blob whose name contains
        ``"<spider name>."``. Returns ``[]`` when none of those days has one.
        """
        max_days_previous = 3
        tz = timezone(self.spider.timezone)
        spider_blobs = []
        for days_previous in range(max_days_previous + 1):
            day_prefix = (
                tz.localize(datetime.now()) - timedelta(days=days_previous)
            ).strftime(self.feed_prefix)
            spider_blobs = [
                candidate
                for candidate in self.container_client.list_blobs(
                    name_starts_with=day_prefix
                )
                if "{}.".format(self.spider.name) in candidate.name
            ]
            if spider_blobs:
                break

        if not spider_blobs:
            return []

        # The blob that sorts last by name is used.
        newest = sorted(spider_blobs, key=attrgetter("name"))[-1]
        downloader = self.container_client.get_blob_client(newest.name).download_blob()
        feed_text = downloader.content_as_text()
        return [json.loads(line) for line in feed_text.split("\n") if line.strip()]
예제 #13
0
def _azure_get_configs(layer: "Layer") -> List[str]:
    """Return the config names stored under ``opta_config/``, except this layer's.

    The storage account and container come from the layer's generated
    ``terraform.backend.azurerm`` provider block. Each blob name has the
    ``opta_config/`` prefix stripped, and ``layer.name`` is removed from the
    result when present.
    """
    providers = layer.gen_providers(0)
    credentials = Azure.get_credentials()

    backend = providers["terraform"]["backend"]["azurerm"]
    account_url = f"https://{backend['storage_account_name']}.blob.core.windows.net"
    storage_client = ContainerClient(
        account_url=account_url,
        container_name=backend["container_name"],
        credential=credentials,
    )

    prefix = "opta_config/"
    configs = []
    for blob in storage_client.list_blobs(name_starts_with=prefix):
        configs.append(blob.name[len(prefix):])

    try:
        configs.remove(layer.name)
    except ValueError:
        # The layer's own config is not stored; nothing to drop.
        pass
    return configs
예제 #14
0
파일: view.py 프로젝트: yasir002/demo
def deleted_orders(request):
    """
    Render the deleted orders stored in the Azure 'deleted-orders' container.

    The first two dot-separated parts of every blob name are taken as the
    order number and the submission date. Orders are sorted in descending
    order, numbered from 1 and paginated four per page.

    :param request: incoming request; its ``page`` query parameter
        (default 1) selects the page to render.
    :return: the rendered ``deleted_orders.html`` response.
    """
    container_client = ContainerClient(storage_url,
                                       'deleted-orders',
                                       credential=None)

    order_list = []
    for order in container_client.list_blobs():
        # Split once; a name without a '.' still raises IndexError.
        name_parts = order['name'].split('.')
        order_list.append([name_parts[0], name_parts[1]])

    order_list.sort(reverse=True)
    for rank, order in enumerate(order_list, start=1):
        order.insert(0, rank)

    page = request.GET.get('page', 1)
    paginator = Paginator(order_list, 4)
    orders = paginator.page(page)
    context = {
        'deleted_orders': orders,
        'redirect_url': 'deleted_orders',
    }
    # The former debug print of request.COOKIES was dropped: it wrote every
    # cookie of the request to stdout.
    return render(request, 'deleted_orders.html', context)
예제 #15
0
def get_blob_manifest(container_client: ContainerClient) -> list:
    """Return the names of all blobs in the container, in listing order."""
    # Map straight over the listing; no intermediate copy is needed.
    return [blob.name for blob in container_client.list_blobs()]
예제 #16
0
def list_blobs(
    container_client: ContainerClient, name_starts_with: Optional[str] = None
) -> List[str]:
    """Collect the ``"name"`` entry of each blob the client lists.

    ``name_starts_with`` is forwarded unchanged as the first positional
    argument of ``container_client.list_blobs``.
    """
    names = []
    for listed in container_client.list_blobs(name_starts_with):
        names.append(listed["name"])
    return names
예제 #17
0
def _quote_identifier(name):
    """Return ``name`` as a bracket-quoted T-SQL identifier."""
    return '[' + name.replace(']', ']]') + ']'


def _table_name_for_blob(blob_name):
    """Derive the table name for an ``.xlsx`` blob.

    Drops any virtual-folder part and the ``.xlsx`` extension, turns spaces
    into underscores and removes ``/`` and ``-``.
    """
    table_name = re.sub(r'^.*/', '', blob_name)
    table_name = re.sub(r'\.xlsx$', '', table_name)
    table_name = table_name.replace(' ', '_')
    return re.sub(r'[/-]', '', table_name)


def _clean_cell(value):
    """Render a cell as text without the characters ``( ) , ' -``, CR and LF."""
    # NOTE(review): this stripping is kept so stored values stay as they
    # were, but it also removes minus signs and date dashes -- confirm
    # whether it is still wanted now that values are bound as parameters.
    return re.sub(r"[()\r\n,'\-]", '', str(value))


def _replace_table(cursor, table_name, data):
    """Drop, recreate and refill ``dbo.<table_name>`` from a DataFrame.

    Every column is created as ``VARCHAR(1000)``. Column names are the
    sheet headers with spaces, ``/`` and ``-`` turned into underscores and
    ``( ) € ' . ,`` removed.
    """
    table = 'dbo.' + _quote_identifier(table_name)

    columns = []
    for header in data.columns.values:
        column = re.sub(r'[ /-]', '_', str(header))
        column = re.sub(r"[()€'.,]", '', column)
        columns.append(_quote_identifier(column))

    # If table exists: remove and rewrite
    cursor.execute('DROP TABLE IF EXISTS ' + table)

    column_defs = ',\n'.join(column + ' VARCHAR(1000)' for column in columns)
    cursor.execute('CREATE TABLE ' + table + ' (\n' + column_defs + '\n);')

    rows = [[_clean_cell(cell) for cell in row]
            for row in data.values.tolist()]
    if rows:
        # Values are bound as parameters, never spliced into the SQL text.
        placeholders = ', '.join('?' * len(columns))
        insert_sql = ('INSERT INTO ' + table + ' (' + ', '.join(columns)
                      + ') VALUES (' + placeholders + ')')
        cursor.executemany(insert_sql, rows)


def main(myblob: func.InputStream):
    """Copy every ``.xlsx`` blob of the container into an Azure SQL table.

    Each workbook is downloaded to the working directory under its base
    name, read with pandas, and written to a freshly recreated ``dbo``
    table named after the blob. The transaction is committed once per
    workbook. Any exception is logged and swallowed.
    """
    try:

        logging.info('Python blob trigger function processed a request.')

        account_name = "###" # confidential
        account_key = "###" # confidential
        top_level_container_name = "###" # confidential
        blob_service = ContainerClient(account_url=account_name, container_name=top_level_container_name, credential=account_key)

        # Make connection with Azure SQL database

        server = 'datatrust-ff.database.windows.net'
        database = 'DataTrust'
        username = '******'
        password = '******'
        cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+ password)
        try:
            cursor = cnxn.cursor()
            cursor.execute("SELECT @@version;")
            for row in cursor.fetchall():
                print(row[0])

            # Download xlsx-files from Azure blob storage

            logging.info("\nList blobs in the container")
            for blob in blob_service.list_blobs():
                if not blob.name.endswith('.xlsx'):
                    continue
                logging.info("\t Blob name: " + blob.name)
                file_name = re.sub('.*/', '', blob.name)

                # Close the download file before it is read back, so the
                # parser never sees a partially flushed workbook.
                with open(file_name, 'wb') as xlsx_file:
                    blob_service.download_blob(blob).readinto(xlsx_file)
                data = pd.read_excel(file_name)

                _replace_table(cursor, _table_name_for_blob(blob.name), data)
                cnxn.commit()
        finally:
            # Closing also discards work left uncommitted by a failure.
            cnxn.close()

    except Exception as e:
        logging.exception(e)
예제 #18
0
def get_blob_list_information(container_client: ContainerClient) -> list:
    """Return one ``(name, blob_tier, size)`` tuple per blob, in listing order."""
    # Map straight over the listing; no intermediate copy is needed.
    return [(blob.name, blob.blob_tier, blob.size)
            for blob in container_client.list_blobs()]
예제 #19
0
    )

# Fetch the account information through the service client created above
# this fragment (its construction is not visible here).
account_info = blob_service_client.get_account_information()

# Container client for the container named in the loaded config.
container_name = config['container_name']
container_client = ContainerClient(
    account_url=storage_url,
    container_name=container_name,
    credential=storage_key
    )



# Debug aid: print the attributes available on the container client.
print(dir(container_client))
#print(dir(blob_client))
# Print the name of every blob in the container.
blob_list = container_client.list_blobs()
for blob in blob_list:
    print("\t" + blob.name)

# Client for the positions CSV blob.
# NOTE(review): this uses ``storage_name`` as the container while the
# listing above uses ``container_name`` -- confirm both refer to the same
# container.
blob_client_positions = BlobClient(
    account_url=storage_url, 
    container_name=storage_name, 
    blob_name="104人力銀行_SAP_positions.csv", 
    credential=storage_key
    )

blob_client_companies = BlobClient(
    account_url=storage_url, 
    container_name=storage_name, 
    blob_name="104人力銀行_SAP_companies.csv", 
    credential=storage_key
예제 #20
0
def check_subscription(tenant_id, tenant_name, sub_id, sub_name, creds):
    """Scan one subscription's storage accounts and write a CSV report.

    Collects the storage accounts of every resource group and reads each
    account's ``key1``. For every container returned by
    ``check_storage_account`` it lists the blobs and counts their
    extensions. One row per container is written to
    ``public-containers-<today>.csv``.

    If the account keys cannot be read (``HttpResponseError``) the function
    prints a notice and returns early without writing the CSV.
    """
    print("\n\t[*] Checking subscription {}:".format(sub_name), flush=True)

    storage_client = StorageManagementClient(creds, sub_id)

    # Management object for resources, used to enumerate the resource groups
    resource_client = ResourceManagementClient(creds, sub_id)

    resource_groups = [
        rg.name for rg in resource_client.resource_groups.list()
    ]
    print("\t\t[+] Found {} resource groups".format(len(resource_groups)),
          flush=True)

    # resource group -> {storage account name -> key (filled in below)}
    group_to_names_dict = {group: dict() for group in resource_groups}

    accounts_counter = 0
    for group in resource_groups:
        listed_accounts = storage_client.storage_accounts.list_by_resource_group(
            group)
        for item in listed_accounts:
            accounts_counter += 1
            group_to_names_dict[group][item.name] = ''

    print("\t\t[+] Found {} storage accounts".format(accounts_counter),
          flush=True)

    for group in resource_groups:
        accounts = group_to_names_dict[group]
        for account in accounts:
            try:
                listed_keys = storage_client.storage_accounts.list_keys(
                    group, account)
                keys_by_name = {
                    entry.key_name: entry.value for entry in listed_keys.keys
                }
                accounts[account] = keys_by_name['key1']

            except azure.core.exceptions.HttpResponseError:
                print(
                    "\t\t[-] User do not have permissions to retrieve storage accounts keys in the given"
                    " subscription",
                    flush=True)
                print("\t\t    Can not scan storage accounts", flush=True)
                return

    output_list = []

    for group in resource_groups:
        for account, key in group_to_names_dict[group].items():
            for cont in check_storage_account(account, key):
                access_level = cont.public_access
                container_client = ContainerClient(
                    ENDPOINT_URL.format(account), cont.name, credential=key)
                files = [f.name for f in container_client.list_blobs()]
                ext_dict = count_files_extensions(files, EXTENSIONS)

                row = [
                    tenant_id, tenant_name, sub_id, sub_name, group, account,
                    cont.name, access_level,
                    CONTAINER_URL.format(account, cont.name),
                    len(files)
                ]
                # One extra column per extension counter, in the counter's
                # own key order.
                row.extend(ext_dict[ext] for ext in ext_dict.keys())

                output_list.append(row)

    print("\t\t[+] Scanned all storage accounts successfully", flush=True)

    if output_list:
        print("\t\t[+] Found {} PUBLIC containers".format(len(output_list)),
              flush=True)
    else:
        print("\t\t[+] No PUBLIC containers found")

    header = [
        "Tenant ID", "Tenant Name", "Subscription ID", "Subscription Name",
        "Resource Group", "Storage Account", "Container",
        "Public Access Level", "URL", "Total Files"
    ]
    header.extend(EXTENSIONS)
    header.append("others")

    write_csv('public-containers-{}.csv'.format(date.today()), header,
              output_list)
예제 #21
0
class TartanAir(object):
    """Block that streams one TartanAir trajectory from Azure blob storage.

    ``on_start`` picks an environment, difficulty and trajectory from the
    block properties and lists the matching blobs; ``run`` then publishes
    the frames in an endless loop at the configured rate.

    ``get_property``, ``alert`` and ``publish`` are not defined in this
    class; presumably the hosting framework provides them.
    """

    def on_start(self):
        """This function is called once the Block is started.

        Opens the ``tartanair-release1`` container without credentials,
        resolves the trajectory selected by the ``diff_level``, ``env_ind``
        and ``traj_id`` properties, and loads its file listings and poses.
        """
        account_url = 'https://tartanair.blob.core.windows.net/'
        container_name = 'tartanair-release1'
        self.container_client = ContainerClient(account_url=account_url,
                                                container_name=container_name,
                                                credential=None)
        self.envlist = [
            'abandonedfactory/', 'abandonedfactory_night/', 'amusement/',
            'carwelding/', 'endofworld/', 'gascola/', 'hospital/',
            'japanesealley/', 'neighborhood/', 'ocean/', 'office/', 'office2/',
            'oldtown/', 'seasidetown/', 'seasonsforest/',
            'seasonsforest_winter/', 'soulcity/', 'westerndesert/'
        ]
        self.diff_level = ["Easy",
                           "Hard"][int(self.get_property("diff_level"))]
        self.env_ind = self.get_property("env_ind")
        self.trajlist = self.get_trajectory_list(self.envlist[self.env_ind],
                                                 easy_hard=self.diff_level)
        self.trajs_len = len(self.trajlist)
        self.traj_id = self.get_property("traj_id")
        self.alert(
            "Selected Environment: {}".format(self.envlist[self.env_ind]),
            "INFO")
        self.alert("Difficulty Level: {}".format(self.diff_level), "INFO")
        self.alert(
            "Number of available trajectories: {}".format(self.trajs_len),
            "INFO")
        if (self.traj_id >= self.trajs_len):
            # NOTE(review): execution continues after this alert, so the
            # indexing below still raises IndexError for such an id.
            self.alert(
                "Trajectory id out of range[0, {}]".format(self.trajs_len - 1),
                "ERROR")
        self.frequency = self.get_property("fps")
        self.traj_dir = self.trajlist[self.traj_id]

        self._load_file_lists(self.traj_dir)

        # Load poses
        self.pose_l = self._load_pose(self.left_pose_file, "OutputL.txt")
        self.pose_r = self._load_pose(self.right_pose_file, "OutputR.txt")

    def _load_file_lists(self, traj_dir):
        """Populate the per-modality blob lists and pose paths for ``traj_dir``."""
        # Load Images List
        self.left_img_list = self.get_image_list(traj_dir, left_right='left')
        print('Find {} left images in {}'.format(len(self.left_img_list),
                                                 traj_dir))
        self.right_img_list = self.get_image_list(traj_dir,
                                                  left_right='right')
        self.left_depth_list = self.get_depth_list(traj_dir,
                                                   left_right='left')
        self.right_depth_list = self.get_depth_list(traj_dir,
                                                    left_right='right')
        self.left_seg_list = self.get_seg_list(traj_dir, left_right='left')
        # Previously requested with left_right='left', which made the right
        # segmentation stream a copy of the left one.
        self.right_seg_list = self.get_seg_list(traj_dir, left_right='right')
        self.flow_list = self.get_flow_list(traj_dir)
        self.flow_mask_list = self.get_flow_mask_list(traj_dir)
        self.left_pose_file = self.get_posefile(traj_dir, left_right='left')
        self.right_pose_file = self.get_posefile(traj_dir,
                                                 left_right='right')

    def _load_pose(self, pose_blob, local_path):
        """Download a pose text blob to ``local_path`` and parse it with numpy."""
        downloader = self.container_client.get_blob_client(
            blob=pose_blob).download_blob()
        # The context manager closes the file before numpy reads it back.
        with open(local_path, "w") as text_file:
            text_file.write(downloader.content_as_text())
        return np.loadtxt(local_path)

    def _publish_array(self, topic, array, header):
        """Publish ``array`` on ``topic`` with ``header.frame_id`` set to the topic."""
        header.frame_id = topic
        self.publish(topic, from_ndarray(array, header))

    def _publish_pose(self, topic, frame_id, pose_row, header):
        """Publish one pose row (x, y, z, then quaternion x, y, z, w) on ``topic``."""
        pose_stamped = PoseStamped()
        pose_stamped.header = header
        pose_stamped.header.frame_id = frame_id
        pose = Pose()
        pose.position.x = pose_row[0]
        pose.position.y = pose_row[1]
        pose.position.z = pose_row[2]
        pose.orientation.x = pose_row[3]
        pose.orientation.y = pose_row[4]
        pose.orientation.z = pose_row[5]
        pose.orientation.w = pose_row[6]
        pose_stamped.pose = pose
        self.publish(topic, pose_stamped)

    def run(self):
        """Publish the trajectory frame by frame, looping forever.

        A frame is emitted whenever at least ``1 / fps`` seconds have passed
        since the previous one. After the last left image the index wraps
        back to the first frame. Optical flow is published from the second
        frame on, using the flow file at ``idx - 1``.
        """
        ltime = time.time()
        idx = 0
        while True:
            if time.time() - ltime < 1 / self.frequency:
                continue
            if (idx == len(self.left_img_list)):
                idx = 0
            # RGB Images
            left_img = self.read_image_file(self.left_img_list[idx])
            right_img = self.read_image_file(self.right_img_list[idx])
            header = Header()
            set_timestamp(header, time.time())
            self._publish_array("left_img", left_img, header)
            self._publish_array("right_img", right_img, header)
            # Depth Images
            left_depth = self.read_numpy_file(self.left_depth_list[idx])
            self._publish_array("left_depth", depth2vis(left_depth), header)
            right_depth = self.read_numpy_file(self.right_depth_list[idx])
            self._publish_array("right_depth", depth2vis(right_depth), header)
            # Semantic Segmentation
            left_seg = self.read_numpy_file(self.left_seg_list[idx])
            self._publish_array("left_segmentation", seg2vis(left_seg),
                                header)
            right_seg = self.read_numpy_file(self.right_seg_list[idx])
            self._publish_array("right_segmentation", seg2vis(right_seg),
                                header)
            # Left Camera Pose
            self._publish_pose("left_pose", "left_camera", self.pose_l[idx],
                               header)
            # Right Camera Pose
            self._publish_pose("right_pose", "right_camera", self.pose_r[idx],
                               header)

            if (idx > 0):
                flow = self.read_numpy_file(self.flow_list[idx - 1])
                self._publish_array("optical_flow", flow2vis(flow), header)
                flow_mask = self.read_numpy_file(self.flow_mask_list[idx - 1])
                self._publish_array("optical_flow_mask",
                                    flow2vis(flow, mask=flow_mask), header)

            ltime = time.time()
            idx += 1

    def on_properties_changed(self, affected_properties):
        """Re-run ``on_start`` whenever any property changes."""
        self.on_start()

    def get_environment_list(self):
        '''
        List all the environments shown in the root directory
        '''
        return [env.name for env in self.container_client.walk_blobs()]

    def get_trajectory_list(self, envname, easy_hard='Easy'):
        '''
        List all the trajectory folders, which is named as 'P0XX'
        '''
        assert (easy_hard == 'Easy' or easy_hard == 'Hard')
        traj_gen = self.container_client.walk_blobs(name_starts_with=envname +
                                                    '/' + easy_hard + '/')
        trajlist = []
        for traj in traj_gen:
            trajname = traj.name
            # Keep entries whose last non-empty path component starts with 'P'.
            components = [tt for tt in trajname.split('/') if len(tt) > 0]
            if components[-1][0] == 'P':
                trajlist.append(trajname)
        return trajlist

    def _list_blobs_in_folder(self, folder_name):
        """
        List all blobs in a virtual folder in an Azure blob container
        """
        generator = self.container_client.list_blobs(
            name_starts_with=folder_name)
        return [blob.name for blob in generator]

    def get_image_list(self, trajdir, left_right='left'):
        """Return the ``.png`` blobs under ``<trajdir>/image_<left_right>/``."""
        assert (left_right == 'left' or left_right == 'right')
        files = self._list_blobs_in_folder(trajdir + '/image_' + left_right +
                                           '/')
        return [fn for fn in files if fn.endswith('.png')]

    def get_depth_list(self, trajdir, left_right='left'):
        """Return the ``.npy`` blobs under ``<trajdir>/depth_<left_right>/``."""
        assert (left_right == 'left' or left_right == 'right')
        files = self._list_blobs_in_folder(trajdir + '/depth_' + left_right +
                                           '/')
        return [fn for fn in files if fn.endswith('.npy')]

    def get_flow_list(
        self,
        trajdir,
    ):
        """Return the blobs under ``<trajdir>/flow/`` ending in ``flow.npy``."""
        files = self._list_blobs_in_folder(trajdir + '/flow/')
        return [fn for fn in files if fn.endswith('flow.npy')]

    def get_flow_mask_list(
        self,
        trajdir,
    ):
        """Return the blobs under ``<trajdir>/flow/`` ending in ``mask.npy``."""
        files = self._list_blobs_in_folder(trajdir + '/flow/')
        return [fn for fn in files if fn.endswith('mask.npy')]

    def get_posefile(self, trajdir, left_right='left'):
        """Return the blob path ``<trajdir>/pose_<left_right>.txt``."""
        assert (left_right == 'left' or left_right == 'right')
        return trajdir + '/pose_' + left_right + '.txt'

    def get_seg_list(self, trajdir, left_right='left'):
        """Return the ``.npy`` blobs under ``<trajdir>/seg_<left_right>/``."""
        assert (left_right == 'left' or left_right == 'right')
        files = self._list_blobs_in_folder(trajdir + '/seg_' + left_right +
                                           '/')
        return [fn for fn in files if fn.endswith('.npy')]

    def read_numpy_file(
        self,
        numpy_file,
    ):
        '''
        return a numpy array given the file path
        '''
        bc = self.container_client.get_blob_client(blob=numpy_file)
        data = bc.download_blob()
        return np.load(io.BytesIO(data.content_as_bytes()))

    def read_image_file(
        self,
        image_file,
    ):
        '''
        return a uint8 numpy array given the file path
        '''
        bc = self.container_client.get_blob_client(blob=image_file)
        data = bc.download_blob()
        encoded = np.frombuffer(data.content_as_bytes(), dtype=np.uint8)
        # Decoded with IMREAD_COLOR; the channel order is left untouched.
        return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
예제 #22
0
class AzureCloudInterface(CloudInterface):
    # Azure block blob limitations
    # https://docs.microsoft.com/en-us/rest/api/storageservices/understanding-block-blobs--append-blobs--and-page-blobs
    MAX_CHUNKS_PER_FILE = 50000
    # Minimum block size allowed in Azure Blob Storage is 64KB
    MIN_CHUNK_SIZE = 64 << 10

    # Azure Blob Storage permits a maximum of 4.75TB per file
    # This is a hard limit, while our upload procedure can go over the specified
    # MAX_ARCHIVE_SIZE - so we set a maximum of 1TB per file
    MAX_ARCHIVE_SIZE = 1 << 40

    # The size of each chunk in a single object upload when the size of the
    # object exceeds max_single_put_size. We default to 2MB in order to
    # allow the default max_concurrency of 8 to be achieved when uploading
    # uncompressed WAL segments of the default 16MB size.
    DEFAULT_MAX_BLOCK_SIZE = 2 << 20

    # The maximum amount of concurrent chunks allowed in a single object upload
    # where the size exceeds max_single_put_size. We default to 8 based on
    # experiments with in-region and inter-region transfers within Azure.
    DEFAULT_MAX_CONCURRENCY = 8

    # The largest file size which will be uploaded in a single PUT request. This
    # should be lower than the size of the compressed WAL segment in order to
    # force the Azure client to use concurrent chunk upload for archiving WAL files.
    DEFAULT_MAX_SINGLE_PUT_SIZE = 4 << 20

    # The maximum size of the requests connection pool used by the Azure client
    # to upload objects.
    REQUESTS_POOL_MAXSIZE = 32

    def __init__(
        self,
        url,
        jobs=2,
        encryption_scope=None,
        credential=None,
        tags=None,
        max_block_size=DEFAULT_MAX_BLOCK_SIZE,
        max_concurrency=DEFAULT_MAX_CONCURRENCY,
        max_single_put_size=DEFAULT_MAX_SINGLE_PUT_SIZE,
    ):
        """
        Create a new Azure Blob Storage interface given the supplied account url

        :param str url: Full URL of the cloud destination/source
        :param int jobs: How many sub-processes to use for asynchronous
          uploading, defaults to 2.
        :param str|None encryption_scope: If set, passed as the
          ``encryption_scope`` argument of every upload request
        :param credential: Optional credential handed to the Azure client; when
          supplied it takes precedence over the environment
        :param tags: Tags forwarded to the parent class and applied to
          uploaded objects
        :param int max_block_size: Passed to the Azure client as
          ``max_block_size``
        :param int max_concurrency: Passed as ``max_concurrency`` when uploading
          a file-like object
        :param int max_single_put_size: Passed to the Azure client as
          ``max_single_put_size``
        :raises ValueError: if the URL does not include a container name, or if
          an emulated storage URL is used without a connection string in the
          environment
        """
        super(AzureCloudInterface, self).__init__(
            url=url,
            jobs=jobs,
            tags=tags,
        )
        self.encryption_scope = encryption_scope
        self.credential = credential
        self.max_block_size = max_block_size
        self.max_concurrency = max_concurrency
        self.max_single_put_size = max_single_put_size

        parsed_url = urlparse(url)
        # The path starts with "/" so the first element of the split is empty
        path_parts = parsed_url.path.split("/")
        if parsed_url.netloc.endswith(AZURE_BLOB_STORAGE_DOMAIN):
            # We have an Azure Storage URI so we use the following form:
            # <http|https>://<account-name>.<service-name>.core.windows.net/<resource-path>
            # where <resource-path> is <container>/<blob>.
            # Note that although Azure supports an implicit root container, we require
            # that the container is always included.
            self.account_url = parsed_url.netloc
            container_index = 1
            malformed_message = "azure blob storage URL %s is malformed"
        else:
            # We are dealing with emulated storage so we use the following form:
            # http://<local-machine-address>:<port>/<account-name>/<resource-path>
            logging.info("Using emulated storage URL: %s " % url)
            if "AZURE_STORAGE_CONNECTION_STRING" not in os.environ:
                raise ValueError(
                    "A connection string must be provided when using emulated storage"
                )
            # NOTE(review): account_url is not set in this branch, so a supplied
            # `credential` (which bypasses the connection string) cannot work
            # with emulated storage - confirm whether that should be supported.
            container_index = 2
            malformed_message = "emulated storage URL %s is malformed"

        # A missing segment and an empty one (e.g. a URL ending in "/") both
        # mean that no container was specified.
        container = path_parts[container_index:container_index + 1]
        if not container or not container[0]:
            raise ValueError(malformed_message % url)
        self.bucket_name = container[0]
        self.path = "/".join(path_parts[container_index + 1:])

        self.bucket_exists = None
        self._reinit_session()

    def _reinit_session(self):
        """
        Create a new session

        The credential is resolved in this order: the credential supplied to
        the constructor, AZURE_STORAGE_CONNECTION_STRING, AZURE_STORAGE_SAS_TOKEN,
        AZURE_STORAGE_KEY and finally azure-identity's DefaultAzureCredential.
        The resulting client is stored in ``self.container_client``.
        """
        environ = os.environ
        if self.credential:
            # Any supplied credential takes precedence over the environment
            credential = self.credential
        elif "AZURE_STORAGE_CONNECTION_STRING" in environ:
            # The connection string carries everything the client needs so
            # the client is built directly from it
            logging.info("Authenticating to Azure with connection string")
            self.container_client = ContainerClient.from_connection_string(
                conn_str=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
                container_name=self.bucket_name,
            )
            return
        elif "AZURE_STORAGE_SAS_TOKEN" in environ:
            logging.info("Authenticating to Azure with SAS token")
            credential = os.getenv("AZURE_STORAGE_SAS_TOKEN")
        elif "AZURE_STORAGE_KEY" in environ:
            logging.info("Authenticating to Azure with shared key")
            credential = os.getenv("AZURE_STORAGE_KEY")
        else:
            logging.info("Authenticating to Azure with default credentials")
            # azure-identity is not part of azure-storage-blob so only import
            # it if needed
            try:
                from azure.identity import DefaultAzureCredential
            except ImportError:
                raise SystemExit(
                    "Missing required python module: azure-identity")
            credential = DefaultAzureCredential()

        # Use a requests session whose https connection pool is sized by
        # REQUESTS_POOL_MAXSIZE
        pooled_session = requests.Session()
        pooled_session.mount(
            "https://",
            requests.adapters.HTTPAdapter(
                pool_maxsize=self.REQUESTS_POOL_MAXSIZE),
        )
        self.container_client = ContainerClient(
            account_url=self.account_url,
            container_name=self.bucket_name,
            credential=credential,
            max_single_put_size=self.max_single_put_size,
            max_block_size=self.max_block_size,
            session=pooled_session,
        )

    @property
    def _extra_upload_args(self):
        """
        Optional keyword arguments to add to upload requests

        :return: A new dict holding ``encryption_scope`` when one is
          configured, otherwise an empty dict
        :rtype: dict
        """
        if not self.encryption_scope:
            return {}
        return {"encryption_scope": self.encryption_scope}

    def test_connectivity(self):
        """
        Test Azure connectivity by trying to access a container

        On success the result of the existence check is cached in
        ``self.bucket_exists``.

        :return: True if the service could be reached, False otherwise
        :rtype: bool
        """
        try:
            # We are not even interested in the existence of the bucket,
            # we just want to see if Azure blob service is reachable.
            exists = self._check_bucket_existence()
        except (HttpResponseError, ServiceRequestError) as exc:
            logging.error("Can't connect to cloud provider: %s", exc)
            return False
        self.bucket_exists = exists
        return True

    def _check_bucket_existence(self):
        """
        Check Azure Blob Storage for the target container

        Although there is an `exists` function it cannot be called by container-level
        shared access tokens. We therefore check for existence by calling list_blobs
        on the container.

        :return: True if the container exists, False otherwise
        :rtype: bool
        """
        try:
            listing = self.container_client.list_blobs()
            # Fetching the first item is what triggers the service request
            listing.next()
        except StopIteration:
            # The bucket is empty but it does exist
            return True
        except ResourceNotFoundError:
            return False
        else:
            return True

    def _create_bucket(self):
        """
        Create the container in cloud storage

        The container is created with the client's defaults: public access
        is disabled for newly created containers and, unlike S3, there is no
        region to choose because regions are a storage-account-level concept
        in Azure.
        """
        container = self.container_client
        container.create_container()

    def _walk_blob_tree(self, obj, ignore=None):
        """
        Walk a blob tree in a directory manner, yielding the names of the
        directories and files found.

        This is a generator: names are produced lazily as the tree is iterated.

        :param ItemPaged[BlobProperties] obj: Iterable response of BlobProperties
          obtained from ContainerClient.walk_blobs
        :param str|None ignore: An entry to be excluded from the output,
          typically the top level prefix. Only applied to ``obj`` itself, not
          to its children.
        :return: Generator of object and directory names in the tree
        :rtype: Iterator[str]
        """
        if obj.name != ignore:
            yield obj.name
        if not isinstance(obj, BlobPrefix):
            # A leaf blob has no children to descend into
            return
        # We are a prefix and not a leaf so iterate children
        for child in obj:
            yield from self._walk_blob_tree(child)

    def list_bucket(self, prefix="", delimiter=DEFAULT_DELIMITER):
        """
        List bucket content in a directory manner

        :param str prefix: Only names starting with this prefix are listed;
          the prefix itself is left out of the output
        :param str delimiter: Delimiter used to split names into directories
        :return: Iterable of the names of objects and dirs under the prefix
          (a generator, evaluated lazily)
        :rtype: Iterator[str]
        """
        return self._walk_blob_tree(
            self.container_client.walk_blobs(
                name_starts_with=prefix,
                delimiter=delimiter,
            ),
            ignore=prefix,
        )

    def download_file(self, key, dest_path, decompress=None):
        """
        Download a file from Azure Blob Storage

        :param str key: The key to download
        :param str dest_path: Where to put the destination file
        :param str|None decompress: Compression scheme to use for decompression;
          when None the blob content is written to the destination unchanged
        """
        blob_stream = self.container_client.download_blob(key)
        with open(dest_path, "wb") as dest_file:
            if decompress is not None:
                # Wrap the download in a file-like object so it can be
                # decompressed while it is being written out
                decompress_to_file(StreamingBlobIO(blob_stream), dest_file,
                                   decompress)
            else:
                blob_stream.download_to_stream(dest_file)

    def remote_open(self, key, decompressor=None):
        """
        Open a remote Azure Blob Storage object and return a readable stream

        :param str key: The key identifying the object to open
        :param barman.clients.cloud_compression.ChunkedCompressor decompressor:
          A ChunkedCompressor object which will be used to decompress chunks of bytes
          as they are read from the stream
        :return: A file-like object from which the stream can be read or None if
          the key does not exist
        """
        try:
            stream = StreamingBlobIO(self.container_client.download_blob(key))
            return (DecompressingStreamingIO(stream, decompressor)
                    if decompressor else stream)
        except ResourceNotFoundError:
            return None

    def upload_fileobj(
        self,
        fileobj,
        key,
        override_tags=None,
    ):
        """
        Synchronously upload the content of a file-like object to a cloud key

        Any existing object with the same key is overwritten.

        :param fileobj IOBase: File-like object to upload; it must be seekable
          because its length is determined by seeking to the end
        :param str key: The key to identify the uploaded object
        :param List[tuple] override_tags: List of tags as k,v tuples to be added to the
          uploaded object; when empty or None the tags of this interface are used
        """
        # The Azure client is given the length up front, so measure the
        # file by seeking to its end and then rewind to the start
        fileobj.seek(0, SEEK_END)
        size = fileobj.tell()
        fileobj.seek(0)

        upload_kwargs = dict(self._extra_upload_args)
        effective_tags = override_tags or self.tags
        if effective_tags is not None:
            upload_kwargs["tags"] = dict(effective_tags)

        self.container_client.upload_blob(
            name=key,
            data=fileobj,
            overwrite=True,
            length=size,
            max_concurrency=self.max_concurrency,
            **upload_kwargs
        )

    def create_multipart_upload(self, key):
        """
        No-op method because Azure has no concept of multipart uploads

        Blob blocks are staged and then committed instead, and nothing has to
        be created up front for that, so this method does nothing and
        returns None.

        :param str key: The key to use in the cloud service (unused)
        """

    def _upload_part(self, upload_metadata, key, body, part_number):
        """
        Upload a single block of this block blob.

        Uses the supplied part number to generate the block ID and returns it
        as the "PartNumber" in the part metadata.

        :param dict upload_metadata: Provider-specific metadata about the upload
          (not used in Azure)
        :param str key: The key to use in the cloud service
        :param object body: A stream-like object to upload
        :param int part_number: Part number, starting from 1
        :return: The part metadata
        :rtype: dict[str, None|str]
        """
        # Block IDs must be the same length for all blocks in the blob
        # and no greater than 64 characters. Given there is a limit of
        # 50000 blocks per blob we zero-pad the part_number to five
        # places.
        staged_id = str(part_number).zfill(5)
        self.container_client.get_blob_client(key).stage_block(
            staged_id, body, **self._extra_upload_args)
        return {"PartNumber": staged_id}

    def _complete_multipart_upload(self, upload_metadata, key, parts):
        """
        Finish a "multipart upload" by committing all blocks in the blob.

        The tags of this interface, if any, are applied at commit time.

        :param dict upload_metadata: Provider-specific metadata about the upload
          (not used in Azure)
        :param str key: The key to use in the cloud service
        :param parts: The part metadata dicts whose "PartNumber" values are the
          IDs of the blocks which compose this blob
        """
        blob_client = self.container_client.get_blob_client(key)
        block_ids = []
        for part in parts:
            block_ids.append(part["PartNumber"])
        commit_kwargs = dict(self._extra_upload_args)
        if self.tags is not None:
            commit_kwargs["tags"] = dict(self.tags)
        blob_client.commit_block_list(block_ids, **commit_kwargs)

    def _abort_multipart_upload(self, upload_metadata, key):
        """
        Abort the upload of a block blob

        The objective of this method is to clean up any dangling resources - in
        this case those resources are uncommitted blocks.

        :param dict upload_metadata: Provider-specific metadata about the upload
          (not used in Azure)
        :param str key: The key to use in the cloud service
        """
        # There is no direct way of removing uncommitted blocks: they are
        # only discarded after 7 days or when the blob is committed without
        # them. Committing an empty block list therefore discards every
        # uploaded block for the blob, and the resulting empty blob is then
        # deleted.
        target = self.container_client.get_blob_client(key)
        target.commit_block_list([], **self._extra_upload_args)
        target.delete_blob()

    def delete_objects(self, paths):
        """
        Delete the objects at the specified paths

        The deletions are submitted in batches because the Azure Blob Batch
        API limits the number of sub-requests in a single batch. Every batch
        is attempted even if an earlier one reported errors.

        :param List[str] paths: Names of the blobs to delete. An empty list is
          accepted and results in no request being made.
        :raises CloudProviderError: if any object could not be deleted for a
          reason other than it not being found
        """
        # Maximum number of sub-requests Azure accepts in one batch request
        max_batch_size = 256

        paths = list(paths)
        errors = False
        for start in range(0, len(paths), max_batch_size):
            if self._delete_blob_batch(paths[start:start + max_batch_size]):
                errors = True

        if errors:
            raise CloudProviderError(
                "Error from cloud provider while deleting objects - "
                "please check the Barman logs")

    def _delete_blob_batch(self, paths):
        """
        Delete one batch of objects and log the outcome of each deletion

        :param List[str] paths: Names of the blobs to delete in one request
        :return: True if at least one deletion failed with a status other
          than 202 (success) or 404 (not found), False otherwise
        :rtype: bool
        """
        try:
            responses = self.container_client.delete_blobs(*paths)
        except PartialBatchErrorException as exc:
            # Although the docs imply any errors will be returned in the response
            # object, in practice a PartialBatchErrorException is raised which contains
            # the response objects in its `parts` attribute.
            # We therefore set responses to reference the response in the exception and
            # treat it the same way we would a regular response.
            logging.warning(
                "PartialBatchErrorException received from Azure: %s" %
                exc.message)
            responses = exc.parts

        # responses is an iterator of HttpResponse objects so we check the
        # status codes which should all be 202 if successful
        errors = False
        for resp in responses:
            if resp.status_code == 404:
                # A missing object is only worth a warning, not a failure
                logging.warning(
                    "Deletion of object %s failed because it could not be found"
                    % resp.request.url)
            elif resp.status_code != 202:
                errors = True
                logging.error(
                    'Deletion of object %s failed with error code: "%s"' %
                    (resp.request.url, resp.status_code))
        return errors
    .config(f"fs.azure.sas.{workspace_container}.hecdf.blob.core.windows.net", workspace_sas_token) \
    .getOrCreate()

## Define your blob services to access files on Azure Blob Storage
from azure.storage.blob import ContainerClient

from pathlib import Path

testname = "koalas-tutorial/datasets/loan_preprocessed.csv"

account_url = "https://hecdf.blob.core.windows.net"

# Stop the Spark session even if creating the clients or a download fails
try:
    facts_blob_service = ContainerClient(account_url=account_url,
                                         container_name=facts_container,
                                         credential=facts_sas_token)
    workspace_blob_service = ContainerClient(
        account_url=account_url,
        container_name=workspace_container,
        credential=workspace_sas_token)

    # Copy every blob of the facts container to ../../data/raw/, keeping the
    # blob name as the relative path
    blobs = list(facts_blob_service.list_blobs())
    for blob in blobs:
        local_path = Path(f'../../data/raw/{blob.name}')
        # Blob names may contain "/" so make sure the parent folder exists
        local_path.parent.mkdir(parents=True, exist_ok=True)

        # From facts to your home directory
        with open(local_path, "wb") as data:
            download_stream = facts_blob_service.get_blob_client(
                blob.name).download_blob()
            data.write(download_stream.readall())
finally:
    spark.stop()