Esempio n. 1
0
def _get_s3_root(source_location, dir_op):
    """Return the ``bucket/prefix`` root for an S3 source location.

    ``source_location`` is split into a bucket and a key with
    ``split_s3_bucket_key``.  When ``dir_op`` is truthy, or the key already
    ends with ``/``, the whole key is kept.  Otherwise the last
    ``/``-separated component of the key is dropped, so that for a key of
    the form ``prefix/key`` only ``prefix`` ends up in the root.
    """
    bucket, key = split_s3_bucket_key(source_location)
    keep_whole_key = dir_op or key.endswith('/')
    if not keep_whole_key:
        # Everything before the final '/' is the prefix; a key that has no
        # '/' at all collapses to the empty string.
        key = key.rpartition('/')[0]
    # Glue the bucket and the (possibly shortened) key back together.
    return '/'.join((bucket, key))
Esempio n. 2
0
def _get_s3_root(source_location, dir_op):
    """Compute the S3 root (``bucket/prefix``) of ``source_location``.

    The bucket and key come from ``split_s3_bucket_key``.  For directory
    operations, and for keys that end with ``/``, the root is simply
    ``bucket/key``.  For any other key the trailing component is removed
    first, so ``prefix/key`` contributes only ``prefix`` to the root.
    """
    bucket, key = split_s3_bucket_key(source_location)
    if dir_op or key.endswith('/'):
        # The key already denotes a prefix; use it unchanged.
        return '/'.join((bucket, key))
    # Drop the final path component (an empty prefix results when the key
    # contains no '/').
    prefix, _, _ = key.rpartition('/')
    return '/'.join((bucket, prefix))
def download(row, root):
    """Download the S3 object referenced by ``row`` into ``root`` if missing.

    Args:
        row: A two-item pair whose second element exposes ``us_tif_url``,
            the S3 location of the file (presumably an ``(index, row)``
            pair from ``DataFrame.iterrows()`` -- confirm against caller).
        root: Directory path object; it must support ``root / name`` and
            the result must provide ``is_file()`` and ``as_posix()``.

    Returns:
        None.  The file is written to ``root`` under the last component of
        the S3 key; nothing happens when that file already exists.
    """
    _, item = row

    bucket_name, key_name = split_s3_bucket_key(item.us_tif_url)
    # The local file is named after the last component of the S3 key.
    path = root / key_name.split("/")[-1]
    if path.is_file():
        # Already downloaded: return before paying for an S3 client.
        return

    # Create the client only when a download is actually required.
    s3 = boto3.client("s3")
    s3.download_file(bucket_name, key_name, path.as_posix())
def download_url_to_file_path(file_path, file_url):
    """Download ``file_url`` and write its contents to ``file_path``.

    URLs that start with ``s3:`` are fetched through the boto3 S3 client.
    Any other URL is fetched with ``requests`` and streamed to disk in
    fixed-size chunks while ``tqdm`` reports progress (one tick per chunk).

    Args:
        file_path: Destination path.  It is opened in binary write mode,
            so an existing file is truncated.
        file_url: Source location, either an ``s3:`` URL or a URL that
            ``requests.get`` can fetch.
    """
    # ``iter_content`` yields a single byte per iteration by default, which
    # turns every download into millions of tiny writes.  Use real chunks.
    chunk_size = 64 * 1024

    print("Downloading file {0} from url {1}".format(file_path, file_url))
    with open(file_path, "wb") as handle:
        if file_url.startswith("s3:"):
            s3_bucket, s3_key = split_s3_bucket_key(file_url)
            s3_client = boto3.client('s3')
            s3_client.download_fileobj(Fileobj=handle,
                                       Bucket=s3_bucket,
                                       Key=s3_key)
        else:
            # stream=True keeps the body out of memory until it is read
            # chunk by chunk; the context manager releases the connection.
            # NOTE(review): the HTTP status is not checked, so the body of
            # an error response is written to the file as-is.
            with requests.get(file_url, stream=True) as response:
                chunks = response.iter_content(chunk_size=chunk_size)
                for data in tqdm(chunks):
                    handle.write(data)
def dl_and_get_rect(row, root, mode):
    """Return the result of ``get_rect`` for ``row``, downloading if needed.

    For any ``mode`` other than ``'tier1'`` this simply delegates to
    ``get_rect(row, root, mode)``.  For ``'tier1'`` the file referenced by
    the row's ``url`` attribute is first downloaded from S3 into the
    current working directory, ``get_rect`` is run against that copy, and
    the copy is removed again -- even when ``get_rect`` raises.

    Args:
        row: A two-item pair whose second element exposes ``url`` (only
            read in ``'tier1'`` mode); passed through to ``get_rect``.
        root: Directory handed to ``get_rect`` in non-``'tier1'`` modes.
        mode: Mode string; ``'tier1'`` triggers the temporary download.

    Returns:
        For ``'tier1'``, the two values produced by ``get_rect`` as a
        tuple ``(x, y)``; otherwise whatever ``get_rect`` returns.
    """
    if mode == 'tier1':
        _, item = row

        s3 = boto3.client('s3')

        url = item.url
        bucket_name, key_name = split_s3_bucket_key(url)
        # Temporary local copy, named after the last component of the key.
        fname = key_name.split('/')[-1]
        s3.download_file(bucket_name, key_name, fname)

        try:
            # get_rect is pointed at the working directory with an empty
            # mode so that it picks up the file that was just downloaded.
            x, y = get_rect(row, Path(''), '')
        finally:
            # Always remove the temporary file, including on failure.
            Path(fname).unlink()

        return x, y

    return get_rect(row, root, mode)
Esempio n. 6
0
def create_filter(parameters):
    """Build a ``Filter`` object from the CLI parameters dict.

    When ``parameters['filters']`` is empty, an empty ``Filter`` with no
    root directory is returned.  Otherwise each filter type has its leading
    ``-`` characters stripped and the filters are anchored at a root
    derived from ``parameters['src']``: the bucket for ``s3://`` sources,
    the absolute source path for local directory operations, or the
    absolute parent directory of the source for other local operations.
    """
    cli_filters = parameters['filters']
    if not cli_filters:
        return Filter({}, None)

    normalized = [
        (filter_type.lstrip('-'), filter_pattern)
        for filter_type, filter_pattern in cli_filters
    ]

    # All filters are evaluated relative to the source location.
    source_location = parameters['src']
    if source_location.startswith('s3://'):
        # The split yields (bucket, keyname); the bucket is the root dir.
        rootdir = split_s3_bucket_key(source_location)[0]
    elif parameters.get('dir_op'):
        rootdir = os.path.abspath(source_location)
    else:
        rootdir = os.path.abspath(os.path.dirname(source_location))
    return Filter(normalized, rootdir)
Esempio n. 7
0
def create_filter(parameters):
    """Create the ``Filter`` described by the CLI parameters dict.

    With no filters configured the result is ``Filter({}, None)``.
    Otherwise the leading dashes are removed from every filter type and
    the filters are rooted at the source: the bucket name when the source
    is an ``s3://`` location, else the absolute local source directory
    (the source itself for a directory operation, its parent otherwise).
    """
    if not parameters['filters']:
        return Filter({}, None)

    pairs = [(kind.lstrip('-'), pattern)
             for kind, pattern in parameters['filters']]

    # Filters are always evaluated against the source directory.
    src = parameters['src']
    if src.startswith('s3://'):
        # First element of the (bucket, keyname) split is the root dir.
        root = split_s3_bucket_key(src)[0]
    else:
        local_root = src if parameters.get('dir_op') else os.path.dirname(src)
        root = os.path.abspath(local_root)
    return Filter(pairs, root)
Esempio n. 8
0
def get_data(s3_client, public_s3_data, to_bucket, to_prefix, sample_data=1):
    """Copy CSV files from S3 into ``to_bucket``, optionally subsampled.

    For every source location the file is downloaded to ``./data/<name>``
    (unless a file of that name is already there), read with pandas,
    sampled with ``frac=sample_data``, written back over the local copy
    and uploaded to ``to_bucket`` under ``to_prefix``.

    Args:
        s3_client: Object providing ``download_file`` and ``upload_file``
            (presumably a boto3 S3 client -- confirm against caller).
        public_s3_data: Iterable of S3 locations accepted by
            ``split_s3_bucket_key``.
        to_bucket: Name of the destination bucket.
        to_prefix: Key prefix inside the destination bucket.
        sample_data: Fraction passed to ``DataFrame.sample``; the default
            of 1 keeps every row (in shuffled order).

    Returns:
        A list with the ``s3://`` URI of each uploaded object, in input
        order.
    """
    # S3 keys always use '/' regardless of the local OS, so they must be
    # joined with posixpath rather than os.path.
    import posixpath

    new_paths = []
    for f in public_s3_data:
        bucket_name, key_name = split_s3_bucket_key(f)
        filename = f.split('/')[-1]
        local_path = './data/{}'.format(filename)
        # Build the destination key once and derive the returned URI from
        # it, so the URI always names the object that is actually uploaded.
        dest_key = posixpath.join(to_prefix, filename)
        new_path = "s3://{}/{}".format(to_bucket, dest_key)
        new_paths.append(new_path)

        # only download if not already downloaded
        if not os.path.exists(local_path):
            # Make sure the local cache directory exists before writing.
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            print("Downloading file from {}".format(f))
            s3_client.download_file(bucket_name, key_name, local_path)

        # Subsample the data to create a smaller dataset.
        # NOTE: the sample overwrites the local copy, so a later call that
        # reuses the cached file samples already-sampled data.
        new_df = pd.read_csv(local_path)
        new_df = new_df.sample(frac=sample_data)
        new_df.to_csv(local_path, index=False)

        # Upload the (sampled) file to the destination bucket.
        print("Uploading {} to {}\n".format(filename, new_path))
        s3_client.upload_file(local_path, to_bucket, dest_key)

    return new_paths
Esempio n. 9
0
def _get_s3_root(source_location):
    """Return the bucket part of the S3 ``source_location``.

    The location is split with ``split_s3_bucket_key`` and only the first
    element of the result, the bucket, is used as the root.
    """
    bucket_and_key = split_s3_bucket_key(source_location)
    return bucket_and_key[0]
Esempio n. 10
0
def _get_s3_root(source_location):
    """Return the S3 root of ``source_location``, i.e. its bucket.

    ``split_s3_bucket_key`` breaks the location into bucket and key; the
    key is discarded and the bucket alone is returned.
    """
    split_location = split_s3_bucket_key(source_location)
    bucket = split_location[0]
    return bucket