from awscli.customizations.s3.utils import split_s3_bucket_key


def _get_s3_root(source_location, dir_op):
    # Obtain the bucket and the key.
    bucket, key = split_s3_bucket_key(source_location)
    if not dir_op and not key.endswith('/'):
        # If we are not performing an operation on a directory and the key
        # is of the form ``prefix/key``, we only want ``prefix`` included
        # in the s3 root and not ``key``.
        key = '/'.join(key.split('/')[:-1])
    # Rejoin the bucket and key back together.
    s3_path = '/'.join([bucket, key])
    return s3_path
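# Hedged usage sketch for the helper above; the bucket and key names are
# hypothetical. With dir_op=False the trailing key component is treated as
# an object name and dropped, leaving only its prefix in the root.
assert _get_s3_root('s3://my-bucket/photos/cat.jpg', dir_op=False) == 'my-bucket/photos'
assert _get_s3_root('s3://my-bucket/photos/', dir_op=True) == 'my-bucket/photos/'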
import boto3
from awscli.customizations.s3.utils import split_s3_bucket_key


def download(row, root):
    _, item = row
    s3 = boto3.client("s3")
    url = item.us_tif_url
    bucket_name, key_name = split_s3_bucket_key(url)
    fname = key_name.split("/")[-1]
    path = root / fname
    # Skip files that have already been downloaded.
    if path.is_file():
        return
    s3.download_file(bucket_name, key_name, path.as_posix())
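# Hedged usage sketch: assumes the rows come from a pandas DataFrame with a
# `us_tif_url` column of s3:// URLs and that the target directory exists;
# the bucket/key below are placeholders, not values from the original code.
from pathlib import Path
import pandas as pd

df = pd.DataFrame({'us_tif_url': ['s3://example-bucket/tiles/scene.tif']})
root = Path('downloads')
root.mkdir(exist_ok=True)
for row in df.iterrows():
    download(row, root)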
import boto3
import requests
from tqdm import tqdm
from awscli.customizations.s3.utils import split_s3_bucket_key


def download_url_to_file_path(file_path, file_url):
    print("Downloading file {0} from url {1}".format(file_path, file_url))
    with open(file_path, "wb") as handle:
        if file_url.startswith("s3:"):
            # Stream the object straight from S3 into the open file handle.
            s3_bucket, s3_key = split_s3_bucket_key(file_url)
            s3_client = boto3.client('s3')
            s3_client.download_fileobj(Fileobj=handle, Bucket=s3_bucket, Key=s3_key)
        else:
            # Fall back to a plain HTTP(S) download.
            response = requests.get(file_url)
            for data in tqdm(response.iter_content()):
                handle.write(data)
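# Hedged usage sketch; both URLs are placeholders, not endpoints from the
# original code. The same helper handles S3 and plain HTTP(S) sources.
download_url_to_file_path('data.csv', 's3://example-bucket/path/data.csv')
download_url_to_file_path('page.html', 'https://example.com/page.html')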
from pathlib import Path

import boto3
from awscli.customizations.s3.utils import split_s3_bucket_key


def dl_and_get_rect(row, root, mode):
    # get_rect is defined elsewhere in the caller's module.
    if mode == 'tier1':
        _, item = row
        s3 = boto3.client('s3')
        url = item.url
        bucket_name, key_name = split_s3_bucket_key(url)
        fname = key_name.split('/')[-1]
        # Download to the working directory, read the rectangle,
        # then remove the temporary file.
        s3.download_file(bucket_name, key_name, fname)
        x, y = get_rect(row, Path(''), '')
        Path(fname).unlink()
        return x, y
    return get_rect(row, root, mode)
import os

from awscli.customizations.s3.utils import split_s3_bucket_key


def create_filter(parameters):
    """Given the CLI parameters dict, create a Filter object."""
    # Filter is defined alongside this helper in
    # awscli.customizations.s3.filters.
    # We need to evaluate all the filters based on the source
    # directory.
    if parameters['filters']:
        cli_filters = parameters['filters']
        real_filters = []
        for filter_type, filter_pattern in cli_filters:
            real_filters.append((filter_type.lstrip('-'), filter_pattern))
        source_location = parameters['src']
        if source_location.startswith('s3://'):
            # This gives us (bucket, keyname) and we want
            # the bucket to be the root dir.
            rootdir = split_s3_bucket_key(source_location)[0]
        else:
            if parameters.get('dir_op'):
                rootdir = os.path.abspath(parameters['src'])
            else:
                rootdir = os.path.abspath(os.path.dirname(parameters['src']))
        return Filter(real_filters, rootdir)
    else:
        return Filter({}, None)
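# Hedged usage sketch: this dict mirrors the shape of the parameters the
# AWS CLI builds for `aws s3 sync`; the concrete values are made up.
params = {
    'filters': [('--exclude', '*'), ('--include', '*.txt')],
    'src': 's3://example-bucket/prefix/',
    'dir_op': True,
}
file_filter = create_filter(params)  # rootdir becomes 'example-bucket'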
import os

import pandas as pd
from awscli.customizations.s3.utils import split_s3_bucket_key


def get_data(s3_client, public_s3_data, to_bucket, to_prefix, sample_data=1):
    new_paths = []
    for f in public_s3_data:
        bucket_name, key_name = split_s3_bucket_key(f)
        filename = f.split('/')[-1]
        new_path = "s3://{}/{}/{}".format(to_bucket, to_prefix, filename)
        new_paths.append(new_path)

        # Only download if not already downloaded.
        if not os.path.exists('./data/{}'.format(filename)):
            # Download the public s3 data.
            print("Downloading file from {}".format(f))
            s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename))
            # Subsample the data to create a smaller dataset for this demo.
            new_df = pd.read_csv('./data/{}'.format(filename))
            new_df = new_df.sample(frac=sample_data)
            new_df.to_csv('./data/{}'.format(filename), index=False)

        # Upload the s3 data to our default s3 bucket for SageMaker Studio.
        print("Uploading {} to {}\n".format(filename, new_path))
        s3_client.upload_file('./data/{}'.format(filename), to_bucket,
                              os.path.join(to_prefix, filename))
    return new_paths
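# Hedged usage sketch; the bucket names, prefix, and source URL are
# placeholders. Assumes a ./data directory exists locally.
import boto3

s3 = boto3.client('s3')
sources = ['s3://example-public-bucket/datasets/train.csv']
new_paths = get_data(s3, sources, to_bucket='my-sagemaker-bucket',
                     to_prefix='demo/data', sample_data=0.1)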
from awscli.customizations.s3.utils import split_s3_bucket_key


def _get_s3_root(source_location):
    # split_s3_bucket_key returns (bucket, key); the bucket is the root.
    return split_s3_bucket_key(source_location)[0]
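# Hedged usage sketch with a hypothetical path: only the bucket survives.
assert _get_s3_root('s3://example-bucket/some/key') == 'example-bucket'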