def copy_s3_folder_contents_to_new_folder(old_s3_folder_path,
                                          new_s3_folder_path):
    """
    Copies complete folder structure within old_s3_folder_path to the new_s3_folder_path 
    """
    old_s3_folder_path = add_slash(old_s3_folder_path)
    new_s3_folder_path = add_slash(new_s3_folder_path)

    all_old_filepaths = get_filepaths_from_s3_folder(old_s3_folder_path)
    for ofp in all_old_filepaths:
        # Swap the old folder prefix for the new one to build the target path
        nfp = ofp.replace(old_s3_folder_path, new_s3_folder_path)
        copy_s3_object(ofp, nfp)
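
# Hypothetical usage (the paths below are illustrative only, and the helper
# functions this relies on are sketched further down):
#
#   copy_s3_folder_contents_to_new_folder(
#       's3://alpha-gluejobutils/testing/data/',
#       's3://alpha-gluejobutils/testing/data_copy/')
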
def get_filepaths_from_s3_folder(s3_folder_path,
                                 extension=None,
                                 exclude_zero_byte_files=True):
    """
    Get a list of filepaths from a bucket. If extension is set to a string then only return files with that extension otherwise if set to None (default) all filepaths are returned.
    """
    # Normalise the extension so it always starts with a dot
    # (an empty string matches every key)
    if extension is None:
        extension = ""
    elif extension[0] != ".":
        extension = "." + extension

    s3_folder_path = add_slash(s3_folder_path)
    bucket, key = s3_path_to_bucket_key(s3_folder_path)

    s3b = s3_resource.Bucket(bucket)
    obs = s3b.objects.filter(Prefix=key)
    if exclude_zero_byte_files:
        ob_keys = [
            o.key for o in obs if o.key.endswith(extension) and o.size != 0
        ]
    else:
        ob_keys = [o.key for o in obs if o.key.endswith(extension)]
    paths = sorted([bucket_key_to_s3_path(bucket, o) for o in ob_keys])

    return paths
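
# Hypothetical usage (illustrative path): list only the csv files under a
# folder, skipping zero-byte objects:
#
#   csv_paths = get_filepaths_from_s3_folder(
#       's3://alpha-gluejobutils/testing/data/', extension='csv')
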
def delete_s3_folder_contents(s3_folder_path):
    """
    Deletes all files within the s3_folder_path given given.
    """
    s3_folder_path = add_slash(s3_folder_path)
    all_filepaths = get_filepaths_from_s3_folder(s3_folder_path,
                                                 exclude_zero_byte_files=False)
    for f in all_filepaths:
        delete_s3_object(f)
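
# The functions above rely on a handful of helpers (add_slash, remove_slash,
# s3_path_to_bucket_key, bucket_key_to_s3_path, copy_s3_object,
# delete_s3_object) and a module-level s3_resource that are not shown in this
# snippet. A minimal boto3-based sketch of what they are assumed to look
# like, not necessarily the library's exact implementation:

import boto3

s3_resource = boto3.resource("s3")


def add_slash(s):
    # Ensure the path ends with exactly one trailing slash
    return s if s.endswith("/") else s + "/"


def remove_slash(s):
    # Drop the trailing slash if present
    return s[:-1] if s.endswith("/") else s


def s3_path_to_bucket_key(s3_path):
    # 's3://bucket/some/key' -> ('bucket', 'some/key')
    bucket, _, key = s3_path.replace("s3://", "", 1).partition("/")
    return bucket, key


def bucket_key_to_s3_path(bucket, key):
    # ('bucket', 'some/key') -> 's3://bucket/some/key'
    return "s3://{}/{}".format(bucket, key)


def copy_s3_object(old_s3_path, new_s3_path):
    # Server-side copy of a single object between s3 paths
    old_bucket, old_key = s3_path_to_bucket_key(old_s3_path)
    new_bucket, new_key = s3_path_to_bucket_key(new_s3_path)
    s3_resource.Object(new_bucket, new_key).copy_from(
        CopySource={"Bucket": old_bucket, "Key": old_key})


def delete_s3_object(s3_path):
    # Delete a single object
    bucket, key = s3_path_to_bucket_key(s3_path)
    s3_resource.Object(bucket, key).delete()
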
csv_path = 's3://alpha-gluejobutils/testing/data/diamonds_csv/'
meta_path = 's3://alpha-gluejobutils/testing/meta_data/diamonds.json'

# Build a Spark schema from the metadata file and read the csv with it
meta = datatypes.create_spark_schema_from_metadata_file(meta_path)
df_old = spark.read.csv(csv_path, header=True, schema=meta)
# Stamp the record datetime columns (prefixed dea_record_) and write the
# result out as parquet
df_old = drd.init_record_datetimes(df_old,
                                   '2018-01-01 01:00:00',
                                   col_prefix="dea_record_")
df_old.write.mode('overwrite').parquet(
    's3://alpha-gluejobutils/database/table1/')
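
# A quick hedged check in the same style as the tests below (assumes the
# write above succeeded): read the parquet back and confirm it is non-empty.
df_check = spark.read.parquet('s3://alpha-gluejobutils/database/table1/')
if df_check.count() == 0:
    raise ValueError('parquet write-back FAILURE')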

## =====================> UTILS MODULE TESTING <========================= ##
a = 'test/folder/path/'
b = 'test/folder/path'
if utils.add_slash(a) != a:
    raise ValueError('add_slash FAILURE')
if utils.remove_slash(a) != b:
    raise ValueError('remove_slash FAILURE')
if utils.add_slash(b) != a:
    raise ValueError('add_slash FAILURE')
if utils.remove_slash(b) != b:
    raise ValueError('remove_slash FAILURE')
print("===> utils ===> OK")

## =====================> S3 MODULE TESTING <========================= ##
bucket = 'alpha-gluejobutils'
diamonds_obj = 'testing/data/diamonds.csv'
### ### ### ### ### ### ### ###
### bucket_key_to_s3_path ###
### ### ### ### ### ### ### ###