def write_dicts_to_jsonl_gz(data, s3_path):
    """Serialise a list of dicts as gzip-compressed JSON Lines and upload to S3.

    Parameters
    ----------
    data : list of dict
        Records to write, one JSON object per line.
    s3_path : str
        Destination path of the form 's3://bucket/key'.
    """
    # str.join avoids the quadratic cost of repeated += concatenation and,
    # unlike the old data[0] seed, does not raise IndexError when `data`
    # is empty (an empty object is uploaded instead).
    file_as_string = '\n'.join(json.dumps(d) for d in data)
    bucket, key = s3_path_to_bucket_key(s3_path)
    compressed_out = gzip.compress(file_as_string.encode('utf-8'))
    s3_resource.Object(bucket, key).put(Body=compressed_out)
def read_jsonl_from_s3(s3_path, encoding='utf-8', compressed=False):
    """Read a JSON Lines file from an S3 path into a list of parsed objects.

    Parameters
    ----------
    s3_path : str
        Source path of the form 's3://bucket/key'.
    encoding : str, optional
        Text encoding used to decode the object body (default 'utf-8').
    compressed : bool, optional
        If True, gzip-decompress the body before decoding (default False).

    Returns
    -------
    list
        One parsed JSON value per non-empty line of the file.
    """
    bucket, key = s3_path_to_bucket_key(s3_path)
    obj = s3_resource.Object(bucket, key)
    raw = obj.get()['Body'].read()
    if compressed:
        raw = gzip.decompress(raw)
    text = raw.decode(encoding)
    # Skip blank lines (e.g. a trailing newline, which is standard for
    # JSONL files) — json.loads('') would raise JSONDecodeError.
    return [json.loads(line) for line in text.split('\n') if line]
raise ValueError('bucket_key_to_s3_path FAILURE') out = s3.bucket_key_to_s3_path(bucket, 'some/path/') if out != 's3://alpha-gluejobutils/some/path/': raise ValueError('bucket_key_to_s3_path FAILURE') out = s3.bucket_key_to_s3_path(bucket, 'some/path') if out != 's3://alpha-gluejobutils/some/path': raise ValueError('bucket_key_to_s3_path FAILURE') print("===> bucket_key_to_s3_path ===> OK") ### ### ### ### ### ### ### ### ### s3_path_to_bucket_key ### ### ### ### ### ### ### ### ### b, o = s3.s3_path_to_bucket_key( 's3://alpha-gluejobutils/testing/data/diamonds_csv/diamonds.csv') if b != 'alpha-gluejobutils' or o != 'testing/data/diamonds_csv/diamonds.csv': raise ValueError('s3_path_to_bucket_key FAILURE') b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data') if b != 'alpha-gluejobutils' or o != 'testing/data': raise ValueError('s3_path_to_bucket_key FAILURE') b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data/') if b != 'alpha-gluejobutils' or o != 'testing/data/': raise ValueError('s3_path_to_bucket_key FAILURE') print("===> s3_path_to_bucket_key ===> OK") ### ### ### ### ### ### ### ### read_json_from_s3 ### ### ### ### ### ### ### ###