def get_plain_text(bucket, key, size, compression, *, etag, s3_client, version_id):
    """get plain text object contents"""
    text = ""
    try:
        obj = retry_s3(
            "get",
            bucket,
            key,
            size,
            etag=etag,
            s3_client=s3_client,
            limit=ELASTIC_LIMIT_BYTES,
            version_id=version_id,
        )
        lines = get_preview_lines(
            obj["Body"],
            compression,
            ELASTIC_LIMIT_LINES,
            ELASTIC_LIMIT_BYTES,
        )
        text = '\n'.join(lines)
    except UnicodeDecodeError as ex:
        print(f"Unicode decode error in {key}", ex)

    return text
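# A minimal usage sketch, not part of the original module: how get_plain_text
# might be called with a boto3 client. The bucket, key, size, and etag values
# below are illustrative assumptions, not values from the source.
def _example_get_plain_text():
    import boto3

    s3_client = boto3.client("s3")  # assumed: a standard boto3 S3 client
    return get_plain_text(
        "example-bucket",       # hypothetical bucket
        "notes/readme.txt",     # hypothetical key
        2048,                   # hypothetical object size in bytes
        None,                   # uncompressed object
        etag="d41d8cd98f00b204e9800998ecf8427e",  # hypothetical ETag
        s3_client=s3_client,
        version_id=None,
    )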
def test_txt_max_bytes(self):
    """test truncation to CATALOG_LIMIT_BYTES"""
    txt = BASE_DIR / 'two-line.txt'
    max_lines = 500
    max_bytes = 5
    with open(txt, 'rb') as file_obj:
        lines = get_preview_lines(iterate_chunks(file_obj), None, max_lines, max_bytes)

    assert len(lines) == 1, 'failed to truncate bytes'
    assert lines[0] == '1234😊', 'failed to truncate bytes'
def test_long_gz(self):
    """test a gzipped text file with lots of lines"""
    txt = BASE_DIR / 'long.txt.gz'
    max_lines = 500
    max_bytes = 10000
    with open(txt, 'rb') as file_obj:
        lines = get_preview_lines(iterate_chunks(file_obj), 'gz', max_lines, max_bytes)

    assert len(lines) == max_lines, 'unexpected number of lines'
    assert lines[0] == 'Line 1', 'unexpected first line'
    assert lines[-1] == f'Line {max_lines}', 'unexpected last line'
def test_txt_max_bytes_one_line(self):
    """test truncation to CATALOG_LIMIT_BYTES"""
    txt = BASE_DIR / 'one-line.txt'
    max_lines = 500
    max_bytes = 8
    chunk_size = 10
    with open(txt, 'rb') as file_obj:
        lines = get_preview_lines(iterate_chunks(file_obj, chunk_size), None, max_lines, max_bytes)

    assert len(lines) == 1, 'failed to truncate bytes'
    assert lines[0] == '🚷🚯', 'failed to truncate bytes'
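# The tests above rely on an iterate_chunks helper that is not shown in this
# excerpt. A minimal sketch of what it could look like, assuming it yields
# fixed-size byte chunks from a binary file object; the default chunk size
# here is an assumption, not the original value.
def iterate_chunks_sketch(file_obj, chunk_size=4096):
    """Yield successive byte chunks from file_obj until EOF."""
    while True:
        chunk = file_obj.read(chunk_size)
        if not chunk:
            break
        yield chunk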
def lambda_handler(request):
    """
    dynamically handle preview requests for bytes in S3

    caller must specify input_type (since there may be no file extension)

    Returns:
        JSON response
    """
    url = request.args['url']
    input_type = request.args.get('input')
    compression = request.args.get('compression')
    separator = request.args.get('sep') or ','
    exclude_output = request.args.get('exclude_output') == 'true'
    try:
        max_bytes = int(request.args.get('max_bytes', CATALOG_LIMIT_BYTES))
    except ValueError as error:
        return make_json_response(400, {
            'title': 'Unexpected max_bytes= value',
            'detail': str(error)
        })

    parsed_url = urlparse(url, allow_fragments=False)
    if not (parsed_url.scheme == 'https' and
            parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX) and
            parsed_url.username is None and
            parsed_url.password is None):
        return make_json_response(
            400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'})

    try:
        line_count = _str_to_line_count(
            request.args.get('line_count', str(CATALOG_LIMIT_LINES)))
    except ValueError as error:
        # format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected line_count= value',
            'detail': str(error)
        })

    # stream=True saves memory almost equal to file size
    resp = requests.get(url, stream=True)
    if resp.ok:
        content_iter = resp.iter_content(CHUNK)
        if input_type == 'csv':
            html, info = extract_csv(
                get_preview_lines(content_iter, compression, line_count, max_bytes),
                separator)
        elif input_type == 'excel':
            html, info = extract_excel(get_bytes(content_iter, compression))
        elif input_type == 'fcs':
            html, info = extract_fcs(get_bytes(content_iter, compression))
        elif input_type == 'ipynb':
            html, info = extract_ipynb(get_bytes(content_iter, compression), exclude_output)
        elif input_type == 'parquet':
            html, info = extract_parquet(get_bytes(content_iter, compression))
        elif input_type == 'vcf':
            html, info = extract_vcf(
                get_preview_lines(content_iter, compression, line_count, max_bytes))
        elif input_type in TEXT_TYPES:
            html, info = extract_txt(
                get_preview_lines(content_iter, compression, line_count, max_bytes))
        else:
            assert False, f'unexpected input_type: {input_type}'

        assert isinstance(html, str), 'expected html parameter as string'
        assert isinstance(info, dict), 'expected info metadata to be a dict'
        ret_val = {
            'info': info,
            'html': html,
        }
    else:
        ret_val = {
            'error': resp.reason,
            'text': resp.text,
        }

    return make_json_response(resp.status_code, ret_val)
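# lambda_handler calls _str_to_line_count, which is not shown in this excerpt.
# A minimal sketch under the assumption that it parses the line_count= query
# parameter and rejects values outside a sane range; the bounds below (1 up to
# CATALOG_LIMIT_LINES) are assumptions, not the original limits.
def _str_to_line_count_sketch(int_string, lower=1, upper=CATALOG_LIMIT_LINES):
    """Parse a line-count string; raise ValueError if not in [lower, upper]."""
    parsed = int(int_string)  # raises ValueError on non-integer input
    if not lower <= parsed <= upper:
        raise ValueError(f'{parsed} out of range: [{lower}, {upper}]')
    return parsed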