Example #1
0
def get_plain_text(bucket, key, size, compression, *, etag, s3_client, version_id):
    """Fetch an S3 object and return its contents as plain text.

    Fetches at most ELASTIC_LIMIT_BYTES from S3 and previews at most
    ELASTIC_LIMIT_LINES lines. Returns an empty string when the object
    body cannot be decoded as text.
    """
    try:
        response = retry_s3(
            "get",
            bucket,
            key,
            size,
            etag=etag,
            s3_client=s3_client,
            limit=ELASTIC_LIMIT_BYTES,
            version_id=version_id
        )
        preview = get_preview_lines(
            response["Body"],
            compression,
            ELASTIC_LIMIT_LINES,
            ELASTIC_LIMIT_BYTES
        )
        return '\n'.join(preview)
    except UnicodeDecodeError as ex:
        # best-effort: log and fall back to an empty preview
        print(f"Unicode decode error in {key}", ex)
        return ""
Example #2
0
 def test_txt_max_bytes(self):
     """preview of a two-line text file is truncated to CATALOG_LIMIT_BYTES"""
     path = BASE_DIR / 'two-line.txt'
     limit_lines = 500
     limit_bytes = 5
     with open(path, 'rb') as handle:
         result = get_preview_lines(
             iterate_chunks(handle), None, limit_lines, limit_bytes)
     assert len(result) == 1, 'failed to truncate bytes'
     assert result[0] == '1234😊', 'failed to truncate bytes'
Example #3
0
    def test_long_gz(self):
        """preview a gzipped text file that contains many lines"""
        path = BASE_DIR / 'long.txt.gz'
        limit_lines = 500
        limit_bytes = 10000
        with open(path, 'rb') as handle:
            result = get_preview_lines(
                iterate_chunks(handle), 'gz', limit_lines, limit_bytes)

        assert len(result) == limit_lines, 'unexpected number of lines'
        assert result[0] == 'Line 1', 'unexpected first line'
        assert result[-1] == f'Line {limit_lines}', 'unexpected last line'
Example #4
0
 def test_txt_max_bytes_one_line(self):
     """single-line preview is truncated to CATALOG_LIMIT_BYTES"""
     path = BASE_DIR / 'one-line.txt'
     limit_lines = 500
     limit_bytes = 8
     chunk_bytes = 10
     with open(path, 'rb') as handle:
         result = get_preview_lines(
             iterate_chunks(handle, chunk_bytes), None, limit_lines, limit_bytes)
     assert len(result) == 1, 'failed to truncate bytes'
     assert result[0] == '🚷🚯', 'failed to truncate bytes'
Example #5
0
def lambda_handler(request):
    """
    dynamically handle preview requests for bytes in S3
    caller must specify input_type (since there may be no file extension)

    Query args:
        url: S3 virtual-host https URL of the object to preview (required)
        input: input type selector (csv/excel/fcs/ipynb/parquet/vcf/text)
        compression: optional compression of the object body (e.g. 'gz')
        sep: CSV separator (defaults to ',')
        exclude_output: 'true' to strip notebook cell outputs
        max_bytes / line_count: preview truncation limits

    Returns:
        JSON response (400 on bad arguments, upstream status otherwise)
    """
    url = request.args['url']
    input_type = request.args.get('input')
    compression = request.args.get('compression')
    separator = request.args.get('sep') or ','
    exclude_output = request.args.get('exclude_output') == 'true'
    try:
        max_bytes = int(request.args.get('max_bytes', CATALOG_LIMIT_BYTES))
    except ValueError as error:
        # error format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected max_bytes= value',
            'detail': str(error)
        })

    # only proxy https URLs on the S3 virtual-host domain, with no
    # embedded credentials, to avoid SSRF-style abuse of this endpoint
    parsed_url = urlparse(url, allow_fragments=False)
    if not (parsed_url.scheme == 'https'
            and parsed_url.netloc.endswith(S3_DOMAIN_SUFFIX)
            and parsed_url.username is None and parsed_url.password is None):
        return make_json_response(
            400, {'title': 'Invalid url=. Expected S3 virtual-host URL.'})

    try:
        line_count = _str_to_line_count(
            request.args.get('line_count', str(CATALOG_LIMIT_LINES)))
    except ValueError as error:
        # format https://jsonapi.org/format/1.1/#error-objects
        return make_json_response(400, {
            'title': 'Unexpected line_count= value',
            'detail': str(error)
        })

    # stream=True saves memory almost equal to file size
    resp = requests.get(url, stream=True)
    if resp.ok:
        content_iter = resp.iter_content(CHUNK)
        if input_type == 'csv':
            html, info = extract_csv(
                get_preview_lines(content_iter, compression, line_count,
                                  max_bytes), separator)
        elif input_type == 'excel':
            html, info = extract_excel(get_bytes(content_iter, compression))
        elif input_type == 'fcs':
            html, info = extract_fcs(get_bytes(content_iter, compression))
        elif input_type == 'ipynb':
            html, info = extract_ipynb(get_bytes(content_iter, compression),
                                       exclude_output)
        elif input_type == 'parquet':
            html, info = extract_parquet(get_bytes(content_iter, compression))
        elif input_type == 'vcf':
            html, info = extract_vcf(
                get_preview_lines(content_iter, compression, line_count,
                                  max_bytes))
        elif input_type in TEXT_TYPES:
            html, info = extract_txt(
                get_preview_lines(content_iter, compression, line_count,
                                  max_bytes))
        else:
            # BUGFIX: was `assert False`, which is stripped under
            # `python -O` and otherwise crashes with an AssertionError
            # (HTTP 500). An unrecognized input= is a client error, so
            # answer 400 like the other validation paths above.
            return make_json_response(
                400, {'title': f'Unexpected input_type: {input_type}'})

        # internal invariants of the extract_* helpers, not input validation
        assert isinstance(html, str), 'expected html parameter as string'
        assert isinstance(info, dict), 'expected info metadata to be a dict'

        ret_val = {
            'info': info,
            'html': html,
        }
    else:
        # propagate the upstream failure to the caller
        ret_val = {
            'error': resp.reason,
            'text': resp.text,
        }

    return make_json_response(resp.status_code, ret_val)