Exemple #1
0
    def test_bad_line_count(self):
        """Send known-bad line_count parameters and expect 400 responses.

        Covers both failure modes: an integer that is out of range, and a
        value that cannot be parsed as an integer at all.
        """
        # (garbage value, expected substring of the 400 'detail' field)
        cases = [
            ('-1', 'out of range'),            # parseable int, invalid value
            ('123notint', 'invalid literal'),  # not parseable as an int
        ]
        for garbage, expected_detail in cases:
            event = self._make_event(
                {
                    'url': self.FILE_URL,
                    'input': 'txt',
                    'line_count': garbage
                }, {'origin': MOCK_ORIGIN})
            resp = index.lambda_handler(event, None)
            assert resp['statusCode'] == 400, \
                f'Expected 400 on event with line_count of {garbage}'
            body = json.loads(read_body(resp))
            assert 'Unexpected line_count=' in body['title'], \
                'Expected 400 explanation'
            assert expected_detail in body['detail'], 'Expected 400 explanation'
Exemple #2
0
    def test_ipynb(self):
        """test sending ipynb bytes"""
        notebook = BASE_DIR / 'nb_1200727.ipynb'
        # serve the notebook bytes from the mocked file URL
        responses.add(responses.GET,
                      self.FILE_URL,
                      body=notebook.read_bytes(),
                      status=200)
        event = self._make_event({'url': self.FILE_URL, 'input': 'ipynb'})
        resp = index.lambda_handler(event, None)
        body = json.loads(read_body(resp))
        assert resp['statusCode'] == 200, 'preview failed on nb_1200727.ipynb'
        body_html = body['html']

        # neither lxml, nor py_w3c.validators.html.validator works to validate
        # these fragments; reasons include base64 encoded images, html entities, etc.
        # so we are going to trust nbconvert and just do some basic sanity checks
        # it is also the case that we (often) need to update nbconvert, and
        # HTML output changes version over version, so checking for exact HTML
        # is fragile
        # sanity: tags present and balanced
        assert body_html.count('<div') > 0, 'expected divs in ipynb HTML'
        assert body_html.count('<div') == body_html.count('</div>')
        assert body_html.count('<span') > 0, 'expected spans in ipynb HTML'
        assert body_html.count('<span') == body_html.count('</span>')
        # check for some strings we know should be in there
        assert 'SVD of Minute-Market-Data' in body_html, 'missing expected contents'
        assert 'Preprocessing' in body_html, 'missing expected contents'
        assert '<pre>[&#39;SEE&#39;, &#39;SE&#39;, &#39;SHW&#39;, &#39;SIG&#39;,' in body_html, \
            'Cell 3 output seems off'
        assert (
            '<span class="n">batch_size</span><span class="o">=</span><span class="mi">100</span>'
            '<span class="p">') in body_html, 'Last cell output missing'
Exemple #3
0
def test_generate_thumbnail(data_dir, input_file, thumb_size, expected_thumb,
                            expected_original_size, expected_thumb_size):
    """Parametrized end-to-end check of the thumbnail lambda: mock the image
    download, invoke the handler, then compare the reported sizes and the
    decoded pixel data against the expected thumbnail fixture."""
    # Resolve the input file path
    input_file = data_dir / input_file

    # Mock the request
    url = f"https://example.com/{input_file}"
    responses.add(responses.GET,
                  url=url,
                  body=input_file.read_bytes(),
                  status=200)

    # Create the lambda request event
    event = _make_event({"url": url, "size": thumb_size})

    # Get the response
    response = lambda_handler(event, None)

    # Assert the request was handled with no errors
    assert response["statusCode"] == 200

    # Parse the body / the returned thumbnail
    body = json.loads(read_body(response))

    # Assert basic metadata was filled properly
    assert body["info"]["original_size"] == expected_original_size
    assert body["info"]["thumbnail_size"] == expected_thumb_size

    # Assert the produced image is the same as the expected
    # (thumbnail travels base64-encoded in the JSON body)
    actual = AICSImage(base64.b64decode(body['thumbnail'])).reader.data
    expected = AICSImage(data_dir / expected_thumb).reader.data
    assert np.array_equal(actual, expected)
Exemple #4
0
    def test_tsv_quote(self):
        """test TSV from the glue NLP dataset"""
        csv = BASE_DIR / 'dev.tsv'
        responses.add(responses.GET,
                      self.FILE_URL,
                      body=csv.read_bytes(),
                      status=200)
        event = self._make_event(
            {'url': self.FILE_URL, 'input': 'csv', 'sep': '\t'})
        resp = index.lambda_handler(event, None)
        body = json.loads(read_body(resp))
        assert resp['statusCode'] == 200, f'preview failed on {csv}'

        # spot-check cells whose values exercise quote handling
        html = body['html']
        expected_cells = (
            "<td>While dioxin levels in the environment were up",
            "<td>In Soviet times the Beatles ' music \" was cons...</td>",
        )
        for cell in expected_cells:
            assert cell in html, "missing expected cell"

        # the parser should have warned about skipped malformed lines
        warnings = body['info']['warnings']
        assert warnings, f"expected warnings when parsing {csv}"
        assert warnings.count(
            "Skipping line") == 43, "expected to skip 43 lines"
Exemple #5
0
 def test_vcf_gz_partial(self):
     """test previewing part of a gzipped file
     we _should_ read 4 whole chunks and one partial one;
     and the preview endpoint should truncate to the last whole line
     """
     vcf = BASE_DIR / 'example.vcf.gz'
     # guard: the fixture must be large enough to force a partial final chunk
     assert os.path.getsize(
         vcf) > 128 * 5, 'not testing partial file decode'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=vcf.read_bytes(),
                   status=200)
     event = self._make_event({
         'url': self.FILE_URL,
         'input': 'vcf',
         'compression': 'gz'
     })
     # test partial decode
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp[
         'statusCode'] == 200, 'preview failed on example.vcf.gz, partial decode'
     data = body['info']['data']
     # a partial read should yield only meta lines: no data rows, no header
     assert not data['data'], 'partial decode; did not expect any data'
     assert not data['header'], 'partial decode; did not expect a header'
     assert data['meta'][0] == '##fileformat=VCFv4.0', 'bad first meta line'
     assert data['meta'][-1].startswith('##FILTER=<'), 'bad last meta line'
     # the truncated tail must still end on a whole line
     assert data['meta'][-1].endswith(
         'samples have data">'), 'bad last meta line'
     meta = body['info']['metadata']
     assert meta['variant_count'] == 0, 'expected no variants'
     assert not body['info']['metadata']['variants'], 'expected no variants'
Exemple #6
0
 def test_tsv_as_csv(self):
     """test returning HTML previews of mislabeled or problematic CSVs (via pandas)"""
     csv = BASE_DIR / 'tsv_mixed_types.csv'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=csv.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'csv'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, f'preview failed on {csv}'
     html = body['html']
     # exactly one table / table head, and balanced paragraph tags
     for tag in ('<table', '</table>'):
         assert html.count(tag) == 1, 'expected one HTML table'
     for tag in ('<thead>', '</thead>'):
         assert html.count(tag) == 1, 'expected one HTML table head'
     assert html.count('<p>') == html.count('</p>'), 'malformed HTML'
     assert '<td>Taiwan Strait, Taiwan (general), Taiwan</td>' in html, \
         'Missing a cell on the Taiwan Strait'
     assert not re.match(r'\d+ rows × \d+ columns', html), \
         'table dimensions should be removed'
     # leading columns pinned by a golden-file fixture
     with open(BASE_DIR /
               'tsv_mixed_types_html_response_head.txt') as expected:
         assert expected.read() in html, 'unexpected first columns'
Exemple #7
0
 def test_tsv(self):
     """test returning HTML previews of TSV (via pandas)"""
     csv = BASE_DIR / 'avengers.tsv'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=csv.read_bytes(),
                   status=200)
     event = self._make_event(
         {'url': self.FILE_URL, 'input': 'csv', 'sep': '\t'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, f'preview failed on {csv}'
     html = body['html']
     # exactly one table / table head, and balanced paragraph tags
     for tag in ('<table', '</table>'):
         assert html.count(tag) == 1, 'expected one HTML table'
     for tag in ('<thead>', '</thead>'):
         assert html.count(tag) == 1, 'expected one HTML table head'
     assert html.count('<p>') == html.count('</p>'), 'malformed HTML'
     assert '<td>Nicholas Fury, Jr., Marcus Johnson</td>' in html, \
         'Expected Nick to be an Avenger'
     assert not re.match(r'\d+ rows × \d+ columns', html), \
         'table dimensions should be removed'
     # leading columns pinned by a golden-file fixture
     with open(BASE_DIR / 'tsv_html_response_head.txt') as expected:
         assert expected.read() in html, 'unexpected first columns'
Exemple #8
0
 def test_vcf(self):
     """test sending vcf bytes"""
     vcf_path = BASE_DIR / 'example.vcf'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=vcf_path.read_bytes(),
         status=200,
     )
     event = self._make_event({'url': self.FILE_URL, 'input': 'vcf'})
     resp = index.lambda_handler(event, None)
     # detailed VCF content checks are delegated to the shared helper
     assert resp['statusCode'] == 200, 'preview failed on example.vcf'
     _check_vcf(read_body(resp))
Exemple #9
0
 def test_bad_hostname(self):
     """a URL that does not point at S3 must be rejected with a 400"""
     bad_url = 'https://example.com/foo'
     event = self._make_event(
         {'url': bad_url, 'input': 'txt'},
         {'origin': MOCK_ORIGIN},
     )
     resp = index.lambda_handler(event, None)
     assert resp[
         'statusCode'] == 400, 'Expected 400 on event with a non-S3 URL'
     body = json.loads(read_body(resp))
     assert 'S3' in body['title'], 'Expected 400 explanation'
Exemple #10
0
    def test_folder_view_paging(self):
        """
        End-to-end test (top-level folder view with a limit & offset)
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        params = dict(
            bucket=bucket,
            manifest=key,
            action="dir",
            params={
                "path": "paging_test/",
                "limit": 10,
                "offset": 10,
            },
        )

        # NOTE(review): expected_args is never asserted against; presumably it
        # documents the intended S3 Select call shape — confirm or assert it
        expected_args = {
            'Bucket': bucket,
            'Key': key,
            'Expression': "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s",
            'ExpressionType': 'SQL',
            'InputSerialization': {
                'CompressionType': 'NONE',
                'JSON': {'Type': 'LINES'}
                },
            'OutputSerialization': {'JSON': {'RecordDelimiter': '\n'}},
        }

        # 1000 synthetic logical keys: f000.csv ... f999.csv
        paging_logical_keys = [
            f"f{i:03d}.csv" for i in range(1000)
        ]
        s3response_paging = self.make_manifest_query(paging_logical_keys)

        mock_s3 = boto3.client('s3')
        # side_effect order matters: first select call returns the manifest
        # entries, the second returns the package metadata
        with patch.object(
            mock_s3,
            'select_object_content',
            side_effect=[
                s3response_paging,
                self.s3response_meta
            ]
        ) as client_patch, patch(
            'boto3.Session.client',
            return_value=mock_s3
        ):
            response = pkgselect.lambda_handler(params, None)
            print(response)
            folder = json.loads(read_body(response))['result']
            # offset=10, limit=10 -> page starts at f010.csv; total is unpaged
            assert len(folder['prefixes']) == 0
            assert len(folder['objects']) == 10
            assert folder['total'] == 1000
            assert folder['objects'][0]['logical_key'] == 'f010.csv'
Exemple #11
0
    def test_fcs(self):
        """test fcs extraction
        for extended testing you can download FCS files here
        https://flowrepository.org/experiments/4/download_ziped_files,
        copy to data/fcs/ and run this unit test
        """
        parent = BASE_DIR / "fcs"
        fcs_files = list(parent.glob("*.fcs"))
        # "extended" mode kicks in when the data dir contains anything beyond
        # the three checked-in fixtures; per-file name assertions are skipped
        extended = False
        if (set(os.path.split(f)[1] for f in fcs_files) != set(
            ['accuri-ao1.fcs', 'bad.fcs', '3215apc 100004.fcs'])):
            extended = True
        first = True
        for fcs in fcs_files:
            _, name = os.path.split(fcs)
            file_bytes = fcs.read_bytes()
            # the responses library needs add() for the first registration and
            # replace() to swap the body on later iterations of the same URL
            if first:
                responses.add(
                    responses.GET,
                    self.FILE_URL,
                    body=file_bytes,
                    status=200,
                )
                first = False
            else:
                responses.replace(
                    responses.GET,
                    self.FILE_URL,
                    body=file_bytes,
                    status=200,
                )

            event = self._make_event({'url': self.FILE_URL, 'input': 'fcs'})
            resp = index.lambda_handler(event, None)
            assert resp[
                'statusCode'] == 200, f'Expected 200, got {resp["statusCode"]}'
            body = json.loads(read_body(resp))
            assert 'info' in body
            # three outcomes: clean parse (html + metadata), unreadable file
            # (warning, no metadata), or partial parse (warning + metadata)
            if 'warnings' not in body['info']:
                if not extended:
                    assert name == 'accuri-ao1.fcs'
                assert body['html'].startswith('<div>')
                assert body['html'].endswith('</div>')
                assert body['info']['metadata'].keys()
            else:
                assert not body['html']
                if 'metadata' not in body['info']:
                    assert body['info']['warnings'].startswith('Unable')
                    if not extended:
                        assert name == 'bad.fcs'
                else:
                    if not extended:
                        assert name == '3215apc 100004.fcs'
Exemple #12
0
    def test_empty_manifest(self):
        """
        End-to-end test (folder view without a prefix) for an
        empty package manifest
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        params = dict(bucket=bucket,
                      manifest=key,
                      access_key="TESTKEY",
                      secret_key="TESTSECRET",
                      session_token="TESTSESSION")

        # NOTE(review): expected_args is never asserted against; presumably it
        # documents the intended S3 Select call shape — confirm or assert it
        expected_args = {
            'Bucket': bucket,
            'Key': key,
            'Expression':
            "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s",
            'ExpressionType': 'SQL',
            'InputSerialization': {
                'CompressionType': 'NONE',
                'JSON': {
                    'Type': 'LINES'
                }
            },
            'OutputSerialization': {
                'JSON': {
                    'RecordDelimiter': '\n'
                }
            },
        }

        # Empty manifest
        # a manifest with only the version/message header row — no entries
        jsonl = '{"version": "v0", "message": null}'
        streambytes = jsonl.encode()
        non_string_s3response = self.make_s3response(streambytes)

        mock_s3 = boto3.client('s3')
        # side_effect order: first select call -> manifest rows, second -> meta
        with patch.object(mock_s3,
                          'select_object_content',
                          side_effect=[
                              non_string_s3response, self.s3response_meta
                          ]) as client_patch, patch('boto3.Session.client',
                                                    return_value=mock_s3):
            response = lambda_handler(self._make_event(params), None)
            print(response)
            assert response['statusCode'] == 200
            folder = json.loads(read_body(response))['contents']
            # an empty manifest yields an empty listing, not an error
            assert not folder['prefixes']
            assert not folder['objects']
            assert folder['total'] == 0
Exemple #13
0
def test_generate_thumbnail(
        data_dir,
        input_file,
        params,
        expected_thumb,
        expected_original_size,
        expected_thumb_size,
        num_pages,
        status
):
    """Parametrized end-to-end check of the thumbnail lambda over image and
    PDF inputs: mock the download, invoke the handler, and compare metadata
    and pixel data against the expected fixture."""
    # don't actually modify the environment in tests
    with patch.object(index, 'set_pdf_env', return_value=None) as set_env:
        # Resolve the input file path
        input_file = data_dir / input_file
        # Mock the request
        url = f"https://example.com/{input_file}"
        responses.add(
            responses.GET,
            url=url,
            body=input_file.read_bytes(),
            status=200
        )
        # Create the lambda request event
        event = _make_event({"url": url, **params})
        # Get the response
        response = index.lambda_handler(event, None)
        # Assert the request was handled with no errors
        assert response["statusCode"] == 200, f"response: {response}"
        # only check the body and expected image if it's a successful call
        # Parse the body / the returned thumbnail
        body = json.loads(read_body(response))
        # Assert basic metadata was filled properly
        assert body["info"]["thumbnail_size"] == expected_thumb_size
        if expected_original_size:  # PDFs don't have an expected size
            assert body["info"]["original_size"] == expected_original_size
        if "countPages" in params:
            assert body["info"]["page_count"] == num_pages
        # Assert the produced image is the same as the expected
        if params.get('input') == 'pdf':
            # PDF rendering varies slightly across platforms, so compare with
            # a tolerance rather than exact equality
            actual = Image.open(BytesIO(base64.b64decode(body['thumbnail'])))
            expected = Image.open(data_dir / expected_thumb)
            actual_array = np.array(actual)
            expected_array = np.array(expected)
            # set_pdf_env should be called exactly once per PDF invocation
            assert set_env.call_count == 1
            assert actual_array.shape == expected_array.shape
            assert np.allclose(expected_array, actual_array, atol=15, rtol=0.1)
        else:
            # non-PDF images must round-trip bit-exactly
            actual = AICSImage(base64.b64decode(body['thumbnail'])).reader.data
            expected = AICSImage(data_dir / expected_thumb).reader.data
            assert np.array_equal(actual, expected)
Exemple #14
0
 def test_bad_max_bytes(self):
     """send a known bad max_bytes parameter and expect a 400"""
     garbage = 'gfgfgf'
     event = self._make_event(
         {
             'url': self.FILE_URL,
             'input': 'txt',
             'max_bytes': garbage
         }, {'origin': MOCK_ORIGIN})
     resp = index.lambda_handler(event, None)
     # fixed: failure message previously said "line_count" even though this
     # test exercises the max_bytes parameter
     assert resp[
         'statusCode'] == 400, f'Expected 400 on event with max_bytes of {garbage}'
     body = json.loads(read_body(resp))
     assert 'Unexpected max_bytes=' in body[
         'title'], 'Expected 400 explanation'
Exemple #15
0
 def test_parquet_empty(self):
     """test a parquet file with columns but no rows"""
     parquet = BASE_DIR / 'onlycolumns-c000'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=parquet.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'parquet'})
     resp = index.lambda_handler(event, None)
     assert resp[
         'statusCode'] == 200, f'Expected 200, got {resp["statusCode"]}'
     body = json.loads(read_body(resp))
     # column headers must render even though there are no data rows
     for column in ('column_a', 'column_k', 'column_z'):
         assert f'<th>{column}</th>' in body['html'], f'Missing {column}'
Exemple #16
0
def test_generate_thumbnail(data_dir, input_file, params, expected_thumb,
                            expected_original_size, expected_thumb_size,
                            num_pages, status):
    """Parametrized end-to-end check of the thumbnail lambda: mock the
    download, invoke the handler, and compare the returned metadata header
    and binary thumbnail body against the expected fixture."""
    # Resolve the input file path
    input_file = data_dir / input_file
    # Mock the request
    url = f"https://example.com/{input_file}"
    responses.add(responses.GET,
                  url=url,
                  body=input_file.read_bytes(),
                  status=200)
    # Create the lambda request event
    event = _make_event({"url": url, **params})
    # Get the response
    if expected_thumb == "I16-mode-128-fallback.png":
        # Note that if this set of params fails, it may be that better resamplers
        # have been added for this mode, and either the image or test will need
        # to be updated.
        with _mock(t4_lambda_thumbnail, '_convert_I16_to_L', Image.fromarray):
            response = t4_lambda_thumbnail.lambda_handler(event, None)
    else:
        response = t4_lambda_thumbnail.lambda_handler(event, None)

    # Assert the request was handled with no errors
    assert response["statusCode"] == 200, f"response: {response}"
    # only check the body and expected image if it's a successful call
    # Parse the body / the returned thumbnail
    body = read_body(response)
    # Assert basic metadata was filled properly
    # (metadata travels in a response header here, not in the JSON body)
    info = json.loads(response["headers"][QUILT_INFO_HEADER])
    assert info["thumbnail_size"] == expected_thumb_size
    if expected_original_size:  # PDFs don't have an expected size
        assert info["original_size"] == expected_original_size
    if "countPages" in params:
        assert info["page_count"] == num_pages
    # Assert the produced image is the same as the expected
    if params.get('input') in ('pdf', "pptx"):
        # rendered documents vary slightly across platforms, so compare
        # with a tolerance rather than exact equality
        actual = Image.open(BytesIO(body))
        expected = Image.open(data_dir / expected_thumb)
        actual_array = np.array(actual)
        expected_array = np.array(expected)
        assert actual_array.shape == expected_array.shape
        assert np.allclose(expected_array, actual_array, atol=15, rtol=0.1)
    else:
        # plain images must round-trip bit-exactly
        actual = AICSImage(body)
        expected = AICSImage(data_dir / expected_thumb)
        assert actual.size() == expected.size()
        assert np.array_equal(actual.reader.data, expected.reader.data)
Exemple #17
0
 def test_excel(self):
     """test parsing excel files in S3"""
     workbook = BASE_DIR / 'sample.xlsx'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=workbook.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'excel'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview failed on sample.xlsx'
     # spot-check known token frequencies from the sample workbook
     html = body['html']
     expected_counts = {
         'Germany': 13,
         'Enterprise': 7,
         'Midmarket': 13,
         'Canada': 9,
     }
     for token, count in expected_counts.items():
         assert html.count(token) == count, 'unexpected data contents'
Exemple #18
0
 def test_ipynb_chop(self):
     """test that we eliminate output cells when we're in danger of breaking
     Lambda's invocation limit"""
     notebook = BASE_DIR / 'nb_1200727.ipynb'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=notebook.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'ipynb'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview failed on nb_1200727.ipynb'
     # isclose bc string sizes differ, e.g. on Linux
     html_size = len(body['html'])
     assert math.isclose(html_size, 18084,
                         abs_tol=200), "Hmm, didn't chop nb_1200727.ipynb"
Exemple #19
0
    def test_folder_view(self):
        """
        End-to-end test (folder view without a prefix)
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        params = dict(bucket=bucket,
                      manifest=key,
                      access_key="TESTKEY",
                      secret_key="TESTSECRET",
                      session_token="TESTSESSION")

        # NOTE(review): expected_args is never asserted against; presumably it
        # documents the intended S3 Select call shape — confirm or assert it
        expected_args = {
            'Bucket': bucket,
            'Key': key,
            'Expression':
            "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s",
            'ExpressionType': 'SQL',
            'InputSerialization': {
                'CompressionType': 'NONE',
                'JSON': {
                    'Type': 'LINES'
                }
            },
            'OutputSerialization': {
                'JSON': {
                    'RecordDelimiter': '\n'
                }
            },
        }

        mock_s3 = boto3.client('s3')
        # fixed: use context managers rather than start()/stop() so the patch
        # is guaranteed to be undone even when an assertion fails mid-test
        # (side_effect order: first select call -> listing, second -> meta)
        with patch.object(
                mock_s3,
                'select_object_content',
                side_effect=[self.s3response, self.s3response_meta]), \
             patch('boto3.Session.client', return_value=mock_s3):
            response = lambda_handler(self._make_event(params), None)
            print(response)
            assert response['statusCode'] == 200
            folder = json.loads(read_body(response))['contents']
            assert len(folder['prefixes']) == 1
            assert len(folder['objects']) == 1
            assert folder['objects'][0]['logical_key'] == 'foo.csv'
            assert folder['prefixes'][0]['logical_key'] == 'bar/'
Exemple #20
0
 def test_parquet(self):
     """test sending parquet bytes"""
     parquet = BASE_DIR / 'atlantic_storms.parquet'
     info_response = BASE_DIR / 'parquet_info_response.json'
     responses.add(
         responses.GET,
         self.FILE_URL,
         body=parquet.read_bytes(),
         status=200,
     )
     event = self._make_event({'url': self.FILE_URL, 'input': 'parquet'})
     resp = index.lambda_handler(event, None)
     assert resp['statusCode'] == 200, \
         f'Expected 200, got {resp["statusCode"]}'
     body = json.loads(read_body(resp))
     # the whole 'info' payload is pinned by a golden-file fixture
     with open(info_response, 'r') as info_json:
         expected = json.load(info_json)
     assert (body['info'] == expected), \
         f'Unexpected body["info"] for {parquet}'
Exemple #21
0
 def test_txt_short(self):
     """test sending txt bytes"""
     txt = BASE_DIR / 'short.txt'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=txt.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'txt'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview lambda failed on short.txt'
     head = body['info']['data']['head']
     tail = body['info']['data']['tail']
     # the whole 98-line fixture fits in 'head'; nothing spills into 'tail'
     assert len(head) == 98, 'unexpected number of lines head'
     assert head[0] == 'Line 1', 'unexpected first line in head'
     assert head[97] == 'Line 98', 'unexpected last line in head'
     assert not tail, 'expected empty tail'
Exemple #22
0
    def test_detail_view(self):
        """
        End-to-end test (detail view)
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        logical_key = "bar/file1.txt"
        params = dict(bucket=bucket,
                      manifest=key,
                      logical_key=logical_key,
                      access_key="TESTKEY",
                      secret_key="TESTSECRET",
                      session_token="TESTSESSION")

        # NOTE(review): expected_sql / expected_args are never asserted
        # against; presumably they document the intended S3 Select call —
        # confirm or assert them explicitly
        expected_sql = "SELECT s.* FROM s3object s WHERE s.logical_key = 'bar/file1.txt' LIMIT 1"
        expected_args = {
            'Bucket': bucket,
            'Key': key,
            'Expression':
            "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s",
            'ExpressionType': 'SQL',
            'InputSerialization': {
                'CompressionType': 'NONE',
                'JSON': {
                    'Type': 'LINES'
                }
            },
            'OutputSerialization': {
                'JSON': {
                    'RecordDelimiter': '\n'
                }
            },
        }

        mock_s3 = boto3.client('s3')
        # fixed: use context managers rather than start()/stop() so the patch
        # is guaranteed to be undone even when an assertion fails mid-test
        with patch.object(mock_s3,
                          'select_object_content',
                          return_value=self.s3response_detail), \
             patch('boto3.Session.client', return_value=mock_s3):
            response = lambda_handler(self._make_event(params), None)
            print(response)
            assert response['statusCode'] == 200
            # body must parse and carry a 'contents' key
            json.loads(read_body(response))['contents']
Exemple #23
0
 def test_max_bytes(self):
     """test max bytes"""
     txt = BASE_DIR / 'short.txt'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=txt.read_bytes(),
                   status=200)
     event = self._make_event(
         {'url': self.FILE_URL, 'input': 'txt', 'max_bytes': '3'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview lambda failed on short.txt'
     # with max_bytes so small, only a truncated first line comes back
     head = body['info']['data']['head']
     assert len(head) == 1, 'unexpected number of lines head'
     assert head[0] == 'Line', 'unexpected first line in head'
Exemple #24
0
 def test_csv(self):
     """test returning HTML previews of CSV (via pandas)"""
     csv = BASE_DIR / 'sample.csv'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=csv.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'csv'})
     resp = index.lambda_handler(event, None)
     body = json.loads(read_body(resp))
     assert resp['statusCode'] == 200, 'preview failed on sample.csv'
     html = body['html']
     # one table, balanced paragraph tags, no pandas dimension footer
     for tag in ('<table', '</table>'):
         assert html.count(tag) == 1, 'expected one HTML table'
     assert html.count('<p>') == html.count('</p>'), 'malformed HTML'
     assert not re.match(r'\d+ rows × \d+ columns', html), \
         'table dimensions should be removed'
     # leading columns pinned by a golden-file fixture
     with open(BASE_DIR / 'csv_html_response_head.txt') as expected:
         assert expected.read() in html, 'unexpected first columns'
Exemple #25
0
 def test_parquet_no_pandas(self):
     """test sending parquet bytes, but with a different metadata format"""
     parquet = BASE_DIR / 'parquet_no_pandas.snappy.parquet'
     responses.add(responses.GET,
                   self.FILE_URL,
                   body=parquet.read_bytes(),
                   status=200)
     event = self._make_event({'url': self.FILE_URL, 'input': 'parquet'})
     resp = index.lambda_handler(event, None)
     assert resp[
         'statusCode'] == 200, f'Expected 200, got {resp["statusCode"]}'
     body = json.loads(read_body(resp))
     # compare the preview output against metadata read directly from the file
     pf = pq.ParquetFile(parquet)
     html = body['html']
     for col in pf.schema.names:
         assert f'<th>{col}</th>' in html, \
             'missing a column header in the preview'
     assert html.count('<') > 0, 'expected tags in HTML'
     assert html.count('<') == html.count('>'), \
         'unmatched HTML tags'
     assert set(pf.schema.names) == set(body['info']['schema']['names']), \
         'unexpected difference of columns'
Exemple #26
0
    def test_folder_view(self):
        """
        End-to-end test (folder view without a prefix)
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        params = dict(
            bucket=bucket,
            manifest=key,
            action="dir",
        )

        # NOTE(review): expected_args is never asserted against; presumably it
        # documents the intended S3 Select call shape — confirm or assert it
        expected_args = {
            'Bucket': bucket,
            'Key': key,
            'Expression': "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s",
            'ExpressionType': 'SQL',
            'InputSerialization': {
                'CompressionType': 'NONE',
                'JSON': {'Type': 'LINES'}
                },
            'OutputSerialization': {'JSON': {'RecordDelimiter': '\n'}},
        }

        mock_s3 = boto3.client('s3')
        # side_effect order: first select call -> listing, second -> metadata
        with patch.object(
            mock_s3,
            'select_object_content',
            side_effect=[
                self.s3response,
                self.s3response_meta,
            ]
        ) as client_patch, patch('boto3.Session.client', return_value=mock_s3):
            response = pkgselect.lambda_handler(params, None)
            print(response)
            folder = json.loads(read_body(response))['result']
            # fixture manifest holds one object and one sub-folder prefix
            assert len(folder['prefixes']) == 1
            assert len(folder['objects']) == 1
            assert folder['objects'][0]['logical_key'] == 'foo.csv'
            assert folder['prefixes'][0]['logical_key'] == 'bar/'
Exemple #27
0
    def test_detail_view(self):
        """
        End-to-end test (detail view).

        Requests a single logical key from the manifest via a patched
        S3 Select client and verifies the response body parses and carries
        a 'result' payload.
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        logical_key = "bar/file1.txt"
        params = dict(
            bucket=bucket,
            manifest=key,
            action="file",
            params={"path": logical_key},
        )

        # NOTE(review): the original defined `expected_sql` and
        # `expected_args` but never asserted the mock was called with
        # them (dead code) — removed. A debug print(response) was
        # removed as well.
        mock_s3 = boto3.client('s3')
        with patch.object(
                mock_s3,
                'select_object_content',
                return_value=self.s3response_detail
        ) as client_patch, patch(
            'boto3.Session.client',
            return_value=mock_s3
        ):
            response = pkgselect.lambda_handler(params, None)
            # Smoke check: raises ValueError if the body is not JSON, or
            # KeyError if the 'result' payload is missing.
            json.loads(read_body(response))['result']
Exemple #28
0
    def test_anon_access(self):
        """
        Test anonymous call w/ ALLOW_ANONYMOUS_ACCESS.

        Patches the environment to enable anonymous access, stubs
        head_object, and patches select_object_content with the two canned
        responses (listing, then metadata) before invoking the handler.
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        params = dict(
            bucket=bucket,
            manifest=key,
        )

        # NOTE(review): the original built an `expected_args` dict here but
        # never asserted the mock was called with it (dead code) — removed.
        env_patcher = patch.dict(os.environ, {
            'AWS_ACCESS_KEY_ID': 'test_key',
            'AWS_SECRET_ACCESS_KEY': 'test_secret',
            'ALLOW_ANONYMOUS_ACCESS': '1'
        })

        mock_s3 = boto3.client('s3')
        client_patch = patch.object(
            mock_s3,
            'select_object_content',
            side_effect=[
                self.s3response,
                self.s3response_meta
            ]
        )
        # Distinct name: the original reused `response` for both the
        # head_object stub payload and the lambda result.
        head_object_response = {
            'ETag': '12345',
            'VersionId': '1.0',
            'ContentLength': 123,
        }
        expected_params = {
            'Bucket': bucket,
            'Key': key,
        }
        s3_stubber = Stubber(mock_s3)
        s3_stubber.add_response('head_object', head_object_response,
                                expected_params)
        # Context managers guarantee teardown even when an assertion fails;
        # the original start()/stop() pairs leaked the patched env vars,
        # client, and stubber into subsequent tests on failure.
        with env_patcher, client_patch, s3_stubber, \
                patch('boto3.Session.client', return_value=mock_s3):
            response = lambda_handler(self._make_event(params), None)
            assert response['statusCode'] == 200
            folder = json.loads(read_body(response))['contents']
            assert len(folder['prefixes']) == 1
            assert len(folder['objects']) == 1
            assert folder['objects'][0]['logical_key'] == 'foo.csv'
            assert folder['prefixes'][0]['logical_key'] == 'bar/'
Exemple #29
0
    def test_non_string_keys(self):
        """
        End-to-end test (folder view without a prefix).

        Exercises the folder view when every logical key in a column is a
        non-string value (numeric strings here), which is the case most
        likely to trip up type inference downstream.
        """
        bucket = "bucket"
        key = ".quilt/packages/manifest_hash"
        params = dict(
            bucket=bucket,
            manifest=key,
            action="dir",
        )

        # NOTE(review): the original built an `expected_args` dict here but
        # never asserted the mock was called with it (dead code) — removed.
        # Return a response with keys that are not strings (integers here).
        # The important test case is where all members of a column are
        # non-string.
        logical_keys = ["1", "2", "3"]
        # Use `lk`, not `key`: the original loop shadowed the manifest
        # `key` local defined above.
        entries = [
            json.dumps(dict(logical_key=lk, physical_key=lk, size=100))
            for lk in logical_keys
        ]
        streambytes = "\n".join(entries).encode()
        non_string_s3response = self.make_s3response(streambytes)

        mock_s3 = boto3.client('s3')
        with patch.object(
            mock_s3,
            'select_object_content',
            side_effect=[
                non_string_s3response,
                self.s3response_meta
            ]
        ) as client_patch, patch(
            'boto3.Session.client',
            return_value=mock_s3
        ):
            response = pkgselect.lambda_handler(params, None)
            folder = json.loads(read_body(response))['result']
            assert not folder['prefixes']
            assert len(folder['objects']) == 3
            assert folder['objects'][0]['logical_key'] == '1'
            assert folder['objects'][1]['logical_key'] == '2'
            assert folder['objects'][2]['logical_key'] == '3'