Beispiel #1
0
def test_get_many(s3root, prefixes, expected):
    """Fetch many objects in one call and verify result ordering plus
    both behaviors for missing keys (raise vs. return_missing=True)."""

    def iter_objs(urls, objs):
        # Expand each url into (url, offset, size) triples, one per expected
        # range; a None range key stands for "fetch the whole object".
        for url in urls:
            for rng, exp in objs[url].items():
                if rng is None:
                    yield url, None, None
                else:
                    yield url, exp.range.req_offset, exp.range.req_size

    with S3() as s3:
        # 1) the non-missing case

        # to exercise result ordering, request keys in a
        # non-lexicographic (reverse-sorted) order
        not_missing_urls = [
            url for url, info in expected.items() if info[None].size is not None
        ]
        urls_in_order = sorted(not_missing_urls, reverse=True)
        ranges_in_order = [
            info.range
            for url in urls_in_order
            for info in expected[url].values()
        ]

        objs_in_order = [
            S3GetObject(*spec) for spec in iter_objs(urls_in_order, expected)
        ]
        s3objs = s3.get_many(list(objs_in_order), return_info=True)

        fetched_urls = [
            url for url in urls_in_order for _ in range(len(expected[url]))
        ]
        # results must come back in the order the keys were requested
        assert fetched_urls == [obj.url for obj in s3objs]
        assert_results(s3objs, expected, ranges_fetched=ranges_in_order)

        # 2) with missing items, the default behavior is to raise
        if not_missing_urls != list(expected.keys()):
            urls_in_order = sorted(expected.keys(), reverse=True)
            ranges_in_order = [
                info.range
                for url in urls_in_order
                for info in expected[url].values()
            ]
            objs_in_order = [
                S3GetObject(*spec)
                for spec in iter_objs(urls_in_order, expected)
            ]
            fetched_urls = [
                url for url in urls_in_order for _ in range(len(expected[url]))
            ]
            with pytest.raises(MetaflowS3NotFound):
                s3objs = s3.get_many(list(objs_in_order), return_info=True)

        # 3) with missing items and return_missing=True

        # keys are again requested non-lexicographically; missing files
        # must be returned in request order too. urls_in_order,
        # ranges_in_order and objs_in_order can be reused because at this
        # point they always describe the full key set.
        s3objs = s3.get_many(
            list(objs_in_order), return_missing=True, return_info=True
        )
        assert fetched_urls == [obj.url for obj in s3objs]
        assert_results(s3objs, expected, ranges_fetched=ranges_in_order)
Beispiel #2
0
def test_get_exceptions(s3root, prefixes, expected):
    """Error paths: get_many() goes via s3op while get() is a plain
    method, so both code paths are exercised for each failure mode."""
    # a bucket we have no access to
    denied_url = "s3://foobar/foo"
    with S3() as s3:
        with pytest.raises(MetaflowS3AccessDenied):
            s3.get_many([denied_url])
        with pytest.raises(MetaflowS3AccessDenied):
            s3.get(denied_url)
    # a key that does not exist under the root
    missing_key = "this_file_does_not_exist"
    with S3(s3root=s3root) as s3:
        with pytest.raises(MetaflowS3NotFound):
            s3.get_many([missing_key])
        with pytest.raises(MetaflowS3NotFound):
            s3.get(missing_key)
Beispiel #3
0
def test_put_files(tempdir, s3root, blobs, expected):
    """Upload local files via put_files() and read them back; also verify
    that overwrite=False refuses to re-upload keys that already exist."""

    def _files(source):
        # Accept both S3PutObject-like blobs (attribute access) and plain
        # (key, value) tuples; write each payload to a local file first so
        # put_files() has a real path to upload.
        for item in source:
            name = getattr(item, "key", item[0])
            payload = getattr(item, "value", item[1])
            ctype = getattr(item, "content_type", None)
            meta = getattr(item, "metadata", None)
            local_path = os.path.join(tempdir, name)
            with open(local_path, "wb") as fobj:
                fobj.write(payload)
            yield S3PutObject(
                key=name, value=local_path, content_type=ctype, metadata=meta
            )

    with S3(s3root=s3root) as s3:
        s3urls = s3.put_files(_files(blobs))
        assert list(dict(s3urls)) == list(dict(blobs))

    with S3() as s3:
        # fetch by full url
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)

    with S3(s3root=s3root) as s3:
        # fetch by key relative to the root
        s3objs = s3.get_many(k for k, _ in blobs)
        assert {obj.key for obj in s3objs} == {k for k, _ in blobs}

    # re-upload the same blobs, shuffled, with overwrite disabled:
    # nothing should actually be written
    shuffled_blobs = blobs[:]
    shuffle(shuffled_blobs)
    with S3(s3root=s3root) as s3:
        skipped = s3.put_files(_files(shuffled_blobs), overwrite=False)
        assert len(skipped) == 0

    with S3() as s3:
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)
    with S3(s3root=s3root) as s3:
        s3objs = s3.get_many(k for k, _ in shuffled_blobs)
        assert {obj.key for obj in s3objs} == {k for k, _ in shuffled_blobs}
Beispiel #4
0
def test_put_many(s3root, objs, expected):
    """put_many() uploads key/value pairs; results must preserve request
    order, and overwrite=False must skip keys that already exist."""
    with S3(s3root=s3root) as s3:
        s3urls = s3.put_many(objs)
        assert list(dict(s3urls)) == list(dict(objs))
        # results must line up one-to-one with the requested keys
        for idx, pair in enumerate(s3urls):
            assert objs[idx][0] == pair[0]
    with S3() as s3:
        # fetch by full url
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)
    with S3(s3root=s3root) as s3:
        # fetch by key relative to the root
        s3objs = s3.get_many(list(dict(objs)))
        assert {obj.key for obj in s3objs} == {key for key, _ in objs}

    # re-upload a deranged shuffle of the same objs with overwrite
    # disabled: no key may be written again
    shuffled_objs = deranged_shuffle(objs)
    with S3(s3root=s3root) as s3:
        skipped = s3.put_many(shuffled_objs, overwrite=False)
        assert len(skipped) == 0
    with S3() as s3:
        s3objs = s3.get_many(dict(s3urls).values())
        assert_results(s3objs, expected)
Beispiel #5
0
def test_init_options(s3root, pathspecs, expected):
    """Exercise every way of constructing an S3 client: prefix root,
    full-url root, absolute urls, the 'current' flow environment, and a
    Run object."""
    [pathspec] = pathspecs
    flow_name, run_id = pathspec.split("/")
    root_len = len(s3root)

    # option 1) s3root acts as a prefix for relative keys
    with S3(s3root=s3root) as s3:
        for url, info in expected.items():
            relative = url[root_len:]
            s3obj = s3.get(relative)
            assert s3obj.key == relative
            assert_results([s3obj], {url: info})
        # absolute urls are rejected when a root prefix is set
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://some/fake/address")

    # option 2) a full url used directly as s3root
    for url, info in expected.items():
        with S3(s3root=url) as s3:
            assert_results([s3.get()], {url: info})

    # option 3) absolute urls, no root at all
    with S3() as s3:
        for url, info in expected.items():
            s3obj = s3.get(url)
            assert s3obj.key == url
            assert_results([s3obj], {url: info})
        # relative keys and rootless bulk operations must be rejected
        with pytest.raises(MetaflowS3URLException):
            s3.get("suffix")
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://nopath")
        with pytest.raises(MetaflowS3URLException):
            s3.get_many(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_recursive(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_all()

    # option 4) the 'current' environment (fake a running flow)
    flow = FakeFlow(use_cli=False)

    parsed = urlparse(s3root)
    with pytest.raises(MetaflowS3URLException):
        # 'current' is not populated yet, so this must fail
        with S3(run=flow):
            pass

    current._set_env(
        FakeFlow(name=flow_name),
        run_id,
        "no_step",
        "no_task",
        "no_origin_run_id",
        "no_ns",
        "no_user",
    )

    with S3(bucket=parsed.netloc, prefix=parsed.path, run=flow) as s3:
        for url, info in expected.items():
            basename = url.split("/")[-1]
            s3obj = s3.get(basename)
            assert s3obj.key == basename
            assert_results([s3obj], {url: info})
        names = [url.split("/")[-1] for url in expected]
        s3objs = s3.get_many(names)
        assert {obj.key for obj in s3objs} == set(names)
        assert_results(s3objs, expected)
        assert_results(s3.get_all(), expected, info_should_be_empty=True)

    # option 5) a Run object
    if DO_TEST_RUN:
        # only works when a metadata service knows about this run
        namespace(None)
        with S3(bucket=parsed.netloc, prefix=parsed.path,
                run=Run(pathspec)) as s3:
            names = [url.split("/")[-1] for url in expected]
            assert_results(s3.get_many(names), expected)
Beispiel #6
0
 def _do():
     # return_missing=True exercises the most expensive code path
     with S3() as s3:
         fetched = s3.get_many(urls, return_missing=True)
     return fetched