from types import SimpleNamespace

import pytest

# parse_query is the function under test, provided by the surrounding package.


def test_parse_query():
    E = SimpleNamespace
    base = "s3://bucket/path/a/"

    assert parse_query(base) == E(base=base, depth=None, glob=None, file=None)
    assert parse_query(base + "some") == E(base=base + "some/", depth=None, glob=None, file=None)
    assert parse_query(base + "*") == E(base=base, depth=0, glob="*", file=None)
    assert parse_query(base + "*/*txt") == E(base=base, depth=1, glob="*txt", file=None)
    assert parse_query(base + "*/*/*txt") == E(base=base, depth=2, glob="*txt", file=None)
    assert parse_query(base + "*/*/file.txt") == E(base=base, depth=2, glob=None, file="file.txt")
    assert parse_query(base + "**/*txt") == E(base=base, depth=-1, glob="*txt", file=None)
    assert parse_query(base + "*/*/something/*yaml") == E(base=base, depth=3, glob="*yaml", file=None)

    with pytest.raises(ValueError):
        parse_query(base + "**/*/something/*yaml")
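# The assertions above pin down the query grammar `parse_query` is expected to
# implement: the longest wildcard-free prefix becomes `base`, the final path
# component becomes `glob` (if it has wildcards) or `file` (if it does not),
# `depth` counts directory levels between the two, and `**` (depth=-1) is only
# legal directly before the final pattern. A minimal sketch consistent with
# those assertions; this is an illustration, not the package's implementation:
from types import SimpleNamespace


def parse_query_sketch(uri):
    def is_glob(s):
        return any(c in s for c in "*?[")

    pfx = "s3://"
    assert uri.startswith(pfx)
    bucket, _, path = uri[len(pfx):].partition("/")
    parts = path.split("/")

    # Longest run of leading components without wildcards forms the base.
    i = 0
    while i < len(parts) and not is_glob(parts[i]):
        i += 1

    if i == len(parts):
        # No wildcards at all: a plain prefix query.
        return SimpleNamespace(base=uri.rstrip("/") + "/", depth=None, glob=None, file=None)

    base = pfx + bucket + "/" + "".join(p + "/" for p in parts[:i])
    rest = parts[i:]
    depth = len(rest) - 1

    if "**" in rest:
        # `**` is only valid directly before the final pattern.
        if rest[0] != "**" or len(rest) != 2:
            raise ValueError("Bad query: `**` must come right before the final pattern")
        depth = -1

    last = rest[-1]
    if is_glob(last):
        return SimpleNamespace(base=base, depth=depth, glob=last, file=None)
    return SimpleNamespace(base=base, depth=depth, glob=None, file=last)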
import sys
from types import SimpleNamespace

import click

# S3Fetcher, future_results, norm_predicate and parse_query come from
# the surrounding package. The click wiring below is assumed from the
# docstring's `s3-find` usage examples.


@click.command("s3-find")
@click.option("--skip-check", is_flag=True,
              help="Skip check that S3 key exists (assume it does).")
@click.argument("uri", type=str, nargs=1)
def cli(uri, skip_check):
    """List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """

    def do_file_query(qq, pred):
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        return (SimpleNamespace(url=url) for url in s3.dir_dir(qq.base, qq.depth))

    flush_freq = 100

    try:
        qq = parse_query(uri)
    except ValueError as e:
        click.echo(str(e), err=True)
        sys.exit(1)

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
    else:
        # fixed-depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    try:
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except Exception as e:
        print(f"ERROR: {e}")
        sys.exit(1)
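# The command can be exercised without a console entry point via click's test
# runner; the URI is a placeholder, and note this performs real S3 calls
# through S3Fetcher:
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(cli, ["s3://mybucket/some/path/*yaml"])
print(result.exit_code)
print(result.output)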
import logging
from types import SimpleNamespace

# S3Fetcher, future_results, norm_predicate and parse_query come from
# the surrounding package.


def s3_find_glob(glob_pattern: str, skip_check: bool):
    """Build a generator from the supplied S3 URI glob pattern.

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 keys by
        skip_check {bool} -- Skip validity check for S3 key

    Raises:
        ValueError: if the glob pattern cannot be parsed
    """

    def do_file_query(qq, pred):
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        return (SimpleNamespace(url=url) for url in s3.dir_dir(qq.base, qq.depth))

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood: {ve}")
        raise

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
    else:
        # fixed-depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    return stream
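# Typical consumption of the returned generator (bucket and prefix are
# placeholders); objects are yielded lazily, so output starts immediately
# rather than after the whole listing completes:
for o in s3_find_glob("s3://mybucket/some/path/**/*yaml", skip_check=False):
    print(o.url)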
import logging
from fnmatch import fnmatch
from types import SimpleNamespace
from typing import Any, Iterator, Optional

# S3Fetcher, future_results, norm_predicate, parse_query and s3_url_parse
# come from the surrounding package.


def s3_find_glob(
    glob_pattern: str,
    skip_check: bool = False,
    s3: Optional[S3Fetcher] = None,
    **kw,
) -> Iterator[Any]:
    """Build a generator from the supplied S3 URI glob pattern.

    Arguments:
        glob_pattern {str} -- Glob pattern to filter S3 keys by
        skip_check {bool} -- Skip validity check for S3 key

    Raises:
        ValueError: if the glob pattern cannot be parsed
    """
    if s3 is None:
        s3 = S3Fetcher()

    def do_file_query(qq, pred, dirs_pred=None):
        for d in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw):
            _, _files = s3.list_dir(d, **kw).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq, dirs_pred=None):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname, **kw) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq, dirs_pred=None):
        return (
            SimpleNamespace(url=url)
            for url in s3.dir_dir(qq.base, qq.depth, pred=dirs_pred, **kw)
        )

    try:
        qq = parse_query(glob_pattern)
    except ValueError as ve:
        logging.error(f"URI glob-pattern not understood: {ve}")
        raise

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base, **kw)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob, **kw)
        elif qq.file:
            postfix = "/" + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix), **kw)
    else:
        # fixed-depth query: prune directory listings so only directories
        # matching the leading glob components are descended into
        _, prefix = s3_url_parse(glob_pattern)
        dirs_glob = prefix.split("/")[:-1]

        def dirs_pred(f):
            n = f.count("/")
            _glob = "/".join(dirs_glob[:n]) + "/"
            return fnmatch(f, _glob)

        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred, dirs_pred=dirs_pred)
        elif qq.file is not None:
            stream = do_file_query2(qq, dirs_pred=dirs_pred)
        else:
            stream = do_dir_query(qq, dirs_pred=dirs_pred)

    return stream
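# How the directory-pruning predicate behaves, shown with plain strings.
# Values are hypothetical; `dir_dir` is assumed to yield directory keys with
# a trailing slash (e.g. "some/path/x/"), which the predicate matches against
# however many leading glob components fit that depth:
from fnmatch import fnmatch

dirs_glob = "some/path/*/*/metadata.yaml".split("/")[:-1]  # ['some', 'path', '*', '*']


def dirs_pred(f):
    n = f.count("/")
    _glob = "/".join(dirs_glob[:n]) + "/"
    return fnmatch(f, _glob)


assert dirs_pred("some/")            # depth 1 -> pattern "some/"
assert dirs_pred("some/path/")       # depth 2 -> pattern "some/path/"
assert dirs_pred("some/path/x/")     # depth 3 -> pattern "some/path/*/"
assert not dirs_pred("other/path/")  # leading component does not match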