# Example 1
def _read_documents_impl(sample_document_files):
    """Exercise read_documents() in both its output modes.

    :param sample_document_files: iterable of (filepath, expected_doc_count)
        pairs; filepath may be a str path, a pathlib.Path, or a URI.
    """
    # Mode 1: native path returned alongside each parsed document.
    for filepath, num_docs in sample_document_files:
        all_docs = list(read_documents(filepath))
        assert len(all_docs) == num_docs

        for path, doc in all_docs:
            assert isinstance(doc, dict)

        # Compare as strings so a pathlib.Path input matches the returned paths.
        assert {str(f) for f, _ in all_docs} == {str(filepath)}

    # Mode 2: URI returned alongside each parsed document.
    for filepath, num_docs in sample_document_files:
        all_docs = list(read_documents(filepath, uri=True))
        assert len(all_docs) == num_docs

        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        # `url` is already a URL; no second as_url() call is needed
        # (test_is_url below shows as_url() is the identity on URLs).
        url = as_url(filepath)
        if num_docs > 1:
            # Multi-document files get a '#part=N' fragment per document.
            expect_uris = [url + '#part={}'.format(i) for i in range(num_docs)]
        else:
            expect_uris = [url]

        assert [f for f, _ in all_docs] == expect_uris
# Example 2
def load_documents(path):
    """
    Load document/s from the specified path.

    At the moment can handle:

     - JSON and YAML locally and remotely.
     - Compressed JSON and YAML locally
     - Data Cube Dataset Documents inside local NetCDF files.

    :param path: path or URI to load documents from
    :return: generator of dicts
    """
    path = str(path)
    url = as_url(path)
    scheme = urlparse(url).scheme
    # Detect compression on the URL so it also applies to remote sources.
    compressed = url.endswith('.gz')

    if scheme == 'file' and path.endswith('.nc'):
        # NetCDF datasets are only readable from the local filesystem.
        path = uri_to_local_path(url)
        yield from load_from_netcdf(path)
    else:
        with _PROTOCOL_OPENERS[scheme](url) as fh:
            if compressed:
                fh = gzip.open(fh)
                # Strip '.gz' so the suffix lookup below sees the real format.
                path = path[:-3]

            # Choose the parser (e.g. JSON vs YAML) by remaining file suffix.
            suffix = Path(path).suffix

            parser = _PARSERS[suffix]

            yield from parser(fh)
# Example 3
def _test_read_docs_impl(sample_documents: Iterable[Tuple[str, int]]):
    """Check that read_documents(uri=True) yields (uri, dict) pairs.

    :param sample_documents: (document URL, expected document count) pairs
    """
    for doc_url, num_docs in sample_documents:
        all_docs = list(read_documents(doc_url, uri=True))
        assert len(all_docs) == num_docs

        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        # `url` is already a URL; a second as_url() call would be redundant
        # (test_is_url below shows as_url() is the identity on URLs).
        url = as_url(doc_url)
        if num_docs > 1:
            # Multi-document sources get a '#part=N' fragment per document.
            expect_uris = ['{}#part={}'.format(url, i) for i in range(num_docs)]
        else:
            expect_uris = [url]

        assert [f for f, _ in all_docs] == expect_uris
# Example 4
    def process_file(path):
        """Yield one (path-or-uri, document) pair per document in *path*."""
        docs = load_documents(path)

        if not uri:
            # Plain mode: pair every document with the path it came from.
            for doc in docs:
                yield path, doc
            return

        url = as_url(path)

        def tag_single(indexed_doc):
            # Sole document in the source: the bare URL identifies it.
            _, doc = indexed_doc
            return url, doc

        def tag_multi(indexed_doc):
            # One document of many: append a '#part=N' fragment via the index.
            idx, doc = indexed_doc
            return mk_part_uri(url, idx), doc

        yield from map_with_lookahead(enumerate(docs),
                                      if_one=tag_single,
                                      if_many=tag_multi)
def test_is_url(test_input, expected):
    """Check is_url() classification and as_url()'s pass-through on URLs."""
    got = is_url(test_input)
    assert got == expected
    if got:
        # An input that already is a URL must come back as the same object.
        assert as_url(test_input) is test_input