コード例 #1
0
def test_package_to_zip_source_remote(tmpdir):
    """Round-trip a package whose resource path is built from BASE_URL through a zip."""
    remote_path = BASE_URL % "data/table.csv"
    zip_path = os.path.join(tmpdir, "datapackage.zip")

    # Write: save the package to the zip archive
    source = Package(name="name", resources=[{"name": "name", "path": remote_path}])
    source.to_zip(zip_path)

    # Read: load the archive back and compare descriptor and rows
    expected_descriptor = {
        "name": "name",
        "resources": [{"name": "name", "path": remote_path}],
    }
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    loaded = Package(zip_path)
    assert loaded == expected_descriptor
    assert loaded.get_resource("name").read_rows() == expected_rows
コード例 #2
0
def test_package_to_zip_source_inline(tmpdir):
    """Round-trip a package with inline resource data through a zip archive."""
    zip_path = os.path.join(tmpdir, "datapackage.zip")
    data = [["id", "name"], ["1", "english"], ["2", "中国人"]]

    # Write: save the package to the zip archive
    source = Package(name="name", resources=[{"name": "name", "data": data}])
    source.to_zip(zip_path)

    # Read: load the archive back and compare descriptor and rows
    expected_descriptor = {
        "name": "name",
        "resources": [{"name": "name", "data": data}],
    }
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    loaded = Package(zip_path)
    assert loaded == expected_descriptor
    assert loaded.get_resource("name").read_rows() == expected_rows
コード例 #3
0
def test_package_compression_explicit_zip():
    """The "explicit-zip" resource of the compression package yields the expected rows."""
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    pkg = Package("data/compression/datapackage.json")
    zipped = pkg.get_resource("explicit-zip")
    assert zipped.read_rows() == expected_rows
コード例 #4
0
def test_package_to_zip(tmpdir):
    """Round-trip the package described by data/package.json through a zip archive."""
    zip_path = os.path.join(tmpdir, "datapackage.zip")

    # Write: save the package to the zip archive
    source = Package("data/package.json")
    source.to_zip(zip_path)

    # Read: load the archive back and compare descriptor and rows
    expected_descriptor = {
        "name": "name",
        "resources": [{"name": "name", "path": "table.csv"}],
    }
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    loaded = Package(zip_path)
    assert loaded == expected_descriptor
    assert loaded.get_resource("name").read_rows() == expected_rows
コード例 #5
0
def test_package_integrity_foreign_key_invalid():
    """After corrupting a cell in the second resource, row 2 reports a foreign-key error."""
    pkg = Package(DESCRIPTOR_FK)
    # Overwrite one cell of the second resource's inline data
    pkg.resources[1].data[3][0] = "bad"
    rows = pkg.get_resource("main").read_rows()

    # Validity is checked first, then the row contents
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].errors[0].code == "foreign-key-error"

    expected_dicts = [
        {"id": "1", "name": "Alex", "surname": "Martin", "parent_id": None},
        {"id": "2", "name": "John", "surname": "Dockins", "parent_id": "1"},
        {"id": "3", "name": "Walter", "surname": "White", "parent_id": "2"},
    ]
    for position, expected in enumerate(expected_dicts):
        assert rows[position].to_dict() == expected
コード例 #6
0
def test_package_integrity_foreign_key():
    """All three rows of the "main" resource are valid and have the expected contents."""
    pkg = Package(DESCRIPTOR_FK)
    rows = pkg.get_resource("main").read_rows()

    # Validity is checked first, then the row contents
    for position in range(3):
        assert rows[position].valid

    expected_dicts = [
        {"id": "1", "name": "Alex", "surname": "Martin", "parent_id": None},
        {"id": "2", "name": "John", "surname": "Dockins", "parent_id": "1"},
        {"id": "3", "name": "Walter", "surname": "White", "parent_id": "2"},
    ]
    for position, expected in enumerate(expected_dicts):
        assert rows[position].to_dict() == expected
コード例 #7
0
def load_datapackage(datapackage_ref, temp_dir_loc='./temp', return_type='df', set_index=True):
    """Load an external data package and return it in the requested form.

    Args:
        datapackage_ref: Mapping with the keys ``'package'`` (passed to
            ``Package``) and ``'resource'`` (name handed to ``get_resource``).
            When a dataframe is requested with ``set_index`` enabled it must
            also contain ``'external_fk_field'`` (a field name, or a
            single-element sequence holding one) and may contain
            ``'alt_indexes'``.
        temp_dir_loc: Not used by this function; kept so existing callers
            that pass it continue to work.
        return_type: One of ``'df'``, ``'resource'`` or ``'package'``.
        set_index: When truthy (and ``return_type == 'df'``), the dataframe is
            passed through ``assign_idx_fields`` before being returned.

    Returns:
        The ``Package``, the selected resource, or the result of
        ``resource.to_pandas()`` (optionally re-indexed), depending on
        ``return_type``.

    Raises:
        ValueError: If ``return_type`` is not one of the supported values.
        AssertionError: If more than one foreign-key field is supplied, or the
            field does not match exactly one schema field.
    """
    # Fail fast on a bad argument, before any package is loaded.
    if return_type not in ('df', 'resource', 'package'):
        raise ValueError('`return_type` must be one of ["df", "resource", "package"]')

    datapackage_resource = datapackage_ref['resource']
    external_datapackage = Package(datapackage_ref['package'])
    resource = external_datapackage.get_resource(datapackage_resource)

    if return_type == 'package':
        return external_datapackage

    if return_type == 'resource':
        return resource

    df_resource = resource.to_pandas()
    if not set_index:
        return df_resource

    fk_field = datapackage_ref['external_fk_field']
    # The original wrote ``len(x == 1)``, which raised TypeError for any
    # non-string value; the comparison belongs outside the ``len`` call.
    assert isinstance(fk_field, str) or len(fk_field) == 1, 'Only one primary key was expected to be matched on in the external datapackage'

    # Schema field names are compared as strings, so unwrap a single-element
    # sequence to the name it contains.
    fk_field_name = fk_field if isinstance(fk_field, str) else fk_field[0]

    schema_fields = resource.schema['fields']
    field_types = [field['type'] for field in schema_fields if field['name'] == fk_field_name]
    field_names = [field['name'] for field in schema_fields]
    assert len(field_types) == 1, f'Expected only one field type, instead received: {", ".join(field_types)} for {fk_field_name}, {field_names}'
    field_type = field_types[0]

    # ``alt_indexes`` is optional; None is forwarded when it is absent.
    alt_indexes = datapackage_ref['alt_indexes'] if 'alt_indexes' in datapackage_ref.keys() else None

    return assign_idx_fields(df_resource, fk_field_name, field_type, alt_indexes)
コード例 #8
0
def test_package_integrity_lookup():
    """The opened "main" resource exposes the expected `lookup` mapping."""
    expected_lookup = {
        "people": {
            ("firstname",): {("Walter",), ("Alex",), ("John",)},
        },
    }
    pkg = Package(DESCRIPTOR_FK)
    with pkg.get_resource("main") as opened:
        assert opened.lookup == expected_lookup
コード例 #9
0
def test_sql_storage_postgresql_constraints_not_valid_error(postgresql_url, name, cell):
    """Writing a row with an invalid cell raises an integrity or data error."""
    pkg = Package("data/storage/constraints.json")
    constraints = pkg.get_resource("constraints")

    # Plant the invalid cell at row index 1 of the inline data, in the
    # column of every schema field called `name`
    for column, schema_field in enumerate(constraints.schema.fields):
        if schema_field.name != name:
            continue
        constraints.data[1][column] = cell

    expected_errors = (sa.exc.IntegrityError, sa.exc.DataError)
    with pytest.raises(expected_errors):
        constraints.write(postgresql_url, dialect={"table": "table"})
コード例 #10
0
def test_package_integrity_read_lookup():
    """`read_lookup()` on the "main" resource returns the expected mapping."""
    expected_lookup = {
        "people": {
            ("firstname",): {("Walter",), ("Alex",), ("John",)},
        },
    }
    pkg = Package(DESCRIPTOR_FK)
    main = pkg.get_resource("main")
    assert main.read_lookup() == expected_lookup
コード例 #11
0
ファイル: test_package.py プロジェクト: roll/frictionless-py
def test_package_from_zip_remote():
    """A package opened from a zip located under BASEURL has the expected content."""
    expected_rows = [
        {"parent": "A3001", "comment": "comment1"},
        {"parent": "A3001", "comment": "comment2"},
        {"parent": "A5032", "comment": "comment3"},
    ]
    pkg = Package(BASEURL % "data/package.zip")
    assert pkg.name == "testing"
    assert len(pkg.resources) == 2
    assert pkg.get_resource("data2").read_rows() == expected_rows
コード例 #12
0
    def run():
        """Fetch each "content-pages" URL, normalise its HTML and save it.

        For every row of the "content-pages" resource the page is downloaded,
        stripped of scripts, forms, most meta tags and some id attributes,
        then written to ``collected/<row id>.html``. When ``content_updated``
        reports a change, the package's ``updated`` value is refreshed and the
        descriptor is written back to the data file. HTTP errors are printed
        and the loop moves on to the next row.
        """
        project_root = pathlib.Path(__file__).absolute().parent.parent
        data_file = os.path.join(project_root, "data", "datapackage.json")
        package = Package(data_file)
        pages = package.get_resource("content-pages")
        for row in pages.read_rows():
            try:
                response = requests.get(row["url"])
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")

                # Turn relative stylesheet hrefs into absolute URLs
                for stylesheet in soup.find_all("link", attrs={"rel": "stylesheet"}):
                    stylesheet["href"] = urljoin(row["url"], stylesheet["href"])

                # Strip scripts first, then forms
                for tag_name in ("script", "form"):
                    for node in soup.find_all(tag_name):
                        node.decompose()

                # Strip meta tags, keeping only those that declare the encoding
                for meta in soup.find_all("meta"):
                    try:
                        keep = (
                            meta is None
                            or meta.get("charset") is not None
                            or meta.get("content") == "text/html; charset=utf-8"
                        )
                        if not keep:
                            meta.decompose()
                    except Exception as e:
                        print(e)

                # The gov.uk page this was tested on carried aria-labelledby
                # attributes pointing at dynamically generated ids; both the
                # attribute and the referenced ids change between fetches and
                # are dropped so they do not show up in diffs
                dynamic_ids = []
                for tag in soup():
                    labelled_by = tag.attrs.get("aria-labelledby")
                    if labelled_by:
                        dynamic_ids.append(labelled_by)
                        del tag.attrs["aria-labelledby"]
                for dynamic_id in dynamic_ids:
                    for element in soup.find_all(id=dynamic_id):
                        del element["id"]

                for attachment in soup.find_all(class_="attachment embedded"):
                    del attachment["id"]

                html = soup.prettify()
                out = os.path.join(project_root, "collected", f"{row['id']}.html")
                # Compare against the stored copy before it is overwritten
                updated = content_updated(out, html)
                with open(out, "w") as html_file:
                    html_file.write(html)
                if updated:
                    package.update(updated=datetime.now().isoformat())
                    package.to_json(data_file)

            except requests.HTTPError as e:
                print(f"Error getting {row['url']}")
                print(e)
コード例 #13
0
def get_dp_field_to_title(datapackage_json_fp):
    """Return a dict mapping each 'ids' schema field name to its title."""
    package = Package(datapackage_json_fp, profile='tabular-data-package')
    ids_resource = package.get_resource('ids')

    id_field_to_title = {}
    for field in ids_resource['schema']['fields']:
        name = field['name']
        title = field['title']
        id_field_to_title[name] = title

    return id_field_to_title
コード例 #14
0
def test_sql_storage_sqlite_constraints_not_valid_error(sqlite_url, field_name, cell):
    """Writing a row with an invalid cell raises an IntegrityError."""
    pkg = Package("data/storage/constraints.json")
    constraints = pkg.get_resource("constraints")

    # Plant the invalid cell at row index 1 of the inline data, in the
    # column of every schema field called `field_name`
    for column, schema_field in enumerate(constraints.schema.fields):
        if schema_field.name != field_name:
            continue
        constraints.data[1][column] = cell

    # NOTE: should we wrap these exceptions?
    with pytest.raises(sa.exc.IntegrityError):
        constraints.write(sqlite_url, dialect={"table": "table"})
コード例 #15
0
ファイル: test_package.py プロジェクト: roll/frictionless-py
def test_package_schema_foreign_key_self_reference():
    """Rows stay valid when `parent_id` references `id` via an empty resource name."""
    self_reference = {
        "fields": "parent_id",
        "reference": {"resource": "", "fields": "id"},
    }
    pkg = Package(DESCRIPTOR_FK)
    pkg.resources[0].schema.foreign_keys = [self_reference]

    rows = pkg.get_resource("main").read_rows()
    for position in range(3):
        assert rows[position].valid
コード例 #16
0
def test_postgresql_storage_constraints_not_valid_error(
        database_url, field_name, cell):
    """Exporting a row with an invalid cell to SQL raises an integrity or data error."""
    # The engine is built from the POSTGRESQL_URL environment variable
    pg_engine = sa.create_engine(os.environ["POSTGRESQL_URL"])
    pkg = Package("data/storage/constraints.json")
    constraints = pkg.get_resource("constraints")

    # Plant the invalid cell at row index 1 of the inline data, in the
    # column of every schema field called `field_name`
    for column, schema_field in enumerate(constraints.schema.fields):
        if schema_field.name != field_name:
            continue
        constraints.data[1][column] = cell

    expected_errors = (sa.exc.IntegrityError, sa.exc.DataError)
    with pytest.raises(expected_errors):
        constraints.to_sql(engine=pg_engine, force=True)
コード例 #17
0
def test_storage_constraints_not_valid_error(database_url, field_name, cell):
    """Exporting a row with an invalid cell to SQL raises an IntegrityError."""
    db_engine = sa.create_engine(database_url)
    pkg = Package("data/storage/constraints.json")
    constraints = pkg.get_resource("constraints")

    # Plant the invalid cell at row index 1 of the inline data, in the
    # column of every schema field called `field_name`
    for column, schema_field in enumerate(constraints.schema.fields):
        if schema_field.name != field_name:
            continue
        constraints.data[1][column] = cell

    # NOTE: should we wrap these exceptions?
    with pytest.raises(sa.exc.IntegrityError):
        constraints.to_sql(engine=db_engine, force=True)
コード例 #18
0
def get_dp_field_to_url_format_str(datapackage_json_fp):
    """Return a dict mapping 'ids' schema field names to their `url_format`.

    Fields without a `url_format` key are left out.
    """
    package = Package(datapackage_json_fp, profile='tabular-data-package')
    ids_resource = package.get_resource('ids')

    id_field_to_url_format_str = {}
    for field in ids_resource['schema']['fields']:
        if 'url_format' not in field.keys():
            continue
        name = field['name']
        url_format = field['url_format']
        id_field_to_url_format_str[name] = url_format

    return id_field_to_url_format_str
コード例 #19
0
def test_package_integrity_foreign_key_self_reference_invalid():
    """With a self-referencing key and an altered id cell, row 2 reports a foreign-key error."""
    self_reference = {
        "fields": "parent_id",
        "reference": {"resource": "", "fields": "id"},
    }
    pkg = Package(DESCRIPTOR_FK)
    # Alter one id cell of the first resource before installing the key
    pkg.resources[0].data[2][0] = "0"
    pkg.resources[0].schema.foreign_keys = [self_reference]

    rows = pkg.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].errors[0].code == "foreign-key-error"
コード例 #20
0
ファイル: test_package.py プロジェクト: roll/frictionless-py
def test_package_resources_inline():
    """A resource declared with inline data has no path and yields typed rows."""
    data = [["id", "name"], ["1", "english"], ["2", "中国人"]]
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    pkg = Package({"resources": [{"name": "name", "data": data}]})
    inline = pkg.get_resource("name")

    assert len(pkg.resources) == 1
    assert inline.path is None
    assert inline.data == data
    assert inline.fullpath is None
    assert inline.read_rows() == expected_rows
コード例 #21
0
def test_package_to_zip_resolve_inline_and_remote(tmpdir):
    """With resolve=["inline", "remote"], both resources are stored as <name>.csv."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write: one inline resource and one whose path is built from BASE_URL
    inline = Resource(name="name1", data=[["id", "name"], [1, "english"], [2, "中国人"]])
    remote = Resource(name="name2", path=BASE_URL % "data/table.csv")
    source = Package(resources=[inline, remote])
    source.to_zip(zip_path, resolve=["inline", "remote"])

    # Read: each resource now points at a csv file named after it
    loaded = Package(zip_path)
    for resource_name in ("name1", "name2"):
        assert loaded.get_resource(resource_name).path == f"{resource_name}.csv"
        assert loaded.get_resource(resource_name).read_rows() == expected_rows
コード例 #22
0
def test_package_to_zip_resolve_remote(tmpdir):
    """With resolve=["remote"], the BASE_URL resource is stored as "table.csv"."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write
    remote = Resource(path=BASE_URL % "data/table.csv")
    source = Package(resources=[remote])
    source.to_zip(zip_path, resolve=["remote"])

    # Read
    loaded = Package(zip_path)
    assert loaded.get_resource("table").path == "table.csv"
    assert loaded.get_resource("table").read_rows() == expected_rows
コード例 #23
0
def test_package_to_zip_resolve_inline_sql(tmpdir, database_url):
    """With resolve=["inline"], a resource built by `Resource.from_sql` is stored as "table.csv"."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write
    sql_resource = Resource.from_sql(name="table", url=database_url)
    source = Package(resources=[sql_resource])
    source.to_zip(zip_path, resolve=["inline"])

    # Read
    loaded = Package(zip_path)
    assert loaded.get_resource("table").path == "table.csv"
    assert loaded.get_resource("table").read_rows() == expected_rows
コード例 #24
0
def test_package_to_zip_absolute_path(tmpdir):
    """A trusted resource given by absolute path is stored in the zip as "table.csv"."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write: both the resource and the package are created with trusted=True
    absolute = Resource(path=os.path.abspath("data/table.csv"), trusted=True)
    source = Package(resources=[absolute], trusted=True)
    source.to_zip(zip_path)

    # Read
    loaded = Package(zip_path)
    assert loaded.get_resource("table").path == "table.csv"
    assert loaded.get_resource("table").read_rows() == expected_rows
コード例 #25
0
def test_package_to_zip_withdir_path(tmpdir):
    """A relative path containing a directory is kept as "data/table.csv" after the round trip."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write
    nested = Resource(path="data/table.csv")
    source = Package(resources=[nested])
    source.to_zip(zip_path)

    # Read
    loaded = Package.from_zip(zip_path)
    assert loaded.get_resource("table").path == "data/table.csv"
    assert loaded.get_resource("table").read_rows() == expected_rows
コード例 #26
0
def test_package_to_zip_resolve_memory_sql(tmpdir, database_url):
    """With resolve=["memory"], a database-backed resource is stored as "table.csv"."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write
    sql_resource = Resource(database_url, dialect={"table": "table"})
    source = Package(resources=[sql_resource])
    source.to_zip(zip_path, resolve=["memory"])

    # Read
    loaded = Package.from_zip(zip_path)
    assert loaded.get_resource("table").path == "table.csv"
    assert loaded.get_resource("table").read_rows() == expected_rows
コード例 #27
0
ファイル: test_package.py プロジェクト: roll/frictionless-py
def test_package_schema_foreign_key_multifield():
    """Rows stay valid with a two-field foreign key into the "people" resource."""
    multifield_key = {
        "fields": ["name", "surname"],
        "reference": {"resource": "people", "fields": ["firstname", "surname"]},
    }
    pkg = Package(DESCRIPTOR_FK)
    pkg.resources[0].schema.foreign_keys = [multifield_key]

    rows = pkg.get_resource("main").read_rows()
    for position in range(3):
        assert rows[position].valid
コード例 #28
0
def test_package_to_zip_resolve_memory(tmpdir):
    """With resolve=["memory"], an inline-data resource is stored as "table.csv"."""
    zip_path = os.path.join(tmpdir, "package.zip")
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

    # Write
    in_memory = Resource(
        name="table",
        data=[["id", "name"], [1, "english"], [2, "中国人"]],
    )
    source = Package(resources=[in_memory])
    source.to_zip(zip_path, resolve=["memory"])

    # Read
    loaded = Package.from_zip(zip_path)
    assert loaded.get_resource("table").path == "table.csv"
    assert loaded.get_resource("table").read_rows() == expected_rows
コード例 #29
0
def test_package_integrity_foreign_key_multifield_invalid():
    """With a two-field key and a corrupted referenced cell, row 2 reports a foreign-key error."""
    multifield_key = {
        "fields": ["name", "surname"],
        "reference": {"resource": "people", "fields": ["firstname", "surname"]},
    }
    pkg = Package(DESCRIPTOR_FK)
    pkg.resources[0].schema.foreign_keys = [multifield_key]
    # Overwrite one cell of the second resource's inline data
    pkg.resources[1].data[3][0] = "bad"

    rows = pkg.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].errors[0].code == "foreign-key-error"
コード例 #30
0
def construct_dictionary_knowledge_graph(datapackage_fp, temp_dir_loc, resource_name='ids'):
    """Build the site data structure for one resource of a tabular data package.

    The resource's schema is used to derive the field hierarchies and the root
    field. The resource dataframe is indexed on that root field, seeded into
    the site data, and then enriched with attributes from the external
    datapackage references, which are downloaded to ``temp_dir_loc``. The
    result is passed through ``json_nan_to_none`` before being returned.
    """
    package = Package(datapackage_fp, profile='tabular-data-package')
    ids_resource = package.get_resource(resource_name)

    # Work out the root field and its declared schema type
    hierarchies = get_field_name_tags(ids_resource.schema)
    root_field = field_hierarchies_to_root(hierarchies)
    matching_types = [
        field['type']
        for field in ids_resource.schema['fields']
        if field['name'] == root_field
    ]
    root_field_type = matching_types[0]

    # Seed the site data from the indexed ids dataframe
    ids_frame = ids_resource.to_pandas()
    df_ids = assign_idx_fields(ids_frame, root_field, root_field_type)
    site_data = initialise_site_data_with_ids(df_ids, hierarchies)

    # Resolve the external foreign-key references and fetch their data
    external_refs = extract_external_foreignkey_datapackage_refs(ids_resource, primary_key_field=root_field)
    external_refs = add_resource_locs_to_external_datapackage_refs(external_refs)
    download_attribute_data_to_temp_dir(external_refs, temp_dir_loc=temp_dir_loc)

    enriched = extract_attrs_from_resource_dfs(site_data, external_refs, temp_dir_loc)
    return json_nan_to_none(enriched)