def test_package_to_zip_source_remote(tmpdir):
    """A package pointing at a remote path survives a to_zip round trip."""
    # Write
    path = BASE_URL % "data/table.csv"
    target = os.path.join(tmpdir, "datapackage.zip")
    Package(name="name", resources=[{"name": "name", "path": path}]).to_zip(target)
    # Read
    package = Package(target)
    assert package == {
        "name": "name",
        "resources": [{"name": "name", "path": path}],
    }
    assert package.get_resource("name").read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_to_zip_source_inline(tmpdir):
    # Write (original comment said "Read" — this section builds and zips the package)
    target = os.path.join(tmpdir, "datapackage.zip")
    data = [["id", "name"], ["1", "english"], ["2", "中国人"]]
    package = Package(name="name", resources=[{"name": "name", "data": data}])
    package.to_zip(target)
    # Read (original comment said "Write" — this section loads the zip back and checks it)
    package = Package(target)
    assert package == {
        "name": "name",
        "resources": [{
            "name": "name",
            "data": data
        }],
    }
    assert package.get_resource("name").read_rows() == [
        {
            "id": 1,
            "name": "english"
        },
        {
            "id": 2,
            "name": "中国人"
        },
    ]
def test_package_compression_explicit_zip():
    """Rows are readable from a resource declaring explicit zip compression."""
    resource = Package("data/compression/datapackage.json").get_resource("explicit-zip")
    expected = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    assert resource.read_rows() == expected
def test_package_to_zip(tmpdir):
    """A descriptor-based package survives a to_zip round trip."""
    # Write
    target = os.path.join(tmpdir, "datapackage.zip")
    Package("data/package.json").to_zip(target)
    # Read
    package = Package(target)
    assert package == {
        "name": "name",
        "resources": [{"name": "name", "path": "table.csv"}],
    }
    assert package.get_resource("name").read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_integrity_foreign_key_invalid():
    """Corrupting the referenced resource produces a foreign-key-error on row 3."""
    package = Package(DESCRIPTOR_FK)
    package.resources[1].data[3][0] = "bad"  # break the value row 3 refers to
    rows = package.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].errors[0].code == "foreign-key-error"
    assert rows[0].to_dict() == {"id": "1", "name": "Alex", "surname": "Martin", "parent_id": None}
    assert rows[1].to_dict() == {"id": "2", "name": "John", "surname": "Dockins", "parent_id": "1"}
    assert rows[2].to_dict() == {"id": "3", "name": "Walter", "surname": "White", "parent_id": "2"}
def test_package_integrity_foreign_key():
    """All rows validate when the foreign-key data is intact."""
    package = Package(DESCRIPTOR_FK)
    rows = package.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].valid
    assert rows[0].to_dict() == {"id": "1", "name": "Alex", "surname": "Martin", "parent_id": None}
    assert rows[1].to_dict() == {"id": "2", "name": "John", "surname": "Dockins", "parent_id": "1"}
    assert rows[2].to_dict() == {"id": "3", "name": "Walter", "surname": "White", "parent_id": "2"}
def load_datapackage(datapackage_ref, temp_dir_loc='./temp', return_type='df', set_index=True):
    """Load an external datapackage and return it in the requested form.

    Parameters
    ----------
    datapackage_ref : dict with 'package', 'resource' and 'external_fk_field'
        keys (optionally 'alt_indexes') describing the external datapackage.
    temp_dir_loc : retained for interface compatibility (unused here).
    return_type : one of "df", "resource" or "package".
    set_index : when returning a dataframe, index it by the external FK field.

    Raises
    ------
    ValueError : if `return_type` is not one of the supported values.
    """
    datapackage_resource = datapackage_ref['resource']
    external_datapackage = Package(datapackage_ref['package'])
    resource = external_datapackage.get_resource(datapackage_resource)

    if return_type == 'package':
        return external_datapackage
    if return_type == 'resource':
        return resource
    if return_type == 'df':
        df_resource = resource.to_pandas()
        if set_index:
            fk_field = datapackage_ref['external_fk_field']
            # BUG FIX: the original evaluated len(fk_field == 1), taking the
            # length of a bool (TypeError); the intent is len(fk_field) == 1.
            assert isinstance(fk_field, str) or len(fk_field) == 1, 'Only one primary key was expected to be matched on in the external datapackage'
            field_types = [field['type'] for field in resource.schema['fields'] if field['name'] == fk_field]
            field_names = [field['name'] for field in resource.schema['fields']]
            assert len(field_types) == 1, f'Expected only one field type, instead received: {", ".join(field_types)} for {datapackage_ref["external_fk_field"]}, {field_names}'
            field_type = field_types[0]
            # dict.get returns None when 'alt_indexes' is absent, matching
            # the original if/else branch.
            alt_indexes = datapackage_ref.get('alt_indexes')
            df_resource = assign_idx_fields(df_resource, fk_field, field_type, alt_indexes)
        return df_resource
    # BUG FIX: the original message had empty backticks; name the parameter.
    # (The unreachable `return resource` after the raise was removed.)
    raise ValueError('`return_type` must be one of ["df", "resource", "package"]')
def test_package_integrity_lookup():
    """Opening the main resource populates its foreign-key lookup table."""
    package = Package(DESCRIPTOR_FK)
    expected = {"people": {("firstname",): {("Walter",), ("Alex",), ("John",)}}}
    with package.get_resource("main") as resource:
        assert resource.lookup == expected
def test_sql_storage_postgresql_constraints_not_valid_error(postgresql_url, name, cell):
    """Writing a row that violates a constraint raises an integrity/data error."""
    package = Package("data/storage/constraints.json")
    resource = package.get_resource("constraints")
    # Inject the invalid cell into every column whose field name matches
    matches = [i for i, field in enumerate(resource.schema.fields) if field.name == name]
    for i in matches:
        resource.data[1][i] = cell
    with pytest.raises((sa.exc.IntegrityError, sa.exc.DataError)):
        resource.write(postgresql_url, dialect={"table": "table"})
def test_package_integrity_read_lookup():
    """read_lookup() returns the foreign-key lookup table for the main resource."""
    package = Package(DESCRIPTOR_FK)
    lookup = package.get_resource("main").read_lookup()
    expected = {"people": {("firstname",): {("Walter",), ("Alex",), ("John",)}}}
    assert lookup == expected
def test_package_from_zip_remote():
    """A remote zip package exposes its name, resources, and rows."""
    package = Package(BASEURL % "data/package.zip")
    assert package.name == "testing"
    assert len(package.resources) == 2
    rows = package.get_resource("data2").read_rows()
    assert rows == [
        {"parent": "A3001", "comment": "comment1"},
        {"parent": "A3001", "comment": "comment2"},
        {"parent": "A5032", "comment": "comment3"},
    ]
def run():
    """Snapshot every page in the "content-pages" resource as cleaned HTML.

    Each page is fetched, sanitized (scripts/forms removed, relative CSS
    hrefs made absolute, meta tags and volatile ids stripped) and written
    to collected/<id>.html.  The package's "updated" timestamp is refreshed
    whenever any page's content changed.  HTTP failures are logged and the
    loop continues with the next page.
    """
    base_dir = pathlib.Path(__file__).absolute().parent.parent
    data_file = os.path.join(base_dir, "data", "datapackage.json")
    package = Package(data_file)
    resource = package.get_resource("content-pages")
    for row in resource.read_rows():
        try:
            resp = requests.get(row["url"])
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            # fixup relative css hrefs to be absolute
            for link in soup.find_all("link", attrs={"rel": "stylesheet"}):
                link["href"] = urljoin(row["url"], link["href"])
            # remove js and forms
            for remove in ["script", "form"]:
                for item in soup.find_all(remove):
                    item.decompose()
            # remove meta tags apart from encoding
            for m in soup.find_all("meta"):
                try:
                    if (m is not None
                            and m.get("charset") is None
                            and m.get("content") != "text/html; charset=utf-8"):
                        m.decompose()
                except Exception as e:
                    print(e)
            # gov.uk page that was tested had a specific aria attr
            # with dynamic id attached which changes and can be ignored for diffs
            labelled_ids = []
            for tag in soup():
                if tag.attrs.get("aria-labelledby"):
                    labelled_ids.append(tag.attrs.get("aria-labelledby"))
                    del tag.attrs["aria-labelledby"]
            # BUG FIX: renamed loop variable `id` -> `labelled_id`;
            # the original shadowed the builtin `id`.
            for labelled_id in labelled_ids:
                for element in soup.find_all(id=labelled_id):
                    del element["id"]
            for tag in soup.find_all(class_="attachment embedded"):
                del tag["id"]
            html = soup.prettify()
            out = os.path.join(base_dir, "collected", f"{row['id']}.html")
            updated = content_updated(out, html)
            # BUG FIX: write with an explicit encoding so non-ASCII pages
            # don't depend on the platform default encoding.
            with open(out, "w", encoding="utf-8") as html_file:
                html_file.write(html)
            if updated:
                package.update(updated=datetime.now().isoformat())
                package.to_json(data_file)
        except requests.HTTPError as e:
            print(f"Error getting {row['url']}")
            print(e)
def get_dp_field_to_title(datapackage_json_fp):
    """Return a mapping from each 'ids' field name to its title."""
    package = Package(datapackage_json_fp, profile='tabular-data-package')
    fields = package.get_resource('ids')['schema']['fields']
    return {field['name']: field['title'] for field in fields}
def test_sql_storage_sqlite_constraints_not_valid_error(sqlite_url, field_name, cell):
    """Writing a constraint-violating row to sqlite raises IntegrityError."""
    package = Package("data/storage/constraints.json")
    resource = package.get_resource("constraints")
    # Overwrite the cell in the column matching `field_name` with the bad value
    for index, field in enumerate(resource.schema.fields):
        if field.name != field_name:
            continue
        resource.data[1][index] = cell
    # NOTE: should we wrap these exceptions?
    with pytest.raises(sa.exc.IntegrityError):
        resource.write(sqlite_url, dialect={"table": "table"})
def test_package_schema_foreign_key_self_reference():
    """A self-referencing foreign key (empty resource name) validates intact data."""
    package = Package(DESCRIPTOR_FK)
    package.resources[0].schema.foreign_keys = [
        {"fields": "parent_id", "reference": {"resource": "", "fields": "id"}}
    ]
    rows = package.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].valid
def test_postgresql_storage_constraints_not_valid_error(
        database_url, field_name, cell):
    """Pushing a constraint-violating row to Postgres raises an error.

    BUG FIX: the original accepted a `database_url` fixture but ignored it,
    reading POSTGRESQL_URL from the environment instead; use the fixture,
    consistent with the sibling storage tests in this file.
    """
    engine = sa.create_engine(database_url)
    package = Package("data/storage/constraints.json")
    resource = package.get_resource("constraints")
    # We set an invalid cell to the data property
    for index, field in enumerate(resource.schema.fields):
        if field.name == field_name:
            resource.data[1][index] = cell
    with pytest.raises((sa.exc.IntegrityError, sa.exc.DataError)):
        resource.to_sql(engine=engine, force=True)
def test_storage_constraints_not_valid_error(database_url, field_name, cell):
    """Writing a constraint-violating row via to_sql raises IntegrityError."""
    engine = sa.create_engine(database_url)
    package = Package("data/storage/constraints.json")
    resource = package.get_resource("constraints")
    # Replace the cell in the column named `field_name` with the invalid value
    for index, field in enumerate(resource.schema.fields):
        if field.name != field_name:
            continue
        resource.data[1][index] = cell
    # NOTE: should we wrap these exceptions?
    with pytest.raises(sa.exc.IntegrityError):
        resource.to_sql(engine=engine, force=True)
def get_dp_field_to_url_format_str(datapackage_json_fp):
    """Map 'ids' field names to their `url_format`, skipping fields without one."""
    package = Package(datapackage_json_fp, profile='tabular-data-package')
    fields = package.get_resource('ids')['schema']['fields']
    return {
        field['name']: field['url_format']
        for field in fields
        if 'url_format' in field
    }
def test_package_integrity_foreign_key_self_reference_invalid():
    """Breaking a self-reference marks only the affected row invalid."""
    package = Package(DESCRIPTOR_FK)
    package.resources[0].data[2][0] = "0"  # make row 3's parent_id dangle
    package.resources[0].schema.foreign_keys = [
        {"fields": "parent_id", "reference": {"resource": "", "fields": "id"}}
    ]
    rows = package.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].errors[0].code == "foreign-key-error"
def test_package_resources_inline():
    """Inline-data resources have no path/fullpath and read back typed rows."""
    data = [["id", "name"], ["1", "english"], ["2", "中国人"]]
    package = Package({"resources": [{"name": "name", "data": data}]})
    res = package.get_resource("name")
    assert len(package.resources) == 1
    assert res.path is None
    assert res.data == data
    assert res.fullpath is None
    assert res.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_to_zip_resolve_inline_and_remote(tmpdir):
    """Resolving inline and remote resources materializes both as CSV files."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    inline = Resource(name="name1", data=[["id", "name"], [1, "english"], [2, "中国人"]])
    remote = Resource(name="name2", path=BASE_URL % "data/table.csv")
    Package(resources=[inline, remote]).to_zip(target, resolve=["inline", "remote"])
    # Read
    package = Package(target)
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    for name, path in (("name1", "name1.csv"), ("name2", "name2.csv")):
        resource = package.get_resource(name)
        assert resource.path == path
        assert resource.read_rows() == expected_rows
def test_package_to_zip_resolve_remote(tmpdir):
    """Resolving a remote resource stores it as a local CSV in the zip."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    package = Package(resources=[Resource(path=BASE_URL % "data/table.csv")])
    package.to_zip(target, resolve=["remote"])
    # Read
    resource = Package(target).get_resource("table")
    assert resource.path == "table.csv"
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_to_zip_resolve_inline_sql(tmpdir, database_url):
    """Resolving an inline SQL-sourced resource stores it as CSV in the zip."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    package = Package(resources=[Resource.from_sql(name="table", url=database_url)])
    package.to_zip(target, resolve=["inline"])
    # Read
    resource = Package(target).get_resource("table")
    assert resource.path == "table.csv"
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_to_zip_absolute_path(tmpdir):
    """An absolute resource path is rewritten to a relative one in the zip."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    source = Resource(path=os.path.abspath("data/table.csv"), trusted=True)
    Package(resources=[source], trusted=True).to_zip(target)
    # Read
    resource = Package(target).get_resource("table")
    assert resource.path == "table.csv"
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_to_zip_withdir_path(tmpdir):
    """A relative path containing a directory component is kept in the zip."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    Package(resources=[Resource(path="data/table.csv")]).to_zip(target)
    # Read
    resource = Package.from_zip(target).get_resource("table")
    assert resource.path == "data/table.csv"
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_to_zip_resolve_memory_sql(tmpdir, database_url):
    """Resolving a SQL-backed memory resource writes it out as CSV."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    source = Resource(database_url, dialect={"table": "table"})
    Package(resources=[source]).to_zip(target, resolve=["memory"])
    # Read
    resource = Package.from_zip(target).get_resource("table")
    assert resource.path == "table.csv"
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_schema_foreign_key_multifield():
    """A composite (multi-field) foreign key validates intact data."""
    package = Package(DESCRIPTOR_FK)
    package.resources[0].schema.foreign_keys = [
        {
            "fields": ["name", "surname"],
            "reference": {"resource": "people", "fields": ["firstname", "surname"]},
        }
    ]
    rows = package.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].valid
def test_package_to_zip_resolve_memory(tmpdir):
    """Resolving an in-memory resource materializes its data as a CSV."""
    # Write
    target = os.path.join(tmpdir, "package.zip")
    source = Resource(name="table", data=[["id", "name"], [1, "english"], [2, "中国人"]])
    Package(resources=[source]).to_zip(target, resolve=["memory"])
    # Read
    resource = Package.from_zip(target).get_resource("table")
    assert resource.path == "table.csv"
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_package_integrity_foreign_key_multifield_invalid():
    """Corrupting a referenced value fails composite foreign-key validation."""
    package = Package(DESCRIPTOR_FK)
    package.resources[0].schema.foreign_keys = [
        {
            "fields": ["name", "surname"],
            "reference": {"resource": "people", "fields": ["firstname", "surname"]},
        }
    ]
    package.resources[1].data[3][0] = "bad"  # break the referenced value
    rows = package.get_resource("main").read_rows()
    assert rows[0].valid
    assert rows[1].valid
    assert rows[2].errors[0].code == "foreign-key-error"
def construct_dictionary_knowledge_graph(datapackage_fp, temp_dir_loc, resource_name='ids'):
    """Assemble per-site data by joining the id resource with the external
    datapackages referenced through its foreign keys.

    Parameters
    ----------
    datapackage_fp : path to the tabular datapackage descriptor
    temp_dir_loc : directory where external attribute data is downloaded
    resource_name : name of the id resource inside the package (default 'ids')
    """
    pkg = Package(datapackage_fp, profile='tabular-data-package')
    ids = pkg.get_resource(resource_name)

    # Derive the field hierarchy and its root (primary-key) field and type
    hierarchies = get_field_name_tags(ids.schema)
    root = field_hierarchies_to_root(hierarchies)
    root_type = [f['type'] for f in ids.schema['fields'] if f['name'] == root][0]

    df_ids = assign_idx_fields(ids.to_pandas(), root, root_type)
    site_data = initialise_site_data_with_ids(df_ids, hierarchies)

    # Pull in the attributes referenced through external foreign keys
    ext_refs = extract_external_foreignkey_datapackage_refs(ids, primary_key_field=root)
    ext_refs = add_resource_locs_to_external_datapackage_refs(ext_refs)
    download_attribute_data_to_temp_dir(ext_refs, temp_dir_loc=temp_dir_loc)
    site_data = extract_attrs_from_resource_dfs(site_data, ext_refs, temp_dir_loc)

    return json_nan_to_none(site_data)