def run():
    """Collect, sanitise, and archive the HTML of each content page.

    Reads the ``content-pages`` resource from the data package descriptor,
    fetches every row's URL, strips volatile/dynamic markup so that saved
    snapshots diff cleanly, writes the result to ``collected/<id>.html``,
    and bumps the package's ``updated`` timestamp when any content changed.

    Network or HTTP failures for one row are logged and skipped so a single
    bad page does not abort the whole collection run.
    """
    base_dir = pathlib.Path(__file__).absolute().parent.parent
    data_file = os.path.join(base_dir, "data", "datapackage.json")
    package = Package(data_file)
    resource = package.get_resource("content-pages")
    for row in resource.read_rows():
        try:
            # Timeout keeps one unresponsive host from hanging the run.
            resp = requests.get(row["url"], timeout=30)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # Fix up relative CSS hrefs to be absolute so the saved
            # snapshot still renders outside its original site.
            for link in soup.find_all("link", attrs={"rel": "stylesheet"}):
                link["href"] = urljoin(row["url"], link["href"])

            # Remove js and forms.
            for tag_name in ("script", "form"):
                for item in soup.find_all(tag_name):
                    item.decompose()

            # Remove meta tags apart from the encoding declaration.
            # find_all never yields None, so no per-tag guard is needed.
            for meta in soup.find_all("meta"):
                if (meta.get("charset") is None
                        and meta.get("content") != "text/html; charset=utf-8"):
                    meta.decompose()

            # gov.uk pages carry aria-labelledby attributes pointing at
            # dynamically generated ids that change on every fetch; drop
            # both sides of the link so diffs ignore them.
            labelled_ids = []
            for tag in soup():
                if tag.attrs.get("aria-labelledby"):
                    labelled_ids.append(tag.attrs.get("aria-labelledby"))
                    del tag.attrs["aria-labelledby"]
            for labelled_id in labelled_ids:
                for element in soup.find_all(id=labelled_id):
                    del element["id"]
            for tag in soup.find_all(class_="attachment embedded"):
                del tag["id"]

            html = soup.prettify()
            out = os.path.join(base_dir, "collected", f"{row['id']}.html")
            # Compare against the previously collected copy before we
            # overwrite it, so we know whether anything actually changed.
            updated = content_updated(out, html)
            with open(out, "w") as html_file:
                html_file.write(html)
            if updated:
                package.update(updated=datetime.now().isoformat())
                package.to_json(data_file)
        except requests.RequestException as e:
            # RequestException covers HTTPError as well as connection and
            # timeout failures — skip this row and keep collecting.
            print(f"Error getting {row['url']}")
            print(e)
def test_package_to_json(tmpdir):
    """Round-trip: a package written with to_json equals its JSON on disk."""
    output_path = os.path.join(tmpdir, "package.json")
    source_package = Package("data/package.json")
    source_package.to_json(output_path)
    with open(output_path, encoding="utf-8") as descriptor_file:
        parsed = json.load(descriptor_file)
    assert source_package == parsed
class OutputDataPackage(OutputBase): """Output a tabular data package (https://specs.frictionlessdata.io/data-package/). """ def __init__(self, inputs, schema, output_folder='_site', translations=None, indicator_options=None, data_schema=None, package_properties=None, resource_properties=None, field_properties=None, sorting='alphabetical', logging=None): """Constructor for OutputDataPackage. Parameters ---------- Inherits all the parameters from OutputBase, plus the following: data_schema : DataSchemaInputBase An optional subclass of DataSchemaInputBase package_properties : dict Common properties to add to all the data packages. Note that this can also be set per-indicator in a "datapackage" metadata property. resource_properties : dict Common properties to add to the resource in all data packages. Note that this can also be set per-indicator in a "datapackage" metadata property. field_properties : dict of dicts Properties to add to specific fields, keyed by field name. Note that this can also be set per-indicator in a "datapackage" metadata property. sorting : string Strategy to use when sorting the columns and values. 
Options are: - alphabetical: Sort columns/values in alphabetical order - default: Use the default sorting, either from the data_schema or from the position of columns/values in the source data """ OutputBase.__init__(self, inputs, schema, output_folder=output_folder, translations=translations, indicator_options=indicator_options, logging=logging) self.top_level_package = None self.data_schema = data_schema if package_properties is None: package_properties = {} if resource_properties is None: resource_properties = {} if field_properties is None: field_properties = {} self.package_properties = package_properties self.resource_properties = resource_properties self.field_properties = field_properties self.sorting = sorting self.per_indicator_metadata_field = 'datapackage' def get_base_folder(self): return 'data-packages' def build(self, language=None): """Write the JSON output expected by Open SDG. Overrides parent.""" status = True self.create_top_level_package('all', 'All indicators') backup_data_schema = DataSchemaInputIndicator(source=self.indicators) data_schema_not_specified = False if self.data_schema is None: data_schema_not_specified = True self.data_schema = backup_data_schema for indicator_id in self.get_indicator_ids(): # Make sure the folder exists. package_folder = os.path.join(self.output_folder, self.get_base_folder(), indicator_id) Path(package_folder).mkdir(parents=True, exist_ok=True) indicator = self.get_indicator_by_id(indicator_id).language( language) backup_schema_was_used = False data_schema_for_indicator = self.data_schema.get_schema_for_indicator( indicator) if data_schema_for_indicator is None: backup_schema_was_used = True data_schema_for_indicator = backup_data_schema.get_schema_for_indicator( indicator) # Clone the schema so that it can be sorted and translated. data_schema_for_indicator = Schema(dict(data_schema_for_indicator)) # We only sort the schema if it was not specified or if # the backup schema was used. 
if data_schema_not_specified or backup_schema_was_used: self.sort_data_schema(data_schema_for_indicator) if language is not None: self.translate_data_schema(data_schema_for_indicator, language) # Write the data. data_path = os.path.join(package_folder, 'data.csv') self.write_data(indicator.data, data_path) # Write the descriptor. descriptor_path = os.path.join(package_folder, 'datapackage.json') indicator_package = self.create_indicator_package( data_schema_for_indicator, data_path, indicator_id, indicator) self.write_indicator_package(indicator_package, descriptor_path, indicator, language=language) # Add to the top level package. top_level_data_path = indicator_id + '/data.csv' top_level_resource = self.create_resource( data_schema_for_indicator, data_path, indicator_id, indicator, top_level_data_path) self.add_to_top_level_package(top_level_resource) top_level_descriptor_path = os.path.join(self.output_folder, self.get_base_folder(), 'all.json') self.write_top_level_package(top_level_descriptor_path, language=language) return status def create_top_level_package(self, name, title): self.top_level_package = Package(self.package_properties) self.top_level_package.name = name self.top_level_package.title = title def write_top_level_package(self, path, language=None): self.top_level_package.to_json(path) # Workaround for https://github.com/frictionlessdata/frictionless-py/issues/788 os.chmod(path, 0o644) def add_to_top_level_package(self, resource): self.top_level_package.add_resource(resource) def write_data(self, df, path): df.to_csv(path, index=False) def apply_package_properties(self, package, indicator): for key in self.package_properties: package[key] = self.package_properties[key] indicator_props = self.get_properties_per_indicator( indicator, 'package_properties') for key in indicator_props: package[key] = indicator_props[key] def apply_resource_properties(self, resource, indicator): for key in self.resource_properties: resource[key] = 
self.resource_properties[key] indicator_props = self.get_properties_per_indicator( indicator, 'resource_properties') for key in indicator_props: resource[key] = indicator_props[key] def apply_field_properties(self, resource, indicator): for field_name in self.field_properties: field = resource.get_field(field_name) if field is not None: for key in self.field_properties[field_name]: field[key] = self.field_properties[field_name][key] indicator_props = self.get_properties_per_indicator( indicator, 'field_properties') for field_name in indicator_props: field = resource.get_field(field_name) if field is not None: for key in indicator_props[field_name]: field[key] = indicator_props[field_name][key] def create_resource(self, schema, data_path, indicator_id, indicator, data_path_override=None): resource = describe_resource(data_path) self.apply_resource_properties(resource, indicator) resource.schema = schema resource.path = data_path_override if data_path_override is not None else data_path resource.name = indicator_id resource.title = indicator.get_name() self.apply_field_properties(resource, indicator) return resource def create_indicator_package(self, schema, data_path, indicator_id, indicator): package = describe_package(data_path) self.apply_package_properties(package, indicator) package.name = indicator_id package.title = indicator.get_name() resource = package.get_resource('data') self.apply_resource_properties(resource, indicator) resource.schema = schema resource.path = 'data.csv' return package def write_indicator_package(self, package, descriptor_path, indicator, language=None): package.to_json(descriptor_path) # Workaround for https://github.com/frictionlessdata/frictionless-py/issues/788 os.chmod(descriptor_path, 0o644) def sort_data_schema(self, schema): if self.sorting == 'alphabetical': schema.fields.sort(key=lambda field: field.name) for field in schema.fields: if 'enum' in field.constraints: field.constraints['enum'].sort() def 
translate_data_schema(self, schema, language): groups_common = ['data'] for field in schema.fields: groups = groups_common + [field.name] # Translate the field titles. field.title = self.translation_helper.translate( field.name, language, groups) # Translate the values. if 'enum' in field.constraints: field.constraints['enum'] = [ self.translation_helper.translate(value, language, groups) for value in field.constraints['enum'] ] def get_properties_per_indicator(self, indicator, property_type): if indicator is None: return {} per_indicator = indicator.get_meta_field_value( self.per_indicator_metadata_field) if per_indicator is not None and property_type in per_indicator: return per_indicator[property_type] else: return {} def validate(self): """Validate the data for the indicators.""" status = True if self.data_schema is not None: for inid in self.indicators: status = status & self.data_schema.validate( self.indicators[inid]) return status def get_documentation_title(self): return 'Data packages' def get_documentation_content(self, languages=None, baseurl=''): if languages is None: languages = [''] indicator_ids = self.get_documentation_indicator_ids() descriptor_endpoint = '{language}/{base_folder}/{indicator_id}/datapackage.json' data_endpoint = '{language}/{base_folder}/{indicator_id}/data.csv' all_endpoint = '{language}/{base_folder}/all.json' output = '<p>' + self.get_documentation_description( ) + ' Examples are below:<p>' output += '<ul>' for language in languages: for indicator_id in indicator_ids: descriptor_path = descriptor_endpoint.format( language=language, indicator_id=indicator_id, base_folder=self.get_base_folder()) data_path = data_endpoint.format( language=language, indicator_id=indicator_id, base_folder=self.get_base_folder()) output += '<li>' + indicator_id + ':<ul>' output += '<li><a href="' + baseurl + descriptor_path + '">' + descriptor_path + '</a></li>' output += '<li><a href="' + baseurl + data_path + '">' + data_path + '</a></li>' output 
+= '</ul></li>' all_path = all_endpoint.format(language=language, base_folder=self.get_base_folder()) output += '<li>All indicators: <a href="' + baseurl + all_path + '">' + all_path + '</a></li>' output += '<li>etc...</li>' output += '</ul>' return output def get_documentation_description(self): return """This output produces <a href="https://specs.frictionlessdata.io/data-package/">