Example #1
def _test_script_world_bank(DIRECTORY, script, rebase=False):
    """World Bank urban population time-series, in wide format. Demonstrate define a schema,
    create a method, import data, perform actions and generate a schema-compliant output.
    From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL"""
    SCHEMA_NAME = "/data/urban_population.json"
    SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
    SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
    SOURCE_DATA = SOURCE_DIRECTORY + "/data/working_test_world_bank_data.xlsx"
    method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
    method.set({"name": "urban_population_method"})
    method.add_data(source=SOURCE_DATA)
    scripts = []
    if rebase:
        scripts = [
            "DEBLANK",
            "DEDUPE",
            "REBASE < [2]",
        ]
    # Run the test script
    scripts.append(script)
    source_data = method.get.input_data[0]
    method.add_actions(scripts,
                       source_data.uuid.hex,
                       sheet_name=source_data.sheet_name)
    method.transform(source_data)
    return True
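For context, a minimal sketch of how a helper like this might be driven from pytest; the parametrisation below is an assumption, though the script strings reuse the whyqd action grammar shown in the other examples.

import pytest

# Hypothetical driver for the helper above. tmp_path is the standard pytest
# fixture; the script strings are taken from the other examples in this listing.
@pytest.mark.parametrize(
    "script, rebase",
    [
        ("DEBLANK", False),
        ("DEDUPE", False),
        ("RENAME > 'indicator_code' < ['Indicator Code']", True),
    ],
)
def test_world_bank_scripts(tmp_path, script, rebase):
    assert _test_script_world_bank(str(tmp_path) + "/", script, rebase=rebase)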
Example #2
 def test_build(self):
     s = whyqd.Schema()
     details = {"name": data["name"], "title": data["title"], "description": data["description"]}
     s.set(schema=details)
     for field in data["fields"]:
         s.add_field(field=field)
     # also this https://github.com/samuelcolvin/pydantic/issues/1283#issuecomment-594041870
     # foo_excludes = {idx: {"id"} for idx in range(len(my_bar.foos))}
     # my_bar.dict(exclude={"foos": foo_excludes})
     # https://pydantic-docs.helpmanual.io/usage/exporting_models/#advanced-include-and-exclude
     schema_exclude = {
         f_idx: (
             {
                 "uuid": ...,
                 "constraints": {"category": {c_idx: {"uuid"} for c_idx in range(len(f.constraints.category))}},
             }
             if f.constraints
             else {"uuid"}
         )
         for f_idx, f in enumerate(s.get.fields)
     }
     d = s.get.dict(
         by_alias=True, exclude_defaults=True, exclude_none=True, exclude={"uuid": ..., "fields": schema_exclude}
     )
     assert d == data
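The exclude structure above follows pydantic's nested include/exclude mechanism referenced in the comments. A self-contained sketch of the same pattern, assuming the pydantic v1 .dict() API that these tests use; the models here are illustrative, not part of whyqd:

from typing import List
from pydantic import BaseModel

class Item(BaseModel):
    uuid: str
    name: str

class Container(BaseModel):
    uuid: str
    items: List[Item]

c = Container(uuid="a1", items=[Item(uuid="b2", name="dog"), Item(uuid="c3", name="cat")])
# Exclude the container's own uuid and the uuid of every list element,
# mirroring the schema_exclude dictionary built in test_build above.
excluded = c.dict(exclude={"uuid": ..., "items": {i: {"uuid"} for i in range(len(c.items))}})
# excluded == {"items": [{"name": "dog"}, {"name": "cat"}]}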
Example #3
def _test_script_portsmouth(DIRECTORY, script, test_filter=False):
    """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date,
    actions and perform a merge, plus filter the final result."""
    SCHEMA_NAME = "/data/test_schema.json"
    SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
    SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
    INPUT_DATA = SOURCE_DIRECTORY + "/data/working_test_data.xlsx"
    if test_filter:
        INPUT_DATA = SOURCE_DIRECTORY + "/data/restructured_test_data.xlsx"
    method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
    method.set({"name": "test_method"})
    method.add_data(source=INPUT_DATA)
    source_data = method.get.input_data[0]
    method.add_actions(script,
                       source_data.uuid.hex,
                       sheet_name=source_data.sheet_name)
    method.transform(source_data)
    return True
Example #4
 def test_create(self):
     s = whyqd.Schema()
     field: whyqd.FieldModel = {
         "name": "test_field",
         "type": "string",
         "constraints": {
             "required": True,
             "category": [
                 {"name": "dog", "description": "A type of mammal"},
                 {"name": "cat", "description": "A different type of mammal"},
                 {"name": "mouse", "description": "A small type of mammal"},
             ],
         },
     }
     schema: whyqd.SchemaModel = {
         "name": "test_schema",
     }
     s.set(schema)
     s.add_field(field)
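A short follow-up check could be appended to the test, reusing the s.get.fields accessor seen in test_build above; the attribute names on the returned field model are assumed from that example:

      created = s.get.fields[0]
      assert created.name == "test_field"
      assert created.constraints is not None
      assert len(created.constraints.category) == 3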
Example #5
 def test_merge(self, tmp_path):
     """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date,
     actions and perform a merge, plus filter the final result."""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     SCHEMA_NAME = "/data/test_schema.json"
     SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
     SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
     INPUT_DATA = [
         SOURCE_DIRECTORY + "/data/raw_E06000044_014_0.XLSX",
         SOURCE_DIRECTORY + "/data/raw_E06000044_014_1.XLSX",
         SOURCE_DIRECTORY + "/data/raw_E06000044_014_2.XLSX",
     ]
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "test_method"})
     input_data = [{"path": d} for d in INPUT_DATA]
     method.add_data(source=input_data)
     # reorder
     input_order = [m.uuid.hex for m in method.get.input_data]
     input_order.reverse()
     method.reorder_data(order=input_order)
     # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]"
     merge_reference = [
         {
             "source_hex": method.get.input_data[0].uuid.hex,
             "key_column": "Property ref no"
         },
         {
             "source_hex": method.get.input_data[1].uuid.hex,
             "key_column": "Property Reference Number"
         },
         {
             "source_hex": method.get.input_data[2].uuid.hex,
             "key_column": "Property Reference Number"
         },
     ]
     merge_terms = ", ".join([
         f"'{m['key_column']}'::'{m['source_hex']}'"
         for m in merge_reference
     ])
     merge_script = f"MERGE < [{merge_terms}]"
     method.merge(merge_script)
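The comment above shows the general script grammar with sheet names; this example builds the script without them. With placeholder hexes, the assembled script looks like:

      # merge_script ~ "MERGE < ['Property ref no'::'<hex_0>', "
      #                "'Property Reference Number'::'<hex_1>', "
      #                "'Property Reference Number'::'<hex_2>']"
      assert merge_script.startswith("MERGE < [")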
Example #6
from pathlib import Path

import whyqd
from whyqd.parsers import CoreScript

SCHEMA_NAME = "/data/test_schema.json"
SOURCE_DIRECTORY = str(Path(__file__).resolve().parent)
SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
INPUT_DATA = [
    SOURCE_DIRECTORY + "/data/raw_E06000044_014_0.XLSX",
    SOURCE_DIRECTORY + "/data/raw_E06000044_014_1.XLSX",
    SOURCE_DIRECTORY + "/data/raw_E06000044_014_2.XLSX",
]


class TestMethod:
    def test_tutorial(self, tmp_path):
        """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date,
        actions and perform a merge, plus filter the final result."""
        DIRECTORY = str(tmp_path) + "/"
        CoreScript().check_path(DIRECTORY)
        method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
        method.set({"name": "test_method"})
        input_data = [{"path": d} for d in INPUT_DATA]
        method.add_data(source=input_data, get_row_count=True)
        # reorder
        input_order = [m.uuid.hex for m in method.get.input_data]
        input_order.reverse()
        method.reorder_data(order=input_order)
        # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]"
Example #7
 def test_tutorial(self, tmp_path):
     """World Bank urban population time-series, in wide format. Demonstrate define a schema,
     create a method, import data, perform actions and generate a schema-compliant output.
     From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL"""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     # DEFINE SCHEMA
      details = {
          "name": "urban_population",
          "title": "Urban population",
          "description": "Urban population refers to people living in urban areas as defined by national statistical offices. It is calculated using World Bank population estimates and urban ratios from the United Nations World Urbanization Prospects. Aggregation of urban and rural population may not add up to total population because of different country coverages.",
      }
     schema = whyqd.Schema()
     schema.set(details)
     fields = [
         {
             "name": "Indicator Code",
             "title": "Indicator Code",
             "type": "string",
             "description": "World Bank code reference for Indicator Name.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Country Name",
             "title": "Country Name",
             "type": "string",
             "description": "Official country names.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Country Code",
             "title": "Country Code",
             "type": "string",
             "description": "UN ISO 3-letter country code.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Indicator Name",
             "title": "Indicator Name",
             "type": "string",
             "description": "Indicator described in the data series.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Year",
             "title": "Year",
             "type": "year",
             "description": "Year of release.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Values",
             "title": "Values",
             "type": "number",
             "description": "Value for the Year and Indicator Name.",
             "constraints": {
                 "required": True
             },
         },
     ]
     for field in fields:
         schema.add_field(field)
     schema.save(DIRECTORY)
     # CREATE METHOD
     SCHEMA_SOURCE = DIRECTORY + "urban_population.json"
     SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "urban_population_method"})
     input_data = {"path": SOURCE_DATA}
     method.add_data(source=input_data)
     # Define actions
     schema_scripts = [
         "DEBLANK",
         "DEDUPE",
         "REBASE < [2]",
     ]
     source_data = method.get.input_data[0]
     method.add_actions(schema_scripts,
                        source_data.uuid.hex,
                        sheet_name=source_data.sheet_name)
     # df = method.transform(source_data)
     source_data = method.get.input_data[0]
     source_columns = [c.name for c in source_data.columns]
     schema_scripts = [
         f"PIVOT_LONGER > {source_columns[4:]}",
         "RENAME > 'indicator_code' < ['Indicator Code']",
         "RENAME > 'indicator_name' < ['Indicator Name']",
         "RENAME > 'country_code' < ['Country Code']",
         "RENAME > 'country_name' < ['Country Name']",
         "RENAME > 'year' < ['PIVOT_LONGER_names_idx_4']",
         "RENAME > 'values' < ['PIVOT_LONGER_values_idx_5']",
     ]
     method.add_actions(schema_scripts,
                        source_data.uuid.hex,
                        sheet_name=source_data.sheet_name)
      # Explicitly remove the unused worksheets so they are not part of the research record
     for unwanted_data in method.get.input_data[1:]:
         method.remove_data(unwanted_data.uuid.hex,
                            sheet_name=unwanted_data.sheet_name)
     method.build()
     assert method.validate()
Example #8
 def test_load(self):
     s = whyqd.Schema(source=source)
     d = s.get.dict(by_alias=True, exclude_defaults=True, exclude_none=True, exclude_unset=True)
     assert d == data
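test_load (like test_build above) depends on module-level source and data values that fall outside the excerpt. A plausible setup, following the layout of Example #6; the file name, and the assumption that data is the schema JSON loaded as a dict, are illustrative only:

import json
from pathlib import Path

SOURCE_DIRECTORY = str(Path(__file__).resolve().parent)
# Hypothetical fixtures: the saved schema definition and its expected dict form.
source = SOURCE_DIRECTORY + "/data/test_schema.json"
with open(source) as f:
    data = json.load(f)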
Example #9
 def test_tutorial(self, tmp_path):
     """World Bank Human Development Report 2007 - 2008 in Cthulhu format. Demonstrate define a schema,
     create a method, import data, perform actions and generate a schema-compliant output.
     Copies at: https://github.com/whythawk/data-wrangling-and-validation/tree/master/data/lesson-spreadsheet"""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     # DEFINE SCHEMA
     details = {
         "name": "human-development-report",
         "title": "UN Human Development Report 2007 - 2008",
         "description": """
             In 1990 the first Human Development Report introduced a new approach for
             advancing human wellbeing. Human development – or the human development approach - is about
             expanding the richness of human life, rather than simply the richness of the economy in which
             human beings live. It is an approach that is focused on people and their opportunities and choices.""",
     }
     schema = whyqd.Schema()
     schema.set(details)
     fields = [
         {
             "name": "year",
             "title": "Year",
             "type": "year",
             "description": "Year of release.",
         },
         {
             "name": "country_name",
             "title": "Country Name",
             "type": "string",
             "description": "Official country names.",
             "constraints": {"required": True},
         },
         {
             "name": "indicator_name",
             "title": "Indicator Name",
             "type": "string",
             "description": "Indicator described in the data series.",
         },
         {
             "name": "values",
             "title": "Values",
             "type": "number",
             "description": "Value for the Year and Indicator Name.",
             "constraints": {"required": True},
         },
         {
             "name": "reference",
             "title": "Reference",
             "type": "string",
             "description": "Reference to data source.",
         },
     ]
     for field in fields:
         schema.add_field(field)
     schema.save(DIRECTORY)
     # CREATE METHOD
     SCHEMA_SOURCE = DIRECTORY + "human-development-report.json"
     SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "human-development-report-method"})
     input_data = {"path": SOURCE_DATA}
     method.add_data(source=input_data)
     # Define actions
     schema_scripts = [
         "DEBLANK",
         "DEDUPE",
         "REBASE < [11]",
     ]
     source_data = method.get.input_data[0]
     method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name)
     # Get the index
     source_data = method.get.input_data[0]
     df = method.transform(source_data)
     schema_scripts = [
         f"DELETE_ROWS < {[int(i) for i in np.arange(144, df.index[-1]+1)]}",
         "RENAME_ALL > ['HDI rank', 'Country', 'Human poverty index (HPI-1) - Rank;;2008', 'Reference 1', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Reference 2', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Reference 3', 'Population not using an improved water source (%);;2004', 'Reference 4', 'Children under weight for age (% under age 5);;1996-2005', 'Reference 5', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Reference 6', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Reference 7', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'Reference 8', 'HPI-1 rank minus income poverty rank;;2008']",
         "PIVOT_CATEGORIES > ['HDI rank'] < [14,44,120]",
         "RENAME_NEW > 'HDI Category'::['PIVOT_CATEGORIES_idx_20_0']",
         "PIVOT_LONGER > = ['HDI rank', 'HDI Category', 'Human poverty index (HPI-1) - Rank;;2008', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Population not using an improved water source (%);;2004', 'Children under weight for age (% under age 5);;1996-2005', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'HPI-1 rank minus income poverty rank;;2008']",
         "SPLIT > ';;'::['PIVOT_LONGER_names_idx_9']",
         "DEBLANK",
         "DEDUPE",
     ]
     method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name)
     # Get the column list
     reference_columns = [c.name for c in method.get.input_data[0].columns if c.name.startswith("Reference")]
     schema_scripts = [
         f"JOIN > 'reference' < {reference_columns}",
         "RENAME > 'indicator_name' < ['SPLIT_idx_11_0']",
         "RENAME > 'country_name' < ['Country']",
         "RENAME > 'year' < ['SPLIT_idx_12_1']",
         "RENAME > 'values' < ['PIVOT_LONGER_values_idx_10']",
     ]
     method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name)
     method.build()
     assert method.validate()
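The DELETE_ROWS script in this test assumes numpy is imported as np at module level (not shown in the excerpt). The same row list can be built with the standard library, for example:

# Equivalent to the np.arange comprehension used in the DELETE_ROWS script above.
delete_rows = list(range(144, df.index[-1] + 1))
schema_script = f"DELETE_ROWS < {delete_rows}"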