Example #1
 def test_create(self, tmp_path):
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "test_method"})
     input_data = [{"path": d} for d in INPUT_DATA]
     method.add_data(source=input_data)
     method.save(created_by="Gavin Chait", hide_uuid=True)
     test_source = DIRECTORY + "test_method.json"
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA, method=CoreScript().load_json(test_source))
     assert METHOD["input_data"][0]["checksum"] == method.get.input_data[0].checksum
     assert METHOD["input_data"][1]["checksum"] == method.get.input_data[1].checksum
     assert METHOD["input_data"][2]["checksum"] == method.get.input_data[2].checksum
Example #2
def _test_script_world_bank(DIRECTORY, script, rebase=False):
    """World Bank urban population time-series, in wide format. Demonstrate define a schema,
    create a method, import data, perform actions and generate a schema-compliant output.
    From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL"""
    SCHEMA_NAME = "/data/urban_population.json"
    SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
    SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
    SOURCE_DATA = SOURCE_DIRECTORY + "/data/working_test_world_bank_data.xlsx"
    method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
    method.set({"name": "urban_population_method"})
    method.add_data(source=SOURCE_DATA)
    scripts = []
    if rebase:
        scripts = [
            "DEBLANK",
            "DEDUPE",
            "REBASE < [2]",
        ]
    # Run the test script
    scripts.append(script)
    source_data = method.get.input_data[0]
    method.add_actions(scripts,
                       source_data.uuid.hex,
                       sheet_name=source_data.sheet_name)
    method.transform(source_data)
    return True
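# Usage sketch (assumed, not part of the original tests): the helper above returns
# True on success, so a test can call it with a pytest tmp_path and a single
# action-script string. "DEBLANK" is reused from the scripts shown in these
# examples; any other action script could be substituted.
def test_world_bank_script(tmp_path):
    DIRECTORY = str(tmp_path) + "/"
    # rebase=True first applies the DEBLANK, DEDUPE and "REBASE < [2]" preamble.
    assert _test_script_world_bank(DIRECTORY, "DEBLANK", rebase=True)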
Example #3
 def test_merge_and_build(self, tmp_path):
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "test_method"})
     input_data = [{"path": d} for d in INPUT_DATA]
     method.add_data(source=input_data)
     # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]"
     merge_reference = [
         {"source_hex": method.get.input_data[2].uuid.hex, "key_column": "Property ref no"},
         {"source_hex": method.get.input_data[1].uuid.hex, "key_column": "Property Reference Number"},
         {"source_hex": method.get.input_data[0].uuid.hex, "key_column": "Property Reference Number"},
     ]
     merge_terms = ", ".join([f"'{m['key_column']}'::'{m['source_hex']}'" for m in merge_reference])
     merge_script = f"MERGE < [{merge_terms}]"
     method.merge(merge_script)
     schema_scripts = whyqd.parsers.LegacyScript().parse_legacy_method(
         version="1", schema=SCHEMA, source_path=METHOD_SOURCE_V1
     )
     source_data = method.get.working_data
     method.add_actions(schema_scripts, source_data.uuid.hex)
     method.build()
     citation = {
         "author": "Gavin Chait",
         "month": "feb",
         "year": 2020,
         "title": "Portsmouth City Council normalised database of commercial ratepayers",
         "url": "https://github.com/whythawk/whyqd/tree/master/tests/data",
     }
     method.set_citation(citation)
     test_citation = method.get_citation()
     del test_citation["input_sources"]
     del test_citation["restructured_data"]
     assert test_citation == citation
     assert method.save(created_by="Gavin Chait", hide_uuid=True)
Example #4
 def test_tutorial(self, tmp_path):
     """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date,
     actions and perform a merge, plus filter the final result."""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "test_method"})
     input_data = [{"path": d} for d in INPUT_DATA]
     method.add_data(source=input_data, get_row_count=True)
     # reorder
     input_order = [m.uuid.hex for m in method.get.input_data]
     input_order.reverse()
     method.reorder_data(order=input_order)
     # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]"
     merge_reference = [
         {
             "source_hex": method.get.input_data[0].uuid.hex,
             "key_column": "Property ref no"
         },
         {
             "source_hex": method.get.input_data[1].uuid.hex,
             "key_column": "Property Reference Number"
         },
         {
             "source_hex": method.get.input_data[2].uuid.hex,
             "key_column": "Property Reference Number"
         },
     ]
     merge_terms = ", ".join([
         f"'{m['key_column']}'::'{m['source_hex']}'"
         for m in merge_reference
     ])
     merge_script = f"MERGE < [{merge_terms}]"
     method.merge(merge_script)
     schema_scripts = [
         "NEW > 'la_code' < ['E06000044']",
         "RENAME > 'ba_ref' < ['Property ref no']",
         "ORDER > 'prop_ba_rates' < ['Current Rateable Value_x', 'Current Rateable Value_y', 'Current Rateable Value']",
         "ORDER > 'occupant_name' < ['Primary Liable party name_x', 'Primary Liable party name_y', 'Primary Liable party name']",
         "ORDER > 'postcode' < ['Full Property Address_x', 'Full Property Address_y', 'Full Property Address']",
         "CATEGORISE > 'occupation_state' < [+ 'Current Property Exemption Code', + 'Current Relief Type']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state'::False < 'Current Property Exemption Code'::['EPRN', 'EPRI', 'VOID', 'EPCH', 'LIQUIDATE', 'DECEASED', 'PROHIBITED', 'BANKRUPT']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state'::False < 'Current Relief Type'::['Empty Property Rate Non-Industrial', 'Empty Property Rate Industrial', 'Empty Property Rate Charitable']",
         "CATEGORISE > 'occupation_state_reliefs' < [+ 'Current Property Exemption Code', + 'Current Relief Type']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'small_business' < 'Current Relief Type'::['Small Business Relief England', 'Sbre Extension For 12 Months', 'Supporting Small Business Relief']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'enterprise_zone' < 'Current Property Exemption Code'::['INDUSTRIAL']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'vacancy' < 'Current Property Exemption Code'::['EPRN', 'EPRI', 'VOID', 'EPCH', 'LIQUIDATE', 'DECEASED', 'PROHIBITED', 'BANKRUPT']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'vacancy' < 'Current Relief Type'::['Empty Property Rate Non-Industrial', 'Empty Property Rate Industrial', 'Empty Property Rate Charitable']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'retail' < 'Current Relief Type'::['Retail Discount']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'exempt' < 'Current Property Exemption Code'::['C', 'LOW RV', 'LAND']",
         "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'other' < 'Current Relief Type'::['Sports Club (Registered CASC)', 'Mandatory']",
         "ORDER_NEW > 'occupation_state_date' < ['Current Prop Exemption Start Date' + 'Current Prop Exemption Start Date', 'Current Relief Award Start Date' + 'Current Relief Award Start Date', 'Account Start date_x' + 'Account Start date_x', 'Account Start date_y' + 'Account Start date_y']",
     ]
     source_data = method.get.working_data
     method.add_actions(schema_scripts, source_data.uuid.hex)
     filter_script = "FILTER_AFTER > 'occupation_state_date'::'2010-01-01'"
     method.add_actions(filter_script, source_data.uuid.hex)
     method.build()
     method.validate()
Example #5
 def test_method_conversion_v1(self, tmp_path):
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "test_method"})
     input_data = {"path": INPUT_DATA_ONESHOT}
     method.add_data(source=input_data)
     schema_scripts = whyqd.parsers.LegacyScript().parse_legacy_method(
         version="1", schema=SCHEMA, source_path=METHOD_SOURCE_V1
     )
     source_data = method.get.input_data[0]
     method.add_actions(schema_scripts, source_data.uuid.hex)
     method.transform(source_data)
     method.save(created_by="Gavin Chait", hide_uuid=True)
Example #6
def _test_script_portsmouth(DIRECTORY, script, test_filter=False):
    """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date,
    actions and perform a merge, plus filter the final result."""
    SCHEMA_NAME = "/data/test_schema.json"
    SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
    SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
    INPUT_DATA = SOURCE_DIRECTORY + "/data/working_test_data.xlsx"
    if test_filter:
        INPUT_DATA = SOURCE_DIRECTORY + "/data/restructured_test_data.xlsx"
    method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
    method.set({"name": "test_method"})
    method.add_data(source=INPUT_DATA)
    source_data = method.get.input_data[0]
    method.add_actions(script,
                       source_data.uuid.hex,
                       sheet_name=source_data.sheet_name)
    method.transform(source_data)
    return True
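# Usage sketch (assumed, not part of the original tests): exercise the helper above
# with action scripts taken from the other Portsmouth examples. Both script strings
# are assumptions about which actions suit the two source files, not definitive cases.
def test_portsmouth_script(tmp_path):
    DIRECTORY = str(tmp_path) + "/"
    assert _test_script_portsmouth(DIRECTORY, "NEW > 'la_code' < ['E06000044']")
    # With test_filter=True the pre-restructured data is loaded, so a FILTER script
    # against a schema field can be tested (see Example #4).
    assert _test_script_portsmouth(
        DIRECTORY, "FILTER_AFTER > 'occupation_state_date'::'2010-01-01'", test_filter=True
    )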
Example #7
 def test_merge(self, tmp_path):
     """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date,
     actions and perform a merge, plus filter the final result."""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     SCHEMA_NAME = "/data/test_schema.json"
     SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME
     SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
     INPUT_DATA = [
         SOURCE_DIRECTORY + "/data/raw_E06000044_014_0.XLSX",
         SOURCE_DIRECTORY + "/data/raw_E06000044_014_1.XLSX",
         SOURCE_DIRECTORY + "/data/raw_E06000044_014_2.XLSX",
     ]
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "test_method"})
     input_data = [{"path": d} for d in INPUT_DATA]
     method.add_data(source=input_data)
     # reorder
     input_order = [m.uuid.hex for m in method.get.input_data]
     input_order.reverse()
     method.reorder_data(order=input_order)
     # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]"
     merge_reference = [
         {
             "source_hex": method.get.input_data[0].uuid.hex,
             "key_column": "Property ref no"
         },
         {
             "source_hex": method.get.input_data[1].uuid.hex,
             "key_column": "Property Reference Number"
         },
         {
             "source_hex": method.get.input_data[2].uuid.hex,
             "key_column": "Property Reference Number"
         },
     ]
     merge_terms = ", ".join([
         f"'{m['key_column']}'::'{m['source_hex']}'"
         for m in merge_reference
     ])
     merge_script = f"MERGE < [{merge_terms}]"
     method.merge(merge_script)
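     # Continuation sketch (assumed, not part of the original test; it follows the
     # pattern of Examples #3 and #4): after the merge, the interim table is exposed
     # as working data and receives the restructuring actions before method.build().
     source_data = method.get.working_data
     method.add_actions(["RENAME > 'ba_ref' < ['Property ref no']"], source_data.uuid.hex)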
Example #8
 def test_tutorial(self, tmp_path):
     """World Bank urban population time-series, in wide format. Demonstrate define a schema,
     create a method, import data, perform actions and generate a schema-compliant output.
     From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL"""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     # DEFINE SCHEMA
     details = {
         "name": "urban_population",
         "title": "Urban population",
         "description": "Urban population refers to people living in urban areas as defined by national statistical offices. It is calculated using World Bank population estimates and urban ratios from the United Nations World Urbanization Prospects. Aggregation of urban and rural population may not add up to total population because of different country coverages.",
     }
     schema = whyqd.Schema()
     schema.set(details)
     fields = [
         {
             "name": "Indicator Code",
             "title": "Indicator Code",
             "type": "string",
             "description": "World Bank code reference for Indicator Name.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Country Name",
             "title": "Country Name",
             "type": "string",
             "description": "Official country names.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Country Code",
             "title": "Country Code",
             "type": "string",
             "description": "UN ISO 3-letter country code.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Indicator Name",
             "title": "Indicator Name",
             "type": "string",
             "description": "Indicator described in the data series.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Year",
             "title": "Year",
             "type": "year",
             "description": "Year of release.",
             "constraints": {
                 "required": True
             },
         },
         {
             "name": "Values",
             "title": "Values",
             "type": "number",
             "description": "Value for the Year and Indicator Name.",
             "constraints": {
                 "required": True
             },
         },
     ]
     for field in fields:
         schema.add_field(field)
     schema.save(DIRECTORY)
     # CREATE METHOD
     SCHEMA_SOURCE = DIRECTORY + "urban_population.json"
     SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "urban_population_method"})
     input_data = {"path": SOURCE_DATA}
     method.add_data(source=input_data)
     # Define actions
     schema_scripts = [
         "DEBLANK",
         "DEDUPE",
         "REBASE < [2]",
     ]
     source_data = method.get.input_data[0]
     method.add_actions(schema_scripts,
                        source_data.uuid.hex,
                        sheet_name=source_data.sheet_name)
     # df = method.transform(source_data)
     source_data = method.get.input_data[0]
     source_columns = [c.name for c in source_data.columns]
     schema_scripts = [
         f"PIVOT_LONGER > {source_columns[4:]}",
         "RENAME > 'indicator_code' < ['Indicator Code']",
         "RENAME > 'indicator_name' < ['Indicator Name']",
         "RENAME > 'country_code' < ['Country Code']",
         "RENAME > 'country_name' < ['Country Name']",
         "RENAME > 'year' < ['PIVOT_LONGER_names_idx_4']",
         "RENAME > 'values' < ['PIVOT_LONGER_values_idx_5']",
     ]
     method.add_actions(schema_scripts,
                        source_data.uuid.hex,
                        sheet_name=source_data.sheet_name)
     # Unambiguous deletion so they are not part of the research record
     for unwanted_data in method.get.input_data[1:]:
         method.remove_data(unwanted_data.uuid.hex,
                            sheet_name=unwanted_data.sheet_name)
     method.build()
     assert method.validate()
Example #9
 def test_tutorial(self, tmp_path):
     """World Bank Human Development Report 2007 - 2008 in Cthulhu format. Demonstrate define a schema,
     create a method, import data, perform actions and generate a schema-compliant output.
     Copies at: https://github.com/whythawk/data-wrangling-and-validation/tree/master/data/lesson-spreadsheet"""
     DIRECTORY = str(tmp_path) + "/"
     CoreScript().check_path(DIRECTORY)
     # DEFINE SCHEMA
     details = {
         "name": "human-development-report",
         "title": "UN Human Development Report 2007 - 2008",
         "description": """
             In 1990 the first Human Development Report introduced a new approach for
             advancing human wellbeing. Human development – or the human development approach - is about
             expanding the richness of human life, rather than simply the richness of the economy in which
             human beings live. It is an approach that is focused on people and their opportunities and choices.""",
     }
     schema = whyqd.Schema()
     schema.set(details)
     fields = [
         {
             "name": "year",
             "title": "Year",
             "type": "year",
             "description": "Year of release.",
         },
         {
             "name": "country_name",
             "title": "Country Name",
             "type": "string",
             "description": "Official country names.",
             "constraints": {"required": True},
         },
         {
             "name": "indicator_name",
             "title": "Indicator Name",
             "type": "string",
             "description": "Indicator described in the data series.",
         },
         {
             "name": "values",
             "title": "Values",
             "type": "number",
             "description": "Value for the Year and Indicator Name.",
             "constraints": {"required": True},
         },
         {
             "name": "reference",
             "title": "Reference",
             "type": "string",
             "description": "Reference to data source.",
         },
     ]
     for field in fields:
         schema.add_field(field)
     schema.save(DIRECTORY)
     # CREATE METHOD
     SCHEMA_SOURCE = DIRECTORY + "human-development-report.json"
     SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE)
     method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA)
     method.set({"name": "human-development-report-method"})
     input_data = {"path": SOURCE_DATA}
     method.add_data(source=input_data)
     # Define actions
     schema_scripts = [
         "DEBLANK",
         "DEDUPE",
         "REBASE < [11]",
     ]
     source_data = method.get.input_data[0]
     method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name)
     # Get the index
     source_data = method.get.input_data[0]
     df = method.transform(source_data)
     schema_scripts = [
         f"DELETE_ROWS < {[int(i) for i in np.arange(144, df.index[-1]+1)]}",
         "RENAME_ALL > ['HDI rank', 'Country', 'Human poverty index (HPI-1) - Rank;;2008', 'Reference 1', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Reference 2', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Reference 3', 'Population not using an improved water source (%);;2004', 'Reference 4', 'Children under weight for age (% under age 5);;1996-2005', 'Reference 5', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Reference 6', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Reference 7', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'Reference 8', 'HPI-1 rank minus income poverty rank;;2008']",
         "PIVOT_CATEGORIES > ['HDI rank'] < [14,44,120]",
         "RENAME_NEW > 'HDI Category'::['PIVOT_CATEGORIES_idx_20_0']",
         "PIVOT_LONGER > = ['HDI rank', 'HDI Category', 'Human poverty index (HPI-1) - Rank;;2008', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Population not using an improved water source (%);;2004', 'Children under weight for age (% under age 5);;1996-2005', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'HPI-1 rank minus income poverty rank;;2008']",
         "SPLIT > ';;'::['PIVOT_LONGER_names_idx_9']",
         "DEBLANK",
         "DEDUPE",
     ]
     method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name)
     # Get the column list
     reference_columns = [c.name for c in method.get.input_data[0].columns if c.name.startswith("Reference")]
     schema_scripts = [
         f"JOIN > 'reference' < {reference_columns}",
         "RENAME > 'indicator_name' < ['SPLIT_idx_11_0']",
         "RENAME > 'country_name' < ['Country']",
         "RENAME > 'year' < ['SPLIT_idx_12_1']",
         "RENAME > 'values' < ['PIVOT_LONGER_values_idx_10']",
     ]
     method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name)
     method.build()
     assert method.validate()