def test_find_replace():
    """find_replace should map each single-letter 'y' value to its fruit name."""
    from dataflows import find_replace

    replacements = [
        dict(name='y', patterns=[
            dict(find='a', replace='Apple'),
            dict(find='b', replace='Banana'),
            dict(find='c', replace='Coconut'),
        ]),
    ]
    pipeline = Flow(data, find_replace(replacements))
    results, _, _ = pipeline.results()
    replaced = [row['y'] for row in results[0]]
    assert replaced == ['Apple', 'Banana', 'Coconut']
'keys': { 'Date': r'\1' } }] extra_keys = [{'name': 'Date', 'type': 'string'}] extra_value = {'name': 'Case', 'type': 'number'} Flow( load(f'{BASE_URL}{CONFIRMED}'), load(f'{BASE_URL}{RECOVERED}'), load(f'{BASE_URL}{DEATH}'), unpivot(unpivoting_fields, extra_keys, extra_value), find_replace([{ 'name': 'Date', 'patterns': [{ 'find': '/', 'replace': '-' }] }]), to_normal_date, set_type('Date', type='date', format='%d-%m-%y', resources=None), set_type('Case', type='number', resources=None), join(source_name='time_series_19-covid-Confirmed', source_key=['Province/State', 'Country/Region', 'Date'], source_delete=True, target_name='time_series_19-covid-Deaths', target_key=['Province/State', 'Country/Region', 'Date'], fields=dict(Confirmed={ 'name': 'Case', 'aggregate': 'first' })), join(source_name='time_series_19-covid-Recovered',
row["Long"] = row.get("Long", "-106.3468") yield {**expected, **row} Flow( load(f"{BASE_URL}{CONFIRMED}"), load(f"{BASE_URL}{RECOVERED}"), load(f"{BASE_URL}{DEATH}"), load(f"{BASE_URL}{CONFIRMED_US}"), load(f"{BASE_URL}{DEATH_US}"), checkpoint("load_data"), unpivot(unpivoting_fields, extra_keys, extra_value), find_replace([{ "name": "Date", "patterns": [{ "find": "/", "replace": "-" }] }]), to_normal_date, set_type("Date", type="date", format="%d-%m-%y", resources=None), set_type("Case", type="number", resources=None), join( source_name="time_series_covid19_confirmed_global", source_key=["Province/State", "Country/Region", "Date"], source_delete=True, target_name="time_series_covid19_deaths_global", target_key=["Province/State", "Country/Region", "Date"], fields=dict(Confirmed={ "name": "Case", "aggregate": "first"
def flow(parameters):
    """Build a Flow applying find_replace per the given processor parameters.

    ``parameters`` is a mapping; ``'fields'`` (default ``[]``) supplies the
    find/replace field specs and ``'resources'`` (default ``None``) limits
    which resources are processed.
    """
    field_specs = parameters.get('fields', [])
    target_resources = parameters.get('resources')
    return Flow(
        find_replace(field_specs, resources=target_resources)
    )
def data_pull_csv():
    """Pull the three COVID time-series CSVs, reshape and combine them.

    Loads the Confirmed/Recovered/Death sheets, unpivots the date-named
    columns into Date/Case rows, normalizes dates, joins everything onto
    the Deaths resource, and dumps the combined resource to RAW_OUTPUT_CSV.
    """
    # Every column whose header looks like a date (e.g. "1/22/20") is
    # unpivoted: the header becomes the "Date" key, the cell the "Case" value.
    date_columns = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {"Date": r"\1"},
    }]
    date_key = [{"name": "Date", "type": "string"}]
    case_value = {"name": "Case", "type": "number"}

    Flow(
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(date_columns, date_key, case_value),
        # Dates arrive as "m/d/y"; swap the separator so the "%d-%m-%y"
        # format below (applied after to_normal_date) can parse them.
        find_replace([{
            "name": "Date",
            "patterns": [{"find": "/", "replace": "-"}],
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        # Fold the Confirmed counts into the Deaths resource as "Confirmed".
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Confirmed={"name": "Case", "aggregate": "first"}),
        ),
        # Likewise fold the Recovered counts in as "Recovered".
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Recovered={"name": "Case", "aggregate": "first"}),
        ),
        # The surviving "Case" column on the target holds the death counts;
        # rename it by formatting it into a "Deaths" field, then drop it.
        add_computed_field(
            target={"name": "Deaths", "type": "number"},
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ).results()[0]
skip_rows=[ i + 1 for i in range(6 + datetime.datetime.now().year - 1966 + 3) ] + [-1], headers=[ 'Year', 'Number (thousands)', 'Lowest', 'Second', 'Third', 'Fourth', 'Top 5 percent' ], ), find_replace(fields=[{ 'name': 'Year', 'patterns': [{ 'find': '(\s?\(\d+\))|(\.0)', 'replace': '' }] }, { 'name': 'Fourth', 'patterns': [{ 'find': '\+|', 'replace': '' }] }], resources=0), update_resource( 0, **{ 'name': 'household-income-us-historical', 'path': 'data/household-income-us-historical.csv', 'dpp:streaming': True }), set_type('Year', type='year'), set_type('^(?!Y).+', type='number'),
{% if input == 'sql' %} load('{{input_url}}', table='{{input_db_table}}'), {% endif %} {% if input == 'other' %} {% endif %} # Process them (if necessary) {% if 'sort' in processing %} sort_rows('{field_name}'), # Key is a Python format string or a list of field names {% endif %} {% if 'filter' in processing %} filter_rows(), {% endif %} {% if 'find_replace' in processing %} find_replace([ dict(name='field_name', patterns=[ dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'), ]) ]), {% endif %} {% if 'delete_fields' in processing %} delete_fields(['field_name']), # Pass a list of field names to delete from the data {% endif %} {% if 'set_type' in processing %} set_type('field_name', type='number', constraints=dict(minimum=3)), # There are quite a few options you can use here # Take a look at https://frictionlessdata.io/specs/table-schema/ # Or you can simply use validate() here instead {% endif %} {% if 'unpivot' in processing %} unpivot(unpivot_fields, extra_keys, extra_value), # See documentation on the meaning of each of these parameters {% endif %} {% if 'custom' in processing %}