Exemple #1
0
def test_find_replace():
    """Run ``find_replace`` on field ``y`` of the module-level ``data`` and
    check the values that come out of the first resource."""
    from dataflows import find_replace

    # (find, replace) pairs, kept in the order they are handed to find_replace.
    substitutions = [('a', 'Apple'), ('b', 'Banana'), ('c', 'Coconut')]
    y_rules = {
        'name': 'y',
        'patterns': [
            {'find': needle, 'replace': replacement}
            for needle, replacement in substitutions
        ],
    }

    pipeline = Flow(data, find_replace([y_rules]))
    resources, _, _ = pipeline.results()

    observed = [row['y'] for row in resources[0]]
    assert observed == ['Apple', 'Banana', 'Coconut']
    'keys': {
        'Date': r'\1'
    }
}]

extra_keys = [{'name': 'Date', 'type': 'string'}]
extra_value = {'name': 'Case', 'type': 'number'}

Flow(
    load(f'{BASE_URL}{CONFIRMED}'), load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{
            'find': '/',
            'replace': '-'
        }]
    }]), to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_19-covid-Confirmed',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Confirmed={
             'name': 'Case',
             'aggregate': 'first'
         })),
    join(source_name='time_series_19-covid-Recovered',
Exemple #3
0
            row["Long"] = row.get("Long", "-106.3468")
        yield {**expected, **row}


Flow(
    load(f"{BASE_URL}{CONFIRMED}"),
    load(f"{BASE_URL}{RECOVERED}"),
    load(f"{BASE_URL}{DEATH}"),
    load(f"{BASE_URL}{CONFIRMED_US}"),
    load(f"{BASE_URL}{DEATH_US}"),
    checkpoint("load_data"),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        "name": "Date",
        "patterns": [{
            "find": "/",
            "replace": "-"
        }]
    }]),
    to_normal_date,
    set_type("Date", type="date", format="%d-%m-%y", resources=None),
    set_type("Case", type="number", resources=None),
    join(
        source_name="time_series_covid19_confirmed_global",
        source_key=["Province/State", "Country/Region", "Date"],
        source_delete=True,
        target_name="time_series_covid19_deaths_global",
        target_key=["Province/State", "Country/Region", "Date"],
        fields=dict(Confirmed={
            "name": "Case",
            "aggregate": "first"
Exemple #4
0
def flow(parameters):
    """Build a Flow containing a single ``find_replace`` step.

    ``parameters`` is read with ``.get``: ``'fields'`` (defaulting to an empty
    list) is passed as the first argument of ``find_replace`` and
    ``'resources'`` (``None`` when absent) as its ``resources`` keyword.
    """
    field_rules = parameters.get('fields', [])
    target_resources = parameters.get('resources')
    step = find_replace(field_rules, resources=target_resources)
    return Flow(step)
Exemple #5
0
def data_pull_csv():
    """Assemble and run the Flow that combines the three time-series sources.

    The steps, in order: load the CONFIRMED, RECOVERED and DEATH files from
    BASE_URL; unpivot the fields whose names match ``n/n/n`` using ``Date`` as
    the extra key and ``Case`` as the extra value; replace ``/`` with ``-`` in
    ``Date``; apply ``to_normal_date``; set the ``Date`` and ``Case`` types;
    join the Confirmed and Recovered resources into the Deaths resource;
    add a ``Deaths`` field formatted from ``Case`` and delete ``Case``; update
    the Deaths resource's name and path; dump with ``dump_to_path()``.

    Nothing is returned: the first item of ``results()`` is indexed and then
    discarded.
    """
    deaths_resource = "time_series_19-covid-Deaths"
    join_key = ("Province/State", "Country/Region", "Date")

    def join_into_deaths(source_resource, target_field):
        # Fresh key lists per call, exactly as separate join(...) calls would pass.
        return join(
            source_name=source_resource,
            source_key=list(join_key),
            source_delete=True,
            target_name=deaths_resource,
            target_key=list(join_key),
            fields={target_field: {"name": "Case", "aggregate": "first"}},
        )

    # Field-name pattern for unpivot; the captured group becomes the Date key.
    date_column_spec = {
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {"Date": r"\1"},
    }
    date_key = {"name": "Date", "type": "string"}
    case_value = {"name": "Case", "type": "number"}
    slash_to_dash = {
        "name": "Date",
        "patterns": [{"find": "/", "replace": "-"}],
    }

    steps = [
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot([date_column_spec], [date_key], case_value),
        find_replace([slash_to_dash]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        join_into_deaths("time_series_19-covid-Confirmed", "Confirmed"),
        join_into_deaths("time_series_19-covid-Recovered", "Recovered"),
        add_computed_field(
            target={"name": "Deaths", "type": "number"},
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            deaths_resource,
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ]

    # Running the flow is the point; the indexed result is not used.
    Flow(*steps).results()[0]
     skip_rows=[
         i + 1 for i in range(6 + datetime.datetime.now().year - 1966 + 3)
     ] + [-1],
     headers=[
         'Year', 'Number (thousands)', 'Lowest', 'Second', 'Third',
         'Fourth', 'Top 5 percent'
     ],
 ),
 # Clean up the 'Year' and 'Fourth' fields of resource 0.
 # The patterns are written as raw strings so the regex backslashes reach
 # find_replace unchanged without relying on Python's handling of invalid
 # escape sequences (which warns on current interpreters).
 find_replace(
     fields=[
         {
             'name': 'Year',
             'patterns': [{
                 # Optional whitespace plus a parenthesised number, or a
                 # literal ".0".
                 'find': r'(\s?\(\d+\))|(\.0)',
                 'replace': ''
             }]
         },
         {
             'name': 'Fourth',
             'patterns': [{
                 # A literal "+" (the empty alternative is kept as written).
                 'find': r'\+|',
                 'replace': ''
             }]
         },
     ],
     resources=0),
 update_resource(
     0, **{
         'name': 'household-income-us-historical',
         'path': 'data/household-income-us-historical.csv',
         'dpp:streaming': True
     }),
 set_type('Year', type='year'),
 set_type('^(?!Y).+', type='number'),
Exemple #7
0
 {% if input == 'sql' %}
 load('{{input_url}}', table='{{input_db_table}}'),
 {% endif %}
 {% if input == 'other' %}
 {% endif %}
 # Process them (if necessary)
 {% if 'sort' in processing %}
 sort_rows('{field_name}'),  # Key is a Python format string or a list of field names
 {% endif %}
 {% if 'filter' in processing %}
 filter_rows(),
 {% endif %}
 {% if 'find_replace' in processing %}
 find_replace([
     dict(name='field_name',
          patterns=[
              dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'),                     
          ])
 ]),
 {% endif %}
 {% if 'delete_fields' in processing %}
 delete_fields(['field_name']),  # Pass a list of field names to delete from the data
 {% endif %}
 {% if 'set_type' in processing %}
 set_type('field_name', type='number', constraints=dict(minimum=3)),  # There are quite a few options you can use here
                                                                      # Take a look at https://frictionlessdata.io/specs/table-schema/
 # Or you can simply use validate() here instead                                                                             
 {% endif %}
 {% if 'unpivot' in processing %}
 unpivot(unpivot_fields, extra_keys, extra_value),  # See documentation on the meaning of each of these parameters
 {% endif %}
 {% if 'custom' in processing %}