Example #1
def test_sort_reverse_many_rows():
    from dataflows import Flow, sort_rows

    f = Flow(
        ({
            'a': i,
            'b': i % 5
        } for i in range(1000)),
        sort_rows(key='{b}{a}', reverse=True, batch_size=0),
    )
    results, _, _ = f.results()
    results = results[0]
    assert results[0:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
    assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]
Example #2
def test_fullouter_join_dump_different_keys():
    from dataflows import Flow, join, dump_to_path
    from decimal import Decimal

    data1 = [
        {
            "col1": 1.531,
            "col2": "hello"
        },
        {
            "col1": 1.132,
            "col2": "goodbye"
        },
    ]
    data2 = [
        {
            "colA": 1.531,
            "colB": "123"
        },
        {
            "colA": 1.132,
            "colB": 1.132
        },
    ]
    f = Flow(
        data1,
        data2,
        join("res_1", ["col1"],
             "res_2", ["colA"],
             {"col2": {
                 "name": "col2",
                 "aggregate": "first"
             }},
             mode="full-outer"),
        dump_to_path(out_path='out/test_join_dump'),
    )
    results = f.results()[0][0]
    assert results == [
        {
            'colA': Decimal('1.531'),
            'col2': 'hello',
            'colB': '123'
        },
        {
            'colA': Decimal('1.132'),
            'col2': 'goodbye',
            'colB': 1.132
        },
    ]
Example #3
def test_rename_resource():
    from dataflows import Flow, printer, PackageWrapper, ResourceWrapper

    def rename(package: PackageWrapper):
        # Rename the first resource in the datapackage descriptor, then
        # re-yield the package followed by each resource's rows unchanged.
        package.pkg.descriptor['resources'][0]['name'] = 'renamed'
        yield package.pkg
        res_iter = iter(package)
        first: ResourceWrapper = next(res_iter)
        yield first.it
        yield from package

    f = Flow(({'a': x} for x in range(10)), rename, printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['name'] == 'renamed'
Example #4
def test_find_replace():
    from dataflows import Flow, find_replace
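    # `data` is a shared fixture in the original test module; values are
    # reconstructed from the assertions below.
    data = [dict(x=1, y='a'), dict(x=2, y='b'), dict(x=3, y='c')]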
    f = Flow(
        data,
        find_replace([
            dict(name='y',
                 patterns=[
                     dict(find='a', replace='Apple'),
                     dict(find='b', replace='Banana'),
                     dict(find='c', replace='Coconut'),
                 ])
        ]))
    results, _, _ = f.results()
    y = [r['y'] for r in results[0]]
    assert y == ['Apple', 'Banana', 'Coconut']
Example #5
def test_set_type_resources():
    from dataflows import Flow, set_type, validate

    f = Flow([dict(a=str(i)) for i in range(10)],
             [dict(b=str(i)) for i in range(10)],
             [dict(c='0_' + str(i)) for i in range(10)],
             set_type('a', resources='res_[1]', type='integer'),
             set_type('b', resources=['res_2'], type='integer'),
             set_type('[cd]', resources=-1, type='number', groupChar='_'),
             validate())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0][1]['a'] == 1
    assert results[1][3]['b'] == 3
    assert results[2][8]['c'] == 8.0
Example #6
def test_update_resource():
    from dataflows import Flow, printer, update_resource

    f = Flow(
        *[
            ({k: x} for x in range(10))
            for k in 'abcdef'
        ],
        update_resource(['res_1', 'res_3', 'res_5'], source='thewild'),
        printer()
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['source'] == 'thewild'
    assert dp.descriptor['resources'][2]['source'] == 'thewild'
    assert dp.descriptor['resources'][4]['source'] == 'thewild'
Example #7
def test_add_computed_field():
    from dataflows import Flow, add_computed_field
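    # `data` is a shared fixture in the original test module; values are
    # reconstructed from the assertions below.
    data = [dict(x=1, y='a'), dict(x=2, y='b'), dict(x=3, y='c')]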
    f = Flow(
        data,
        add_computed_field([
            dict(source=['x', 'x'], target='xx', operation='multiply'),
            dict(target='f', operation='format', with_='{y} - {x}')
        ]))
    results, dp, stats = f.results()
    results = list(results[0])

    xx = [x['xx'] for x in results]
    f = [x['f'] for x in results]

    assert xx == [1, 4, 9]
    assert f == ['a - 1', 'b - 2', 'c - 3']
Example #8
def test_filter_rows_callable():
    from dataflows import Flow, filter_rows

    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 1, 'b': 4},
            {'a': 2, 'b': 4},
        ],
        filter_rows(condition=lambda row: row['a'] > 1 and row['b'] < 4),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=2, b=3)
    assert len(results[0]) == 1
    assert len(results) == 1
Example #9
def test_example_1():
    from dataflows import Flow

    data = [
        {'data': 'Hello'},
        {'data': 'World'}
    ]


    def lowerData(row):
        row['data'] = row['data'].lower()

    f = Flow(
        data,
        lowerData
    )
    data, *_ = f.results()
Example #10
def test_concatenate_multifield():
    from dataflows import Flow, concatenate

    f = Flow([
        {
            'a': 1,
            'b': 2,
            'c': None
        },
        {
            'a': 2,
            'b': None,
            'c': 3
        },
        {
            'a': 3,
            'c': 4
        },
        {
            'a': 3,
            'b': 6,
            'c': 4
        },
    ], concatenate({
        'f1': ['a'],
        'f2': ['b', 'c'],
    }))
    results, _, _ = f.results()
    assert results[0] == [
        {
            'f1': 1,
            'f2': 2
        },
        {
            'f1': 2,
            'f2': 3
        },
        {
            'f1': 3,
            'f2': 4
        },
        {
            'f1': 3,
            'f2': 4
        },
    ]
Example #11
def test_filter_rows():
    from dataflows import Flow, filter_rows

    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 1, 'b': 4},
            {'a': 2, 'b': 4},
        ],
        filter_rows(equals=[dict(a=1)]),
        filter_rows(not_equals=[dict(b=3)]),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=1, b=4)
    assert len(results[0]) == 1
    assert len(results) == 1
Example #12
def test_duplicate():
    from dataflows import Flow, duplicate

    a = [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 3, 'b': 1},
            {'a': 4, 'b': 1},
        ]

    f = Flow(
        a,
        duplicate(),
    )
    results, _, _ = f.results()
    assert list(results[0]) == a
    assert list(results[1]) == a
Example #13
def test_load_limit_rows():
    from dataflows import Flow, load
    flow = Flow(load('data/beatles.csv', limit_rows=3))
    data = flow.results()[0]
    assert data == [[
        {
            'name': 'john',
            'instrument': 'guitar'
        },
        {
            'name': 'paul',
            'instrument': 'bass'
        },
        {
            'name': 'george',
            'instrument': 'guitar'
        },
    ]]
Example #14
def test_delete_fields_regex():
    from dataflows import Flow, load, delete_fields
    flow = Flow(
        load('data/regex.csv'),
        delete_fields(['temperature (24h)'], regex=False),
    )
    data = flow.results()[0]
    assert data == [[
        {
            'city': 'london'
        },
        {
            'city': 'paris'
        },
        {
            'city': 'rome'
        },
    ]]
Example #15
def test_add_field():
    from dataflows import Flow, add_field
    f = Flow(
        (dict(a=i) for i in range(3)),
        add_field('b', 'string', 'b'),
        add_field('c', 'number'),
        add_field('d', 'boolean', title='mybool'),
    )
    results, dp, _ = f.results()
    assert results == [[{
        'a': 0,
        'b': 'b',
        'c': None,
        'd': None
    }, {
        'a': 1,
        'b': 'b',
        'c': None,
        'd': None
    }, {
        'a': 2,
        'b': 'b',
        'c': None,
        'd': None
    }]]
    assert dp.descriptor == \
         {'profile': 'data-package',
          'resources': [{'name': 'res_1',
                         'path': 'res_1.csv',
                         'profile': 'tabular-data-resource',
                         'schema': {'fields': [{'format': 'default',
                                                'name': 'a',
                                                'type': 'integer'},
                                               {'format': 'default',
                                                'name': 'b',
                                                'type': 'string'},
                                               {'format': 'default',
                                                'name': 'c',
                                                'type': 'number'},
                                               {'format': 'default',
                                                'name': 'd',
                                                'title': 'mybool',
                                                'type': 'boolean'}],
                                    'missingValues': ['']}}]}
Example #16
def test_unpivot():
    from dataflows import Flow, unpivot
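    # `data` is a shared fixture in the original test module; values are
    # reconstructed from the assertions below.
    data = [dict(x=1, y='a'), dict(x=2, y='b'), dict(x=3, y='c')]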
    f = Flow(
        data,
        unpivot(
            [
                dict(
                    name='x',
                    keys=dict(
                        field='x-value'
                    )
                ),
                dict(
                    name='y',
                    keys=dict(
                        field='y-value'
                    )
                ),
            ],
            [
                dict(
                    name='field',
                    type='string'
                )
            ],
            dict(
                name='the-value',
                type='any'
            )
        )
    )
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['field', 'the-value'], r))
        for r in
        [
            ['x-value', 1],
            ['y-value', 'a'],
            ['x-value', 2],
            ['y-value', 'b'],
            ['x-value', 3],
            ['y-value', 'c'],
        ]
    ]
Example #17
def test_sort_rows():
    from dataflows import Flow, sort_rows

    f = Flow(
        [
            {
                'a': 1,
                'b': 3
            },
            {
                'a': 2,
                'b': 3
            },
            {
                'a': 3,
                'b': 1
            },
            {
                'a': 4,
                'b': 1
            },
        ],
        sort_rows(key='{b}{a}'),
    )
    results, _, _ = f.results()
    assert list(results[0]) == [
        {
            'a': 3,
            'b': 1
        },
        {
            'a': 4,
            'b': 1
        },
        {
            'a': 1,
            'b': 3
        },
        {
            'a': 2,
            'b': 3
        },
    ]
Example #18
def test_add_computed_field_func():
    from dataflows import Flow, add_computed_field

    data = [dict(x=i) for i in range(3)]

    f = Flow(
        data,
        add_computed_field([
            dict(target=dict(name='sq', type='integer'),
                 operation=lambda row: row['x']**2),
            dict(target='f', operation='format', with_='{x} - {x}')
        ]))
    results, *_ = f.results()
    results = list(results[0])

    assert results == [
        dict(x=0, sq=0, f='0 - 0'),
        dict(x=1, sq=1, f='1 - 1'),
        dict(x=2, sq=4, f='2 - 2'),
    ]
Example #19
def test_join_full_outer():
    from dataflows import Flow, load, set_type, join
    flow = Flow(
        load('data/population.csv'),
        load('data/cities.csv'),
        join(
            source_name='population',
            source_key=['id'],
            target_name='cities',
            target_key=['id'],
            fields={'population': {
                'name': 'population'
            }},
            mode='full-outer',
        ),
    )
    data = flow.results()[0]
    assert data == [[
        {
            'id': 1,
            'city': 'london',
            'population': 8
        },
        {
            'id': 2,
            'city': 'paris',
            'population': 2
        },
        {
            'id': 3,
            'city': 'rome',
            'population': None
        },
        {
            'id': 4,
            'city': None,
            'population': 3
        },
    ]]
Example #20
def test_set_type_regex():
    from dataflows import Flow, load, set_type
    flow = Flow(
        load('data/regex.csv'),
        set_type('city', type='string'),
        set_type('temperature (24h)', type='integer', regex=False),
    )
    data = flow.results()[0]
    assert data == [[
        {
            'city': 'london',
            'temperature (24h)': 23
        },
        {
            'city': 'paris',
            'temperature (24h)': 26
        },
        {
            'city': 'rome',
            'temperature (24h)': 21
        },
    ]]
Example #21
def test_deduplicate():
    from dataflows import Flow, deduplicate, set_primary_key

    a = [
        {
            'a': 1,
            'b': 3,
            'c': 'First'
        },
        {
            'a': 2,
            'b': 3,
            'c': 'First'
        },
        {
            'a': 1,
            'b': 3,
            'c': '!First'
        },
        {
            'a': 1,
            'b': 2,
            'c': 'First'
        },
        {
            'a': 2,
            'b': 3,
            'c': '!First'
        },
    ]

    f = Flow(
        a,
        set_primary_key(['a', 'b']),
        deduplicate(),
    )
    results, _, _ = f.results()
    assert set(x['c'] for x in results[0]) == {'First'}
Example #22
def test_add_metadata():
    from dataflows import Flow, add_metadata
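    # `data` is a shared fixture in the original test module; any rows work
    # here since only package metadata is asserted.
    data = [dict(x=1, y='a'), dict(x=2, y='b'), dict(x=3, y='c')]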
    f = Flow(data, add_metadata(author='Adam Kariv'))
    _, dp, _ = f.results()
    assert dp.descriptor['author'] == 'Adam Kariv'
Example #23
def test_unpivot_any_resources():
    from dataflows import Flow, unpivot, validate
    data1 = [
        dict(
            [('name', 'ike{}'.format(i))] +
            [(str(year), year + i) for year in range(1990, 2020, 10)]
        )
        for i in range(5)
    ]
    data2 = [
        dict(
            [('city', 'mike{}'.format(i))] +
            [(str(year), year + i) for year in range(2050, 2080, 10)]
        )
        for i in range(5)
    ]
    f = Flow(
        data1,
        data2,
        unpivot(
            [
                dict(
                    name='([0-9]+)',
                    keys=dict(
                        year='\\1'
                    )
                )
            ],
            [
                dict(
                    name='year',
                    type='integer'
                )
            ],
            dict(
                name='amount',
                type='integer'
            )
        ),
        validate()
    )
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['name', 'year', 'amount'], r))
        for r in
        [
            ['ike0', 1990, 1990],
            ['ike0', 2000, 2000],
            ['ike0', 2010, 2010],
            ['ike1', 1990, 1991],
            ['ike1', 2000, 2001],
            ['ike1', 2010, 2011],
            ['ike2', 1990, 1992],
            ['ike2', 2000, 2002],
            ['ike2', 2010, 2012],
            ['ike3', 1990, 1993],
            ['ike3', 2000, 2003],
            ['ike3', 2010, 2013],
            ['ike4', 1990, 1994],
            ['ike4', 2000, 2004],
            ['ike4', 2010, 2014],
        ]
    ]
    assert results[1] == [
        dict(zip(['city', 'year', 'amount'], r))
        for r in
        [
            ['mike0', 2050, 2050],
            ['mike0', 2060, 2060],
            ['mike0', 2070, 2070],
            ['mike1', 2050, 2051],
            ['mike1', 2060, 2061],
            ['mike1', 2070, 2071],
            ['mike2', 2050, 2052],
            ['mike2', 2060, 2062],
            ['mike2', 2070, 2072],
            ['mike3', 2050, 2053],
            ['mike3', 2060, 2063],
            ['mike3', 2070, 2073],
            ['mike4', 2050, 2054],
            ['mike4', 2060, 2064],
            ['mike4', 2070, 2074],
        ]
    ]
Example #24
def test_sort_rows_number():
    from dataflows import Flow, sort_rows

    f = Flow(
        [
            {
                'a': 0.1
            },
            {
                'a': -3
            },
            {
                'a': -4
            },
            {
                'a': 10
            },
            {
                'a': 8
            },
            {
                'a': 0
            },
            {
                'a': -1000000
            },
            {
                'a': 1000000
            },
            {
                'a': -0.1
            },
            {
                'a': -0.2
            },
            {
                'a': 0.2
            },
            {
                'a': -1000001
            },
            {
                'a': 1000001
            },
            {
                'a': 6
            },
            {
                'a': -10
            },
            {
                'a': -0.001
            },
            {
                'a': 0.001
            },
            {
                'a': 1
            },
            {
                'a': -1
            },
        ],
        sort_rows(key='{a}'),
    )
    results, _, _ = f.results()
    assert list(results[0]) == [
        {
            'a': -1000001
        },
        {
            'a': -1000000
        },
        {
            'a': -10
        },
        {
            'a': -4
        },
        {
            'a': -3
        },
        {
            'a': -1
        },
        {
            'a': -0.2
        },
        {
            'a': -0.1
        },
        {
            'a': -0.001
        },
        {
            'a': 0
        },
        {
            'a': 0.001
        },
        {
            'a': 0.1
        },
        {
            'a': 0.2
        },
        {
            'a': 1
        },
        {
            'a': 6
        },
        {
            'a': 8
        },
        {
            'a': 10
        },
        {
            'a': 1000000
        },
        {
            'a': 1000001
        },
    ]
Example #25
def test_example_4():
    from dataflows import Flow, set_type
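    # country_population() is assumed to be a generator defined elsewhere in
    # the test module; it yields rows that include a 'population' field.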

    f = Flow(country_population(),
             set_type('population', type='number', groupChar=','))
    data, dp, _ = f.results()
Example #26
from pprint import pprint
from dataflows import Flow, load, unpivot

# Select the unpivoting fields
unpivoting_fields = [{'name': r'treatment_(\w)', 'keys': {'treatment': r'\1'}}]

# The newly created key column will be 'treatment' with type 'string':
extra_keys = [{'name': 'treatment', 'type': 'string'}]

# And values will be placed in the 'result' column with type 'string':
extra_value = {'name': 'result', 'type': 'string'}

# Run flow
flow = Flow(load('layouts/wide.csv'),
            unpivot(unpivoting_fields, extra_keys, extra_value))
results, package, stats = flow.results()
print('[Data]\n')
pprint(results[0])
print('\n[Meta]\n')
pprint(package.descriptor)
Example #27
def test_example_3():
    from dataflows import Flow, printer
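    # country_population() is assumed to be a generator defined elsewhere in
    # the test module; it yields rows of country/population pairs.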

    f = Flow(country_population())
    data, *_ = f.results()
Example #28
def test_force_temporal_format():
    import datetime
    from dataflows import Flow, load, update_resource, dump_to_path

    # Dump
    Flow(
        load('data/temporal.csv',
             name='temporal',
             override_fields={
                 'datetime': {
                     'type': 'datetime',
                     'outputFormat': '%y|%m|%d %H|%M|%S'
                 },
                 'date': {
                     'outputFormat': '%y|%m|%d'
                 },
                 'time': {
                     'outputFormat': '%H|%M|%S'
                 },
             }),
        dump_to_path('out/force_temporal_format',
                     temporal_format_property='outputFormat')).process()

    # Load
    flow = Flow(load('out/force_temporal_format/datapackage.json'))
    data, package, stats = flow.results()

    # Assert
    assert package.descriptor['resources'][0]['schema'] == {
        'fields': [
            {
                'format': 'default',
                'name': 'event',
                'type': 'string'
            },
            {
                'format': '%y|%m|%d %H|%M|%S',
                'name': 'datetime',
                'type': 'datetime'
            },
            {
                'format': '%y|%m|%d',
                'name': 'date',
                'type': 'date'
            },
            {
                'format': '%H|%M|%S',
                'name': 'time',
                'type': 'time'
            },
        ],
        'missingValues': [''],
    }
    assert data == [[{
        'event': 'start',
        'datetime': datetime.datetime(2015, 1, 2, 15, 30, 45),
        'date': datetime.date(2015, 1, 2),
        'time': datetime.time(15, 30, 45),
    }, {
        'event': 'end',
        'datetime': datetime.datetime(2016, 6, 25, 8, 10, 4),
        'date': datetime.date(2016, 6, 25),
        'time': datetime.time(8, 10, 4),
    }]]
Example #29
def test_load_duplicate_headers():
    import pytest
    from dataflows import Flow, load
    flow = Flow(load('data/duplicate_headers.csv'))
    with pytest.raises(ValueError) as excinfo:
        flow.results()
    assert 'duplicate headers' in str(excinfo.value)
Example #30
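The imports below are assumed for this pipeline step; get_client, list_files and get_file are Google Drive helpers defined elsewhere in the project.

import os
import logging
from collections import defaultdict

from dataflows import Flow, load, update_resource, dump_to_path, printer
# `ignore` skips rows that fail schema validation; this import path is
# assumed from the dataflows source layout.
from dataflows.base.schema_validator import ignore
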
def flow(parameters, *_):
    files_dump_to_path = parameters['files_dump_to_path']
    data_dump_to_path = parameters.get('data_dump_to_path')

    def _download_gdrive_data():
        # Stream one row per relevant CSV in the Drive folder, downloading
        # new or changed files as a side effect.
        stats = defaultdict(int)
        file_sources = parameters['file_sources']
        folder_id = parameters['google_drive_csv_folder_id']
        files_dir = os.path.join(files_dump_to_path, "files")
        os.makedirs(files_dir, exist_ok=True)
        client = get_client()
        existing_files = {}
        if os.path.exists(os.path.join(files_dump_to_path,
                                       "datapackage.json")):
            for row in Flow(
                    load(os.path.join(files_dump_to_path,
                                      "datapackage.json"))).results()[0][0]:
                existing_files[row["name"]] = row
        for id, name, version in list_files(client, folder_id):
            source = file_sources.get(name)
            if source:
                assert name.endswith(".csv"), \
                    "only csv file sources are supported"
                stats['relevant_source_files'] += 1
                row = {
                    "id": id,
                    "name": name,
                    "version": version,
                    "source": source,
                    "resource_name":
                        "%s__%s" % (source, stats['relevant_source_files']),
                }
                yield row
                if (os.path.exists(
                        os.path.join(files_dump_to_path, "files", name))
                        and name in existing_files
                        and existing_files[name]["id"] == id
                        and existing_files[name]["version"] == version):
                    logging.info("existing file, will not redownload: %s" %
                                 name)
                else:
                    logging.info("downloading file: %s" % name)
                    get_file(client, id,
                             os.path.join(files_dump_to_path, "files", name))
        if stats['relevant_source_files'] != len(file_sources):
            raise Exception("source files mismatch")

    files_flow = Flow(
        _download_gdrive_data(),
        update_resource(-1,
                        name="gdrive_data_files",
                        path="gdrive_data_files.csv",
                        **{"dpp:streaming": True}),
        dump_to_path(files_dump_to_path), printer())
    data_flow_args = []
    for file_row in files_flow.results()[0][0]:
        data_flow_args += [
            load(os.path.join(files_dump_to_path, "files", file_row["name"]),
                 strip=False,
                 infer_strategy=load.INFER_STRINGS,
                 deduplicate_headers=True,
                 cast_strategy=load.CAST_TO_STRINGS,
                 on_error=ignore,
                 limit_rows=parameters.get("limit_rows"),
                 encoding="utf-8"),
            update_resource(-1,
                            name=file_row["resource_name"],
                            path=file_row["name"],
                            **{"dpp:streaming": True})
        ]
    if data_dump_to_path:
        data_flow_args += [dump_to_path(data_dump_to_path)]
    return Flow(*data_flow_args)