def main():
    """Copy deduplicated rows from ``socrata`` into ``socrata_deduplicated``.

    Fetches the ``edges`` entry of the structure returned by
    ``build_network()``, reads every row of the ``socrata`` table from the
    SQLite database at ``/tmp/open-data.sqlite``, passes the rows and the
    edges to ``dedupe()``, and upserts each dataset it yields into the
    ``socrata_deduplicated`` table.
    """
    target_table = 'socrata_deduplicated'
    network_edges = build_network()['edges']

    db = DumpTruck(dbname='/tmp/open-data.sqlite', adapt_and_convert=True)
    source_rows = db.execute('SELECT * FROM socrata')

    # The placeholder row presumably exists only so the table has an ``id``
    # column before the unique index is created -- TODO confirm against the
    # DumpTruck ``create_table`` semantics.
    db.create_table({'id': 'blah-blah'}, target_table)
    db.create_index(['id'], target_table, if_not_exists=True, unique=True)

    # One upsert per dataset, keyed by the unique ``id`` index created above.
    for unique_dataset in dedupe(source_rows, network_edges):
        db.upsert(unique_dataset, target_table)
Ejemplo n.º 2
0
def test_dedupe():
    """Check ``dedupe()`` on two portals that share one dataset id.

    Both portals list a dataset with id ``'c'`` and the edge list links
    ``'portal1'`` to ``'portal2'``. The test expects five records back, with
    id ``'c'`` present once and carrying the ``'portal2'`` catalog.
    """
    def record(dataset_id, portal):
        # Build one dataset row in the shape passed to dedupe().
        return {'id': dataset_id, 'catalog': portal}

    portal1_records = [record(key, 'portal1') for key in ('a', 'b', 'c')]
    portal2_records = [record(key, 'portal2') for key in ('g', 'h', 'c')]
    links = [('portal1', 'portal2')]

    result = dedupe(portal1_records + portal2_records, links)

    # Expected rows are listed in id order so they can be compared with the
    # id-sorted output below.
    wanted = [
        record('a', 'portal1'),
        record('b', 'portal1'),
        record('c', 'portal2'),
        record('g', 'portal2'),
        record('h', 'portal2'),
    ]

    # Sort by id: the order in which dedupe() returns records is not
    # asserted by this test.
    result_by_id = sorted(result, key=lambda row: row['id'])
    n.assert_list_equal(result_by_id, wanted)