def main_stats(num_files, file_size, download_iterations, download_threads,
               output_dir, only_upload, only_download, **kwargs):
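    # Locate the single download-report-*.csv and upload-report-*.csv in output_dir,
    # then aggregate per-row statistics from each report using dataflows.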
    download_report_filename, upload_report_filename = None, None
    for filename in glob(os.path.join(output_dir, '*.csv')):
        if '/download-report-' in filename:
            assert not download_report_filename
            download_report_filename = filename
        elif '/upload-report-' in filename:
            assert not upload_report_filename
            upload_report_filename = filename
    assert download_report_filename and upload_report_filename
    print('upload_report_filename', upload_report_filename)
    print('download_report_filename', download_report_filename)
    print("Generating upload stats...")
    upload_stats = defaultdict(int)
    df.Flow(df.load(upload_report_filename),
            stats_process_upload_rows(upload_stats)).process()
    print("Generating download stats...")
    download_stats = defaultdict(int)
    df.Flow(df.load(download_report_filename),
            stats_process_download_rows(download_stats, file_size)).process()
    print("Upload Stats")
    pprint(dict(upload_stats))
    print("Download Stats")
    pprint(dict(download_stats))
def flow(*args):
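    # Under datapackage-pipelines the flow is invoked as flow(parameters, datapackage,
    # resources, stats), so more than 3 positional args means we are running inside DPP.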
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql(
            {
                DB_TABLE: {
                    'resource-name': resource_name,
                    'mode': 'update',
                    'update_keys': KEY_FIELDS
                }
            }, DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{'__next_update_days': FILTER_NEXT_UPDATE_DAYS}])
        if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
Example #3
def flow(*_):
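    # The Hebrew string keys below are the source spreadsheet's column headers
    # (service name, short description, service purpose, providing organizational unit,
    # target population, intervention domain, intervention mode); the literal publisher
    # and kind_he values translate to "Ministry of Welfare" and "social service".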
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [dict(
                year=2019,
                unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
            )]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare'),
    ).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
Example #4
def join_unique_records(*args):
    is_dpp = len(args) > 3
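    # Join the Google Drive search-import fields (prefixed with 'gd_') onto the
    # unique_records resource, matching rows on migdar_id.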
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id',
                 type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import',
             source_key=['migdar_id'],
             target_name='unique_records',
             target_key=['migdar_id'],
             fields={
                 f'gd_{field}': {
                     'name': field
                 }
                 for field in SEARCH_IMPORT_FIELD_NAMES
             },
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
Example #5
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):

        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)

        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))

        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
Example #6
def flow():
    load_steps = (load('data/committees/kns_committee/datapackage.json',
                       resources=['kns_committee']),
                  load('data/members/mk_individual/datapackage.json',
                       resources=['mk_individual_positions']),
                  load('data/people/committees/meeting-attendees/datapackage.json',
                       resources=['kns_committeesession']))
    # Wrap the load steps in a cache step; keep it as a 1-tuple so it can be
    # concatenated with (update_meetings,) below.
    load_steps = (cache(load_steps, cache_path='.cache/web_ui/meetings_load_steps'),)
    return Flow(*load_steps + (update_meetings,))
def decp_processing():
    flow = Flow(

        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be dropped
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts themselves, without awardee data
        # (note: these print() calls run when the Flow is built, not as pipeline steps)
        print("Creating the contracts-only table..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP data in CSV format, to extract the new records
        # print("Downloading the previous tabular data..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Merging the previous tabular data with today's data..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous awardee-specific data
        print("Loading the awardee data..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Writing the data to disk..."),
        dump_to_path("decp"))
    flow.process()
Example #8
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ],
                      resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
Example #9
def test_expected_contact_with_patient():
    print("test_expected_contact_with_patient")
    back_from_abroad_db = [169603, 169632, 169813]
    contact_with_patient_db = [10722, 10715, 10697]
    Flow(
        load_from_db.flow({
            "where": "id in (%s)" % ", ".join(
                map(str, back_from_abroad_db + contact_with_patient_db))
        }),
        add_gps_coordinates.flow({
            "source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["source_fields"],
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34), random.uniform(34, 36), int(street != city)),
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output"
        }),
    ).process()
    contact_with_patient_key = values_to_convert['insulation_status']['contact-with-patient']
    back_from_abroad_key = values_to_convert['insulation_status']['back-from-abroad']
    contact_with_patient_array = []
    back_from_abroad_array = []
    counts = {"contact_with_patient": 0, "back_from_abroad": 0}

    def _test(row):
        if int(row["isolation"]) == contact_with_patient_key:
            counts["contact_with_patient"] += 1
            contact_with_patient_array.append(int(row["id"]))
        if int(row["isolation"]) == back_from_abroad_key:
            assert int(row["id"]) in back_from_abroad_db
            counts["back_from_abroad"] += 1
            back_from_abroad_array.append(int(row["id"]))

    Flow(
        load(
            'data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv'
        ),
        load(
            'data/corona_data_collector/destination_output/corona_bot_answers_22_3_2020_with_coords.csv'
        ),
        _test,
    ).process()
    assert 3 == counts["contact_with_patient"], str(counts)
    assert 3 == counts["back_from_abroad"], str(counts)
    assert set(back_from_abroad_array) == set(back_from_abroad_db)
    assert set(contact_with_patient_array) == set(contact_with_patient_db)
    print("OK")
Example #10
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series', title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
Example #11
def test_exception_information_multiple_processors_last_errored():
    from dataflows import Flow, load, exceptions
    flow = Flow(
        load('data/academy.csv'),
        load('data/bad-path2.csv'),
    )
    with pytest.raises(exceptions.ProcessorError) as excinfo:
        flow.results()
    assert str(excinfo.value.cause).startswith(
        "Failed to load source 'data/bad-path2.csv' and options")
    assert str(excinfo.value.cause).endswith(
        ": [Errno 2] No such file or directory: 'data/bad-path2.csv'")
    assert excinfo.value.processor_name == 'load'
    assert excinfo.value.processor_object.load_source == 'data/bad-path2.csv'
    assert excinfo.value.processor_position == 2
Example #12
def test_load_name_path():
    from dataflows import load

    dp, *_ = Flow(load('data/beatles_age.json', name='foo'),
                  load('data/beatles_age.csv')).process()

    print(dp.descriptor['resources'])

    res0 = dp.resources[0]
    res1 = dp.resources[1]

    assert res0.name == 'foo'
    assert res0.descriptor['path'] == 'foo.json'
    assert res1.name == 'beatles_age'
    assert res1.descriptor['path'] == 'beatles_age.csv'
Example #13
def test_change_acl_on_s3_no_path_provided(s3_client, bucket):

    # Prepare paths
    paths = [
        'my/private/datasets/file_1.csv',
        'my/private/datasets/file_2.csv',
    ]

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
def get_updated_sources():
    import requests
    from pyquery import PyQuery as pq
    URL = 'https://mr.gov.il/ilgstorefront/he/news/details/230920201036'
    sources = []

    page = pq(requests.get(URL).text)
    anchors = page.find('a')
    for anchor in anchors:
        anchor = pq(anchor)
        href = anchor.attr('href')
        if href and '.zip' in href:
            sources.append(href + '#.xlsx')
    sources = [
        DF.load(source,
                format='excel-xml',
                encoding='utf8',
                bytes_sample_size=0) for source in sources
    ]
    if len(sources) != 2:
        return DF.Flow(
            data_gov_il_resource.flow(tenders),
            data_gov_il_resource.flow(exemptions),
        )
    else:
        return DF.Flow(*sources)
Example #15
def test_change_acl_on_s3_handles_more_than_1000_files(s3_client, bucket):

    # Prepare paths
    paths = []
    for index in range(1, 1101):
        path = 'my/private/datasets/file_%s.csv' % index
        paths.append(path)

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
Example #16
def flow(parameters):
    _from = parameters.pop('from')

    num_resources = 0

    def count_resources():
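        # Record how many resources the package has before load() adds new ones.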
        def func(package):
            nonlocal num_resources
            num_resources = len(package.pkg.resources)
            yield package.pkg
            yield from package
        return func

    def mark_streaming(_from):
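        # Mark every resource added after the initial count as streamed from `_from`
        # (PROP_STREAMING / PROP_STREAMED_FROM are presumably the datapackage-pipelines
        # 'dpp:streaming' / 'dpp:streamedFrom' descriptor properties).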
        def func(package):
            for i in range(num_resources, len(package.pkg.resources)):
                package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMING, True)
                package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMED_FROM,  _from)
            yield package.pkg
            yield from package
        return func

    return Flow(
        count_resources(),
        load(_from, **parameters),
        mark_streaming(_from),
    )
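A minimal usage sketch for the wrapper above; the CSV path and the enclosing Flow are
illustrative assumptions, not part of the original example:

from dataflows import Flow, printer

Flow(
    flow({'from': 'data/example.csv'}),  # hypothetical input file
    printer(),
).process()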
Example #17
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string', lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations'),
    ).results()[0][0]
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name',
                             'string',
                             c['name'],
                             resources=c['name']),
                DF.add_field('__title',
                             'string',
                             get_title(c['title']),
                             resources=c['name']),
            ) for c in configuration
        ],
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
Example #20
def datarecords(kind):
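    # Fetch all datarecords of the given kind from the data-input.obudget.org API and
    # return an iterator over their 'value' payloads.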
    return map(
        lambda r: r['value'],
        DF.Flow(
            DF.load(f'https://data-input.obudget.org/api/datarecords/{kind}',
                    format='json',
                    property='result')).results()[0][0])
Example #21
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load(
            'https://migdar-internal-search.odata.org.il/__data/search_import/index.csv',
            encoding='utf-8',
            http_session=get_migdar_session()),
        update_resource('index',
                        name='search_import_index',
                        path='search_import_index.csv'),
        load_from_gdrive_files,
        update_resource('search_import_index',
                        name='search_import',
                        path='search_import.csv',
                        schema={
                            'fields': [{
                                'name': n,
                                'type': 'string'
                            } for n in SEARCH_IMPORT_FIELD_NAMES]
                        },
                        **{'dpp:streaming': True}),
        printer(num_rows=20,
                tablefmt='plain' if is_dpp else 'html',
                fields=['migdar_id', 'pubyear', 'title']),
        dump_to_path('data/search_import_from_gdrive'))
Example #22
def operator(name, params):
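    # Dump a database table to CSV with dataflows, then upload the file as a package
    # to the target instance via update_package().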
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']

    print('starting db_fetcher operator')
    print(
        'source_table={} target_instance_name={} target_package_id={} target_organization_id={}'
        .format(source_table, target_instance_name, target_package_id,
                target_organization_id))
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string,
                    table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']), DF.dump_to_path(tempdir)).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])
Example #23
def test_change_acl_on_s3(s3_client, bucket):

    # Prepare paths
    paths = [
        'my/private/datasets/README.md',
        'my/private/datasets/datapackage.json',
        'my/private/datasets/data/mydata.csv',
        'my/public/datasets/data/mydata.csv',
    ]

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Assert all contents are public by default
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 200

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert only public contents are public
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == (200 if 'public' in path else 403)
    def _get_last_runs():
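        # For each configured pipeline id/path, stream its runs_history resource, keep
        # the last row per id, and yield a summary row (sha, error, duration, log URL).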
        runs_history_last_rows = {}
        for id, path in parameters["check_covid19_israel_id_paths"].items():

            def _process_runs_history(rows):
                for row in rows:
                    yield row
                    runs_history_last_rows[id] = row

            Flow(load("%s/runs_history/datapackage.json" % path),
                 _process_runs_history).process()
        for id, row in runs_history_last_rows.items():
            start_time = row["start_time"]
            end_time = datetime.datetime.strptime(row["end_time"],
                                                  '%Y-%m-%dT%H:%M:%S')
            yield {
                "id": id,
                "github_sha1": row["github_sha1"],
                "error": row["error"],
                "start_time": start_time,
                "end_time": end_time,
                "duration_minutes": (end_time - start_time).total_seconds() / 60,
                "log_file": "https://avidcovider-pipelines-data.odata.org.il/data/%s/log_files/%s.log"
                            % (id, start_time.strftime("%Y%m%dT%H%M%S")),
            }
Example #25
def test_dump_to_s3_non_existent_bucket(s3_client, bucket):

    # Delete bucket
    s3_client.delete_bucket(Bucket=bucket)

    # Dump to S3 using the processor
    flow = Flow(
        load('data/data.csv'),
        dump_to_s3(
            bucket=bucket,
            acl='private',
            path='my/datapackage',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Check datapackage.json content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/datapackage.json')
    descriptor = json.loads(response['Body'].read().decode('utf-8'))
    assert descriptor['resources'][0]['schema']['fields'][0]['name'] == 'id'
    assert descriptor['resources'][0]['schema']['fields'][1]['name'] == 'name'

    # Check data.csv content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/data.csv')
    contents = response['Body'].read().decode('utf-8')
    assert contents == 'id,name\r\n1,english\r\n2,中国人\r\n'
Example #26
def test_load_strategy_infer_strings_from_native_types():
    from dataflows import load

    flow = Flow(load(
        'data/beatles_age.json',
        infer_strategy='strings',
    ), )
    data, package, stats = flow.results()
    assert data == [[
        {
            'age': '18',
            'name': 'john'
        },
        {
            'age': '16',
            'name': 'paul'
        },
        {
            'age': '17',
            'name': 'george'
        },
        {
            'age': '22',
            'name': 'ringo'
        },
    ]]
Example #27
def test_load_from_package_resources():
    from dataflows import load

    datapackage = {
        'resources': [{
            'name': 'my-resource-{}'.format(i),
            'path': 'my-resource-{}.csv'.format(i),
            'schema': {
                'fields': [{
                    'name': 'foo',
                    'type': 'string'
                }]
            }
        } for i in range(2)]
    }
    resources = (
        (row for row in [{'foo': 'bar{}'.format(i)}, {'foo': 'baz{}'.format(i)}])
        for i in range(2))

    data, dp, *_ = Flow(
        load((datapackage, resources),
             resources=['my-resource-1']), ).results()

    assert len(dp.resources) == 1
    assert dp.get_resource(
        'my-resource-1').descriptor['path'] == 'my-resource-1.csv'
    assert data[0][1] == {'foo': 'baz1'}
Example #28
def test_sort_rows_decimal():
    from decimal import Decimal
    from dataflows import sort_rows, load

    f = Flow(
        load('data/numbers.csv', cast_strategy=load.CAST_WITH_SCHEMA),
        sort_rows(key='{a}'),
    )
    results, dp, _ = f.results()
    assert list(results[0]) == [{
        'a': Decimal('-1000')
    }, {
        'a': Decimal('-0.5')
    }, {
        'a': Decimal('-0.4')
    }, {
        'a': Decimal('0')
    }, {
        'a': Decimal('1.1')
    }, {
        'a': Decimal('2')
    }, {
        'a': Decimal('10')
    }, {
        'a': Decimal('1000')
    }]
Example #29
def test_load_duplicate_headers_with_deduplicate_headers_flag():
    from dataflows import load
    flow = Flow(load('data/duplicate_headers.csv', deduplicate_headers=True), )
    data, package, stats = flow.results()
    assert package.descriptor['resources'][0]['schema']['fields'] == [
        {
            'name': 'header1',
            'type': 'string',
            'format': 'default'
        },
        {
            'name': 'header2 (1)',
            'type': 'string',
            'format': 'default'
        },
        {
            'name': 'header2 (2)',
            'type': 'string',
            'format': 'default'
        },
    ]
    assert data == [[
        {
            'header1': 'value1',
            'header2 (1)': 'value2',
            'header2 (2)': 'value3'
        },
    ]]
Example #30
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            description=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv",),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
         }
     ],
     views=[
         {
           "name": "graph",
           "title": "Average yield from British Government Securities, 10 year Nominal Par Yield",
           "specType": "simple",
           "spec": {"type": "line","group": "Date","series": ["Rate"]}
         }
     ],
     readme=readme()
 ),
 load(
     load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
     skip_rows=[1],
     headers=['Date', 'Rate'],
     format='csv',
     name='quarterly'
 ),
 load(
     load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
     skip_rows=[1],
     headers=['Year', 'Rate'],
     format='csv',
     name='annual'
 ),
 set_type('Date', resources='quarterly', type='date', format='any'),
 set_type('Rate', resources='quarterly', type='number', description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
 set_type('Year', resources='annual', type='date', format='any'),
 set_type('Rate', resources='annual', type='number', description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
 update_resource('quarterly', **{'path':'data/quarterly.csv', 'dpp:streaming': True}),
            }
        ],
        version="0.2.0",
        views=[
            {
              "name": "graph",
              "title": "VIX - CBOE Volatility Index",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["VIX Close"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path':'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)


def flow(parameters, datapackage, resources, stats):
    return finance_vix


if __name__ == '__main__':
    finance_vix.process()
             "publisher": "core",
             "formats": ["CSV", "JSON"]
         },
         {
             "title": "Natural gas",
             "path": "/core/natural-gas",
             "publisher": "core",
             "formats": ["CSV", "JSON"]
         }
     ],
     version="0.2.0"
 ),
 load(
     load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
     skip_rows=[1, 2, 3, 4, 5, -1],
     headers=['Date', 'Price', 'Empty column'],
     format='csv',
     name='annual'
 ),
 extract_december_rows,
 load(
     load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
     skip_rows=[1, 2, 3, 4, 5, -1],
     headers=['Date', 'Price', 'Empty column'],
     format='csv',
     name='monthly'
 ),
 update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
 update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
 set_type('Date', resources='annual', type='yearmonth'),
 set_type('Price', resources='annual', type='number'),
            }
        ],
        views=[
            {
              "name": "graph",
              "title": "10 year US Government Bond Yields (Monthly granularity)",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i+1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us