Example #1
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load(
            'https://migdar-internal-search.odata.org.il/__data/search_import/index.csv',
            encoding='utf-8',
            http_session=get_migdar_session()),
        update_resource('index',
                        name='search_import_index',
                        path='search_import_index.csv'),
        load_from_gdrive_files,
        update_resource('search_import_index',
                        name='search_import',
                        path='search_import.csv',
                        schema={
                            'fields': [{
                                'name': n,
                                'type': 'string'
                            } for n in SEARCH_IMPORT_FIELD_NAMES]
                        },
                        **{'dpp:streaming': True}),
        printer(num_rows=20,
                tablefmt='plain' if is_dpp else 'html',
                fields=['migdar_id', 'pubyear', 'title']),
        dump_to_path('data/search_import_from_gdrive'))
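All of these examples follow the same dataflows pattern: a Flow chains a data source with processing steps, then .process() runs it for its side effects, or .results() also collects the rows. A minimal, self-contained sketch of that pattern (the resource name 'numbers' is illustrative):

from dataflows import Flow, update_resource, printer

results, dp, stats = Flow(
    ({'n': i} for i in range(3)),         # any iterable of dicts acts as a source
    update_resource(-1, name='numbers'),  # -1 addresses the most recently added resource
    printer(),
).results()
assert dp.descriptor['resources'][0]['name'] == 'numbers'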
Example #2
def join_unique_records(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id',
                 type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import',
             source_key=['migdar_id'],
             target_name='unique_records',
             target_key=['migdar_id'],
             fields={
                 f'gd_{field}': {
                     'name': field
                 }
                 for field in SEARCH_IMPORT_FIELD_NAMES
             },
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
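A minimal sketch of the join step used above, with two illustrative in-memory resources: join consumes the source resource and copies the mapped fields onto matching rows of the target resource (with full=False, target rows without a match are dropped):

from dataflows import Flow, join, update_resource, printer

Flow(
    [{'city_id': 10, 'city': 'Haifa'}],
    update_resource(-1, name='cities'),   # source resource, consumed by the join
    [{'id': 1, 'city_id': 10}],
    update_resource(-1, name='people'),   # target resource, gains the mapped field
    join(source_name='cities', source_key=['city_id'],
         target_name='people', target_key=['city_id'],
         fields={'city': {'name': 'city'}},  # target field name <- source field
         full=False),
    printer(),
).process()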
Example #3
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql(
            {
                DB_TABLE: {
                    'resource-name': resource_name,
                    'mode': 'update',
                    'update_keys': KEY_FIELDS
                }
            }, DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{
            '__next_update_days': FILTER_NEXT_UPDATE_DAYS
        }]) if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
Example #4
def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'],
             'judges_list', ['Tribunal_Code'],
             fields={
                 'Tribunal_Type_Code': {},
                 'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
                 'Tribunal_District_Code': {'name': 'District_Code'},
                 'Tribunal_Name': {'name': 'Name'}
             }),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1))
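checkpoint, used heavily above, caches a resource on disk (under .checkpoints/<name> by default) so that re-running the flow resumes from the cached data instead of recomputing the earlier steps; a minimal sketch:

from dataflows import Flow, checkpoint, printer

Flow(
    [{'n': 1}],
    checkpoint('demo'),  # first run writes .checkpoints/demo; later runs load from it
    printer(),
).process()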
Example #5
def flow(parameters, *_):
    logging.info('Pulling latest code from COVID19-ISRAEL github repo')
    logging.info('COVID19_ISRAEL_REPOSITORY=%s' %
                 os.environ.get('COVID19_ISRAEL_REPOSITORY'))
    logging.info('pulling from origin/master')
    utils.subprocess_call_log(
        ['git', 'config', 'user.email', 'avid-covider-pipelines@localhost'],
        cwd='../COVID19-ISRAEL')
    utils.subprocess_call_log(
        ['git', 'config', 'user.name', 'avid-covider-pipelines'],
        cwd='../COVID19-ISRAEL')
    if utils.subprocess_call_log(['git', 'pull', 'origin', 'master'],
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to git pull')
    sha1 = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                   cwd='../COVID19-ISRAEL').decode().strip()
    return Flow(
        iter([{
            'sha1': sha1
        }]),
        update_resource(-1,
                        name='github_pull_covid19_israel',
                        path='github_pull_covid19_israel.csv',
                        **{'dpp:streaming': True}),
        printer(),
        dump_to_path(
            parameters.get('dump_to_path', 'data/github_pull_covid19_israel')))
Example #6
def flow(*_):
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['כן']),
        set_type('company_is_mafera',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True},
                        resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
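A minimal sketch of the set_type boolean casting used above: the Hebrew markers 'כן' ('yes') and 'לא' ('no') are declared as the field's true/false values and cast accordingly:

from dataflows import Flow, set_type, printer

Flow(
    [{'flag': 'כן'}, {'flag': 'לא'}],
    set_type('flag', type='boolean', trueValues=['כן'], falseValues=['לא']),
    printer(),
).process()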
Example #7
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string', lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
Example #8
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                       r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [dict(
                year=2019,
                unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
            )]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
Example #9
def flow(parameters, *_):
    logging.info('Running COVID19-ISRAEL module %s' % parameters['module'])
    mtimes = {}
    sizes = {}
    hashes = {}
    for path in glob('../COVID19-ISRAEL/**', recursive=True):
        if os.path.isfile(path):
            mtimes[path] = os.path.getmtime(path)
            sizes[path] = os.path.getsize(path)
            hashes[path] = get_hash(path)
    if utils.subprocess_call_log(['python', '-u', '-m', parameters['module']],
                                 log_file=parameters.get('log_file'),
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to run COVID19-ISRAEL module %s' %
                        parameters['module'])
    resource_name = parameters.get('resource_name',
                                   'covid19_israel_updated_files')
    dump_to_path_name = parameters.get(
        'dump_to_path',
        'data/run_covid19_israel/last_updated_files/%s' % parameters['module'])
    printer_num_rows = parameters.get('printer_num_rows', 999)
    return Flow(
        get_updated_files(mtimes, sizes, hashes),
        update_resource(-1,
                        name=resource_name,
                        path='%s.csv' % resource_name,
                        **{'dpp:streaming': True}),
        *([printer(
            num_rows=printer_num_rows)] if printer_num_rows > 0 else []),
        *([dump_to_path(dump_to_path_name)] if dump_to_path_name else []))
Example #10
def es_dumper(resource_name, revision, path):
    now = time.time()
    return DF.Flow(
        update_pk('doc_id'),
        DF.add_field('revision', 'integer', default=revision),
        DF.add_field('score', 'number', default=1),
        DF.add_field('create_timestamp', 'number', now),
        my_dump_to_es(
            indexes={
                'migdar__' + resource_name: [{
                    'resource-name': resource_name,
                    'revision': revision
                }]
            },
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True},
            elasticsearch_options=dict(timeout=60)),
        DF.dump_to_path('data/{}'.format(path)),
        collate(revision),
        my_dump_to_es(
            indexes={
                'migdar__docs': [{
                    'resource-name': resource_name,
                    'revision': revision
                }]
            },
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True}),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
Example #11
def flow(*_):
    print('reading companies...')
    return Flow(
        data_gov_il_resource.flow(companies),
        fix_values(),
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_street_number', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['כן']),
        set_type('company_is_mafera',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        set_type('company_postal_code', type='string'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True},
                        resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
Example #12
def dump_print_flow(flow,
                    dump_path,
                    num_rows=1,
                    fields=None,
                    checkpoint_name=None):
    return Flow(flow,
                checkpoint(checkpoint_name) if checkpoint_name else None,
                dump_to_path(dump_path),
                printer(num_rows=num_rows, fields=fields))
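A hypothetical invocation of this helper (the path and data are illustrative, and the imports are assumed from the example above); note that Flow accepts None as a step and skips it, which is what makes the optional checkpoint work:

dump_print_flow(
    Flow(({'a': i} for i in range(5))),
    'data/example_dump',
    num_rows=5,
).process()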
Example #13
def test_rename_resource2():
    from dataflows import Flow, printer, update_resource

    f = Flow(({'a': x} for x in range(10)),
             update_resource(None, name='renamed'),
             printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['name'] == 'renamed'
Example #14
def list_instances():
    os.makedirs('data/list_instances', exist_ok=True)
    data = []
    Flow(
        (get_instance_row(instance)
         for instance in ckan_instance_manager.list_instances(full=True)),
        dump_to_json(data),
        dump_to_path('data/list_instances'),
        printer(num_rows=99999),
    ).process()
    with open('data/list_instances.json', 'w') as f:
        json.dump(data, f)
Example #15
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ],
                      resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
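The example above relies on dataflows treating field-name arguments as regular expressions by default, which is what lets set_type("Anzahl.*") and the delete_fields patterns match several columns at once; a minimal sketch:

from dataflows import Flow, delete_fields, printer

Flow(
    [{'Anzahl A': 1, 'Anzahl B': 2, 'Name': 'x'}],
    delete_fields(['Anzahl.*']),  # regex matching removes both "Anzahl" columns
    printer(),
).process()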
Example #16
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s', 
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()
        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)

        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]

        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s', 
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )
        logging.info('Running Flow')
        _, stats = flow.process()

        logging.info('Success')

        return stats
Example #17
def flow(*_):
    return DF.Flow(
        [
            dict(office=office, kind=kind) for office in offices
            for kind in report_kinds
        ],
        do_query(),
        DF.printer(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
Example #18
def main(request_times_api_url):
    metadata = {}
    stats = collections.defaultdict(int)
    instance_stats = collections.defaultdict(int)
    Flow(get_builds(request_times_api_url, stats),
         aggregate_instance_stats(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times')).process()
    Flow(get_instance_stats_data(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times_stats'),
         printer(num_rows=1)).process()
Example #19
def main(instance_ids_or_names, approve_code):
    instance_ids_or_names = [
        i.strip() for i in instance_ids_or_names.split(',') if i.strip()
    ]
    approve_code = approve_code.strip()
    logs.info(instance_ids_or_names=instance_ids_or_names,
              approve_code=approve_code)
    Flow(delete_instances(instance_ids_or_names, approve_code),
         dump_to_path('data/delete_instances'),
         printer(num_rows=9999)).process()
Example #20
def test_exception_in_generator():
    import pytest
    from dataflows import Flow, printer, exceptions

    class MyException(Exception):
        pass

    def generator():
        for i in range(5):
            raise MyException()
            yield {"i": i}

    with pytest.raises(exceptions.ProcessorError) as excinfo:
        Flow(generator(), printer()).process()
    assert isinstance(excinfo.value.cause, MyException)
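This variant targets a newer dataflows release, which wraps errors raised inside a flow in exceptions.ProcessorError and exposes the original exception as its .cause attribute; Example #27 below is the same test written for an older release, where the exception propagates unwrapped.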
Example #21
def test_update_schema():
    from dataflows import Flow, printer, update_schema, validate

    f = Flow([['a', '-'], ['a', 0]], update_schema(-1, missingValues=['-']),
             validate(), printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0] == [
        dict(col0='a', col1=None),
        dict(col0='a', col1=0),
    ]
Example #22
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {
            'product': 'aFRR_DownActivated'
        }
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {
            'product': 'aFRR_UpActivated'
        }
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 1000 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant',
                 with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
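A minimal sketch of the unpivot step used above: each listed measure column is folded into a (product, amount) pair, producing one output row per matched input column:

from dataflows import Flow, unpivot, printer

Flow(
    [{'hour': 1, 'up': 10, 'down': 3}],
    unpivot([{'name': 'up', 'keys': {'product': 'up'}},
             {'name': 'down', 'keys': {'product': 'down'}}],
            [{'name': 'product', 'type': 'string'}],   # extra key field
            {'name': 'amount', 'type': 'integer'}),    # extra value field
    printer(),
).process()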
Example #23
def main(package_url):
    jenkins_user_token = ckan_manager.get_jenkins_token(
        'ckan-cloud-operator-jenkins-creds')
    package_url = package_url.replace(
        'https://', 'https://{}:{}@'.format(*jenkins_user_token))
    stats_rows = []
    Flow(load(package_url), aggregate_stats(stats_rows),
         dump_to_path('data/aggregate_access_logs')).process()
    Flow((row for row in stats_rows),
         dump_to_path('data/aggregate_access_logs_stats'),
         printer()).process()
Example #24
def flow(parameters, *_):
    files_dump_to_path = parameters['files_dump_to_path']
    data_dump_to_path = parameters.get('data_dump_to_path')

    def _download_gdrive_data():
        stats = defaultdict(int)
        file_sources = parameters['file_sources']
        folder_id = parameters['google_drive_csv_folder_id']
        files_dir = os.path.join(files_dump_to_path, "files")
        os.makedirs(files_dir, exist_ok=True)
        client = get_client()
        existing_files = {}
        if os.path.exists(os.path.join(files_dump_to_path, "datapackage.json")):
            for row in Flow(load(os.path.join(files_dump_to_path, "datapackage.json"))).results()[0][0]:
                existing_files[row["name"]] = row
        for id, name, version in list_files(client, folder_id):
            source = file_sources.get(name)
            if source:
                assert name.endswith(".csv"), "only csv file sources are supported"
                stats['relevant_source_files'] += 1
                row = {"id": id, "name": name, "version": version, "source": source, "resource_name": "%s__%s" % (source, stats['relevant_source_files'])}
                yield row
                if (
                        os.path.exists(os.path.join(files_dump_to_path, "files", name))
                        and name in existing_files and existing_files[name]["id"] == id and existing_files[name]["version"] == version
                ):
                    logging.info("existing file, will not redownload: %s" % name)
                else:
                    logging.info("downloading file: %s" % name)
                    get_file(client, id, os.path.join(files_dump_to_path, "files", name))
        if stats['relevant_source_files'] != len(file_sources):
            raise Exception("source files mismatch")

    files_flow = Flow(
        _download_gdrive_data(),
        update_resource(-1, name="gdrive_data_files", path="gdrive_data_files.csv", **{"dpp:streaming": True}),
        dump_to_path(files_dump_to_path),
        printer()
    )
    data_flow_args = []
    for file_row in files_flow.results()[0][0]:
        data_flow_args += [
            load(os.path.join(files_dump_to_path, "files", file_row["name"]),
                 strip=False, infer_strategy=load.INFER_STRINGS, deduplicate_headers=True,
                 cast_strategy=load.CAST_TO_STRINGS, on_error=ignore, limit_rows=parameters.get("limit_rows"),
                 encoding="utf-8"),
            update_resource(-1, name=file_row["resource_name"], path=file_row["name"], **{"dpp:streaming": True})
        ]
    if data_dump_to_path:
        data_flow_args += [
            dump_to_path(data_dump_to_path)
        ]
    return Flow(*data_flow_args)
Example #25
def test_validate():
    import pytest
    from dataflows import Flow, validate, set_type, printer, ValidationError, exceptions

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)), set_type('a', type='integer'),
             adder, validate(), printer())

    with pytest.raises(exceptions.ProcessorError) as excinfo:
        f.process()
    assert isinstance(excinfo.value.cause, ValidationError)
Example #26
def test_update_resource():
    from dataflows import Flow, printer, update_resource

    f = Flow(*[({k: x} for x in range(10)) for k in 'abcdef'],
             update_resource(['res_1', 'res_3', 'res_5'], source='thewild'),
             printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['source'] == 'thewild'
    assert dp.descriptor['resources'][2]['source'] == 'thewild'
    assert dp.descriptor['resources'][4]['source'] == 'thewild'
Example #27
def test_exception_in_generator():
    import pytest
    from dataflows import Flow, printer

    class MyException(Exception):
        pass

    def generator():
        for i in range(5):
            raise MyException()
            yield {"i": i}

    with pytest.raises(MyException):
        Flow(generator(), printer()).process()
Example #28
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series', title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
Example #29
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)), set_type('a', type='integer'),
             adder, validate(), printer())
    try:
        _ = f.process()
        assert False
    except ValidationError:
        pass
Example #30
def get_db_test_row(version=None,
                    field_name=None,
                    value=None,
                    where=None,
                    show_fields=None,
                    limit_rows=10,
                    db_dump_to_path=None):
    if not where:
        where = []
        if version:
            where.append("data->>'version' = '%s'" % version)
        if field_name and value:
            where.append("data->>'%s' = '%s'" % (field_name, value))
        where = " and ".join(where)
    if not show_fields:
        if field_name:
            show_fields = [field_name]
        else:
            show_fields = []
    Flow(
        load_from_db.flow({
            "where": where,
            "limit_rows": limit_rows,
            **({"dump_to_path": db_dump_to_path} if db_dump_to_path else {}),
        }),
        add_gps_coordinates.flow({
            "source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["source_fields"],
            "workplace_source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["workplace_source_fields"],
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34), random.uniform(34, 36),
                int(street != city)),
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output",
        }),
        printer(fields=["__id", "__created", *show_fields]),
    ).process()