def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('https://migdar-internal-search.odata.org.il/__data/search_import/index.csv',
             encoding='utf-8',
             http_session=get_migdar_session()),
        update_resource('index', name='search_import_index',
                        path='search_import_index.csv'),
        load_from_gdrive_files,
        update_resource('search_import_index', name='search_import',
                        path='search_import.csv',
                        schema={'fields': [{'name': n, 'type': 'string'}
                                           for n in SEARCH_IMPORT_FIELD_NAMES]},
                        **{'dpp:streaming': True}),
        printer(num_rows=20, tablefmt='plain' if is_dpp else 'html',
                fields=['migdar_id', 'pubyear', 'title']),
        dump_to_path('data/search_import_from_gdrive'))
def join_unique_records(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id', type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import', source_key=['migdar_id'],
             target_name='unique_records', target_key=['migdar_id'],
             fields={f'gd_{field}': {'name': field}
                     for field in SEARCH_IMPORT_FIELD_NAMES},
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql({DB_TABLE: {'resource-name': resource_name,
                                  'mode': 'update',
                                  'update_keys': KEY_FIELDS}},
                      DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{'__next_update_days': FILTER_NEXT_UPDATE_DAYS}])
        if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'], 'judges_list', ['Tribunal_Code'],
             fields={'Tribunal_Type_Code': {},
                     'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
                     'Tribunal_District_Code': {'name': 'District_Code'},
                     'Tribunal_Name': {'name': 'Name'}}),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1))
def flow(parameters, *_):
    logging.info('Pulling latest code from COVID19-ISRAEL github repo')
    logging.info('COVID19_ISRAEL_REPOSITORY=%s' % os.environ.get('COVID19_ISRAEL_REPOSITORY'))
    logging.info('pulling from origin/master')
    utils.subprocess_call_log(
        ['git', 'config', 'user.email', 'avid-covider-pipelines@localhost'],
        cwd='../COVID19-ISRAEL')
    utils.subprocess_call_log(
        ['git', 'config', 'user.name', 'avid-covider-pipelines'],
        cwd='../COVID19-ISRAEL')
    if utils.subprocess_call_log(['git', 'pull', 'origin', 'master'],
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to git pull')
    sha1 = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                   cwd='../COVID19-ISRAEL').decode().strip()
    return Flow(
        iter([{'sha1': sha1}]),
        update_resource(-1, name='github_pull_covid19_israel',
                        path='github_pull_covid19_israel.csv',
                        **{'dpp:streaming': True}),
        printer(),
        dump_to_path(parameters.get('dump_to_path',
                                    'data/github_pull_covid19_israel')))
def flow(*_):
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean',
                 falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean',
                 falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields(['house_number', 'letter', 'lat', 'lon',
                          'arnona_zones', 'address']),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
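For reference, a minimal sketch of the row shape the DF.join_with_self step above produces: one row per street, with each address's 'item' object aggregated into an array. The street name and coordinates below are made-up placeholders, not values from the cached data.

# Hypothetical example of a grouped row emitted by prepare_locations():
example_location_row = {
    'display': 'Herzl',  # the street_name, exposed as the group's display value
    'items': [           # one 'item' object per address on that street
        {'value': {'lat': 32.08, 'lon': 34.78, 'arnona_zones': 2, 'שם': 'Herzl 1'},
         'display': 'Herzl 1'},
        {'value': {'lat': 32.09, 'lon': 34.79, 'arnona_zones': 2, 'שם': 'Herzl 3'},
         'display': 'Herzl 3'},
    ],
}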
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string', lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                )
            ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def flow(parameters, *_):
    logging.info('Running COVID19-ISRAEL module %s' % parameters['module'])
    mtimes = {}
    sizes = {}
    hashes = {}
    for path in glob('../COVID19-ISRAEL/**', recursive=True):
        if os.path.isfile(path):
            mtimes[path] = os.path.getmtime(path)
            sizes[path] = os.path.getsize(path)
            hashes[path] = get_hash(path)
    if utils.subprocess_call_log(['python', '-u', '-m', parameters['module']],
                                 log_file=parameters.get('log_file'),
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to run COVID19-ISRAEL module %s' % parameters['module'])
    resource_name = parameters.get('resource_name', 'covid19_israel_updated_files')
    dump_to_path_name = parameters.get(
        'dump_to_path',
        'data/run_covid19_israel/last_updated_files/%s' % parameters['module'])
    printer_num_rows = parameters.get('printer_num_rows', 999)
    return Flow(
        get_updated_files(mtimes, sizes, hashes),
        update_resource(-1, name=resource_name, path='%s.csv' % resource_name,
                        **{'dpp:streaming': True}),
        *([printer(num_rows=printer_num_rows)] if printer_num_rows > 0 else []),
        *([dump_to_path(dump_to_path_name)] if dump_to_path_name else []))
def es_dumper(resource_name, revision, path):
    now = time.time()
    return DF.Flow(
        update_pk('doc_id'),
        DF.add_field('revision', 'integer', default=revision),
        DF.add_field('score', 'number', default=1),
        DF.add_field('create_timestamp', 'number', now),
        my_dump_to_es(
            indexes={'migdar__' + resource_name: [{'resource-name': resource_name,
                                                   'revision': revision}]},
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True},
            elasticsearch_options=dict(timeout=60)),
        DF.dump_to_path('data/{}'.format(path)),
        collate(revision),
        my_dump_to_es(
            indexes={'migdar__docs': [{'resource-name': resource_name,
                                       'revision': revision}]},
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True}),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
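A hedged usage sketch: es_dumper returns a Flow that is meant to be appended as a step after whatever flow yields the documents. The source datapackage path, resource name, revision value and dump path below are illustrative assumptions, not taken from the original pipeline.

# Illustrative only - wrap es_dumper as the tail of a document-producing flow:
DF.Flow(
    DF.load('data/publications_for_es/datapackage.json'),   # hypothetical source
    es_dumper('publications', 1, 'publications_in_es'),     # hypothetical arguments
).process()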
def flow(*_):
    print('reading companies...')
    return Flow(
        data_gov_il_resource.flow(companies),
        fix_values(),
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_street_number', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean',
                 falseValues=['לא'], trueValues=['כן']),
        set_type('company_is_mafera', type='boolean',
                 falseValues=['לא'], trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        set_type('company_postal_code', type='string'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
def dump_print_flow(flow, dump_path, num_rows=1, fields=None, checkpoint_name=None):
    return Flow(
        flow,
        checkpoint(checkpoint_name) if checkpoint_name else None,
        dump_to_path(dump_path),
        printer(num_rows=num_rows, fields=fields))
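A minimal usage sketch for dump_print_flow, assuming a datapackage already exists at a hypothetical data/source path; the dump path, field name and checkpoint name below are illustrative only.

from dataflows import Flow, load

dump_print_flow(
    Flow(load('data/source/datapackage.json')),  # any existing flow can be wrapped
    'data/source_dump',                          # hypothetical dump target
    num_rows=3,
    fields=['id'],                               # hypothetical field to print
    checkpoint_name='source_dump',               # optional; omit to skip checkpointing
).process()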
def test_rename_resource2():
    from dataflows import Flow, printer, update_resource

    f = Flow(({'a': x} for x in range(10)),
             update_resource(None, name='renamed'),
             printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['name'] == 'renamed'
def list_instances():
    os.makedirs('data/list_instances', exist_ok=True)
    data = []
    Flow((get_instance_row(instance)
          for instance in ckan_instance_manager.list_instances(full=True)),
         dump_to_json(data),
         dump_to_path('data/list_instances'),
         printer(num_rows=99999)).process()
    with open('data/list_instances.json', 'w') as f:
        json.dump(data, f)
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s',
                     json.dumps(params['dgpConfig'], sort_keys=True,
                                ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()

        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)
        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]
        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s',
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )

        logging.info('Running Flow')
        _, stats = flow.process()
        logging.info('Success')
        return stats
def flow(*_):
    return DF.Flow(
        [
            dict(office=office, kind=kind)
            for office in offices
            for kind in report_kinds
        ],
        do_query(),
        DF.printer(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def main(request_times_api_url):
    metadata = {}
    stats = collections.defaultdict(int)
    instance_stats = collections.defaultdict(int)
    Flow(get_builds(request_times_api_url, stats),
         aggregate_instance_stats(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times')).process()
    Flow(get_instance_stats_data(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times_stats'),
         printer(num_rows=1)).process()
def main(instance_ids_or_names, approve_code):
    instance_ids_or_names = [
        i.strip() for i in instance_ids_or_names.split(',') if i.strip()
    ]
    approve_code = approve_code.strip()
    logs.info(instance_ids_or_names=instance_ids_or_names,
              approve_code=approve_code)
    Flow(delete_instances(instance_ids_or_names, approve_code),
         dump_to_path('data/delete_instances'),
         printer(num_rows=9999)).process()
def test_exception_in_generator():
    from dataflows import Flow, printer, exceptions

    def generator():
        for i in range(5):
            raise MyException()
            yield {"i": i}

    with pytest.raises(exceptions.ProcessorError) as excinfo:
        Flow(generator(), printer()).process()
    assert isinstance(excinfo.value.cause, MyException)
def test_update_schema():
    from dataflows import Flow, printer, update_schema, validate

    f = Flow([['a', '-'], ['a', 0]],
             update_schema(-1, missingValues=['-']),
             validate(),
             printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0] == [
        dict(col0='a', col1=None),
        dict(col0='a', col1=0),
    ]
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {'product': 'aFRR_DownActivated'}
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {'product': 'aFRR_UpActivated'}
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}

    flow = Flow(
        # Load input - use the 'datastore_search_sql' API to load the last 1000 rows:
        load('https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
             format="json",
             property="result.records",
             name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant', with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR',
            'aFRR_UpPriceDKK', 'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
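A rough before/after sketch of what the unpivot step in AFRR_Data does to a single record; the field values are made up, and the real records come from the energidataservice API response.

# One wide record before unpivot (illustrative values only):
wide_record = {'HourUTC': '2019-01-01T00:00:00',
               'aFRR_DownActivated': 1.2, 'aFRR_UpActivated': 0.4}
# unpivot(unpivoting_fields, extra_keys, extra_value) reshapes it into one narrow
# row per matched field, keeping the remaining keys and adding 'product'/'amount':
narrow_records = [
    {'HourUTC': '2019-01-01T00:00:00', 'product': 'aFRR_DownActivated', 'amount': 1.2},
    {'HourUTC': '2019-01-01T00:00:00', 'product': 'aFRR_UpActivated', 'amount': 0.4},
]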
def main(package_url):
    jenkins_user_token = ckan_manager.get_jenkins_token('ckan-cloud-operator-jenkins-creds')
    package_url = package_url.replace(
        'https://', 'https://{}:{}@'.format(*jenkins_user_token))
    stats_rows = []
    Flow(load(package_url),
         aggregate_stats(stats_rows),
         dump_to_path('data/aggregate_access_logs')).process()
    Flow((row for row in stats_rows),
         dump_to_path('data/aggregate_access_logs_stats'),
         printer()).process()
def flow(parameters, *_):
    files_dump_to_path = parameters['files_dump_to_path']
    data_dump_to_path = parameters.get('data_dump_to_path')

    def _download_gdrive_data():
        stats = defaultdict(int)
        file_sources = parameters['file_sources']
        folder_id = parameters['google_drive_csv_folder_id']
        files_dir = os.path.join(files_dump_to_path, "files")
        os.makedirs(files_dir, exist_ok=True)
        client = get_client()
        existing_files = {}
        if os.path.exists(os.path.join(files_dump_to_path, "datapackage.json")):
            for row in Flow(load(os.path.join(files_dump_to_path, "datapackage.json"))).results()[0][0]:
                existing_files[row["name"]] = row
        for id, name, version in list_files(client, folder_id):
            source = file_sources.get(name)
            if source:
                assert name.endswith(".csv"), "only csv file sources are supported"
                stats['relevant_source_files'] += 1
                row = {"id": id, "name": name, "version": version, "source": source,
                       "resource_name": "%s__%s" % (source, stats['relevant_source_files'])}
                yield row
                if (
                    os.path.exists(os.path.join(files_dump_to_path, "files", name))
                    and name in existing_files
                    and existing_files[name]["id"] == id
                    and existing_files[name]["version"] == version
                ):
                    logging.info("existing file, will not redownload: %s" % name)
                else:
                    logging.info("downloading file: %s" % name)
                    get_file(client, id, os.path.join(files_dump_to_path, "files", name))
        if stats['relevant_source_files'] != len(file_sources):
            raise Exception("source files mismatch")

    files_flow = Flow(
        _download_gdrive_data(),
        update_resource(-1, name="gdrive_data_files", path="gdrive_data_files.csv",
                        **{"dpp:streaming": True}),
        dump_to_path(files_dump_to_path),
        printer()
    )

    data_flow_args = []
    for file_row in files_flow.results()[0][0]:
        data_flow_args += [
            load(os.path.join(files_dump_to_path, "files", file_row["name"]),
                 strip=False, infer_strategy=load.INFER_STRINGS,
                 deduplicate_headers=True, cast_strategy=load.CAST_TO_STRINGS,
                 on_error=ignore, limit_rows=parameters.get("limit_rows"),
                 encoding="utf-8"),
            update_resource(-1, name=file_row["resource_name"], path=file_row["name"],
                            **{"dpp:streaming": True})
        ]
    if data_dump_to_path:
        data_flow_args += [
            dump_to_path(data_dump_to_path)
        ]
    return Flow(*data_flow_args)
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError, exceptions

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)),
             set_type('a', type='integer'),
             adder,
             validate(),
             printer())
    with pytest.raises(exceptions.ProcessorError) as excinfo:
        f.process()
    assert isinstance(excinfo.value.cause, ValidationError)
def test_update_resource():
    from dataflows import Flow, printer, update_resource

    f = Flow(*[({k: x} for x in range(10)) for k in 'abcdef'],
             update_resource(['res_1', 'res_3', 'res_5'], source='thewild'),
             printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['source'] == 'thewild'
    assert dp.descriptor['resources'][2]['source'] == 'thewild'
    assert dp.descriptor['resources'][4]['source'] == 'thewild'
def test_exception_in_generator():
    from dataflows import Flow, printer

    class MyException(Exception):
        pass

    def generator():
        for i in range(5):
            raise MyException()
            yield {"i": i}

    with pytest.raises(MyException):
        Flow(generator(), printer()).process()
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series',
                     title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)),
             set_type('a', type='integer'),
             adder,
             validate(),
             printer())
    try:
        _ = f.process()
        assert False
    except ValidationError:
        pass
def get_db_test_row(version=None, field_name=None, value=None, where=None,
                    show_fields=None, limit_rows=10, db_dump_to_path=None):
    if not where:
        where = []
        if version:
            where.append("data->>'version' = '%s'" % version)
        if field_name and value:
            where.append("data->>'%s' = '%s'" % (field_name, value))
        where = " and ".join(where)
    if not show_fields:
        if field_name:
            show_fields = [field_name]
        else:
            show_fields = []
    Flow(
        load_from_db.flow({
            "where": where,
            "limit_rows": limit_rows,
            **({"dump_to_path": db_dump_to_path} if db_dump_to_path else {}),
        }),
        add_gps_coordinates.flow({
            "source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["source_fields"],
            "workplace_source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["workplace_source_fields"],
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34), random.uniform(34, 36), int(street != city))
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output"
        }),
        printer(fields=["__id", "__created", *show_fields]),
    ).process()