def flow(self):
    flows = [step.flow() for step in self.steps]
    return Flow(*list(filter(lambda x: x is not None, flows)))
def main():
    with tempfile.TemporaryDirectory() as tempdir:
        with open(os.path.join(tempdir, ".netrc"), "w") as f:
            f.write("machine %s\nlogin %s\npassword %s\n" % (DOMAIN, AUTH_USER, AUTH_PASSWORD))
        HOME = os.environ["HOME"]
        os.environ["HOME"] = tempdir
        os.makedirs("data/corona_data_collector/gps_data_cache", exist_ok=True)
        utils.http_stream_download(
            "data/corona_data_collector/gps_data_cache/datapackage.json",
            {"url": "https://%s/data/corona_data_collector/gps_data_cache/datapackage.json" % DOMAIN})
        utils.http_stream_download(
            "data/corona_data_collector/gps_data_cache/gps_data.csv",
            {"url": "https://%s/data/corona_data_collector/gps_data_cache/gps_data.csv" % DOMAIN})
        Flow(
            download_gdrive_data.flow({
                "limit_rows": 50000,
                "files_dump_to_path": "data/corona_data_collector/gdrive_data",
                "google_drive_csv_folder_id": "1pzAyk-uXy__bt1tCX4rpTiPZNmrehTOz",
                "file_sources": {
                    "COVID-19-English.csv": "google",
                    "COVID-19-Russian.csv": "google",
                    "COVID-19-Hebrew.csv": "hebrew_google",
                    "maccabi_updated.csv": "maccabi",
                }
            }),
            load_from_db.flow({
                "where": "(id > 500 and id < 1000) or (id > 180000 and id < 185000) or (id > 600000 and id < 601000) or (id > 640000 and id < 641000) or (id > 670000)",
                # "filter_db_row_callback": _mock_version_28
            }),
            # _mock_gender_other,
            add_gps_coordinates.flow({
                "source_fields": utils.get_parameters_from_pipeline_spec(
                    "pipeline-spec.yaml", "corona_data_collector",
                    "corona_data_collector.add_gps_coordinates")["source_fields"],
                "workplace_source_fields": utils.get_parameters_from_pipeline_spec(
                    "pipeline-spec.yaml", "corona_data_collector",
                    "corona_data_collector.add_gps_coordinates")["workplace_source_fields"],
                "dump_to_path": "data/corona_data_collector/with_gps_data",
                "gps_datapackage_path": "data/corona_data_collector/gps_data_cache",
                "get-coords-callback": lambda street, city: (
                    random.uniform(29, 34), random.uniform(34, 36), int(street != city))
            }),
            export_corona_bot_answers.flow({
                "destination_output": "data/corona_data_collector/corona_bot_answers"
            }),
            export_corona_bot_answers.flow({
                "unsupported": True,
                "destination_output": "data/corona_data_collector/corona_bot_answers_unsupported"
            })
        ).process()
        os.environ["HOME"] = HOME
        subprocess.check_call(
            ["python3", "-m", "src.utils.get_raw_data"],
            cwd="../COVID19-ISRAEL",
            env={
                **os.environ,
                "GOOGLE_SERVICE_ACCOUNT_FILE": os.environ["GOOGLE_SERVICE_ACCOUNT_FILE"],
                "AVIDCOVIDER_LOCAL_PATH": os.getcwd()
            })
        subprocess.check_call(
            ["python3", "-m", "src.utils.preprocess_raw_data"],
            cwd="../COVID19-ISRAEL",
            env={**os.environ})
        logging.info("Great Success!")
for router in yaml.load(routers):
    for route in router['routes']:
        routes.append({'router_name': router['name'], **route})
    yield {
        'name': router['name'],
        'ready': router['ready'],
        'routes': len(router['routes']),
        'deployment_created_at': router['deployment'].get('created_at'),
        'deployment_generation': router['deployment'].get('generation'),
        'deployment_namespace': router['deployment'].get('namespace'),
        'deployment_ready': router['deployment'].get('ready'),
        'type': router['type'],
        'dns': router['dns'],
    }


def get_routes():
    yield from routes


with open('output.html', 'w') as f:
    with redirect_stdout(f):
        Flow(
            get_routers(),
            update_resource('res_1', name='routers', path='routers.csv'),
            get_routes(),
            update_resource('res_2', name='routes', path='routes.csv'),
            dump_to_path(),
            printer(num_rows=9999999, tablefmt='html'),
        ).process()
household_us = Flow(
    add_metadata(
        name="household-income-us-historical",
        title="Income Limits for Each Fifth and Top 5 Percent of All Households: 1967 to 2016",
        description="Households as of March of the following year. Income in current and 2016 CPI-U-RS adjusted dollars.",
        sources=[{
            "path": "https://www2.census.gov",
            "title": "United States Census Bureau"
        }],
        licenses=[{
            "id": "odc-pddl",
            "path": "http://opendatacommons.org/licenses/pddl/",
            "title": "Open Data Commons Public Domain Dedication and License v1.0",
            "name": "open_data_commons_public_domain_dedication_and_license_v1.0"
        }],
        version="0.3.0",
        views=[{
            "name": "comparison-of-upper-limit-of-each-fifth-and-lower-limit-of-top-5-percent",
            "title": "Comparison of upper limit of each fifth and lower limit of top 5 percent (2016 dollars)",
            "resources": ["household-income-us-historical"],
            "specType": "simple",
            "spec": {
                "type": "line",
                "group": "Year",
                "series": ["Lowest", "Second", "Third", "Fourth", "Top 5 percent"]
            }
        }, {
            "name": "lowest-fifth-vs-top-5-percent",
            "title": "Ratio of lower limit of top 5 percent to upper limit of lowest fifth (2016 dollars)",
            "resources": [{
                "name": "household-income-us-historical",
                "transform": [{
                    "type": "formula",
                    "expressions": ["data['Top 5 percent']/data['Lowest']"],
                    "asFields": ["Ratio"]
                }]
            }],
            "specType": "simple",
            "spec": {
                "type": "line",
                "group": "Year",
                "series": ["Ratio"]
            }
        }],
        readme=readme()),
    load(
        load_source='https://www2.census.gov/programs-surveys/cps/tables/time-series/historical-income-households/h01ar.xls',
        format='xls',
        sheet=1,
        encoding='utf-8',
        # Skip the first 6 rows, the rows containing data from 1967 up to last year
        # plus 3 rows after, and finally the last row.
        skip_rows=[i + 1 for i in range(6 + datetime.datetime.now().year - 1966 + 3)] + [-1],
        headers=['Year', 'Number (thousands)', 'Lowest', 'Second', 'Third', 'Fourth', 'Top 5 percent'],
    ),
    find_replace(fields=[{
        'name': 'Year',
        'patterns': [{'find': r'(\s?\(\d+\))|(\.0)', 'replace': ''}]
    }, {
        'name': 'Fourth',
        'patterns': [{'find': r'\+|', 'replace': ''}]
    }], resources=0),
    update_resource(0, **{
        'name': 'household-income-us-historical',
        'path': 'data/household-income-us-historical.csv',
        'dpp:streaming': True
    }),
    set_type('Year', type='year'),
    set_type('^(?!Y).+', type='number'),
    validate())
from avid_covider_pipelines.utils import get_parameters_from_pipeline_spec
from dataflows import printer, Flow, load
from .common import test_corona_bot_answers

logging.basicConfig(level=logging.INFO)

Flow(
    load_from_db.flow({
        "where": "id in (180074, 180075, 676579, 676580)"
    }),
    add_gps_coordinates.flow({
        "source_fields": get_parameters_from_pipeline_spec(
            "pipeline-spec.yaml", "corona_data_collector",
            "corona_data_collector.add_gps_coordinates")["source_fields"],
        "get-coords-callback": lambda street, city: (
            random.uniform(29, 34), random.uniform(34, 36), int(street != city))
    }),
    export_corona_bot_answers.flow({
        "destination_output": "data/corona_data_collector/destination_output"
    }),
    printer(fields=[
        "__id", "__created", "main_age", "medical_staff_member",
        "engagement_source", "alias", "layout"
    ])
).process()

Flow(
    load("data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv"),
    load("data/corona_data_collector/destination_output/corona_bot_answers_29_4_2020_with_coords.csv"),
    test_corona_bot_answers(
        lambda row: (str(row["medical_staff_member"]), str(row["engagement_source"]), str(row["layout"])),
        {
            "180074": ["corona_bot_answers_25_3_2020_with_coords", "", "", ""],
def flow(parameters):
    return Flow(
        load_lazy_json(parameters.get('source')),
        duplicate(parameters.get('source'),
                  parameters.get('target-name'),
                  parameters.get('target-path'),
                  parameters.get('batch_size', 1000)))
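# Usage sketch (not from the original source): the keys below mirror the ones read by
# flow() above; the resource name, target name/path, and batch size are hypothetical,
# and running it depends on the project's load_lazy_json helper.
if __name__ == '__main__':
    Flow(
        flow({
            'source': 'my-resource',            # hypothetical resource to duplicate
            'target-name': 'my-resource-copy',
            'target-path': 'my-resource-copy.csv',
            'batch_size': 500,
        }),
    ).process()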
    'name': 'serie-a',
    'path': 'italym.php',
    'key': 'I1',
    'links': [],
    'dataset-name': 'italian-serie-a',
    'dataset-title': 'Italian Serie A (football)'
}, {
    'name': 'ligue-1',
    'path': 'francem.php',
    'key': 'F1',
    'links': [],
    'dataset-name': 'french-ligue-1',
    'dataset-title': 'French Ligue 1 (football)'
}]

for league in leagues:
    meta = get_league_meta(league)
    processors = get_processors(meta)
    processors.append(set_type('Date', type='date', format='%d/%m/%y'))
    processors.append(dump_to_path(out_path='datasets/' + league['name']))
    processors.append(printer())
    processors = [
        add_metadata(name=league['dataset-name'],
                     title=league['dataset-title'],
                     licenses=licenses,
                     sources=sources,
                     related=related_datasets,
                     readme=readme % league['dataset-title'].replace(' (football)', ''))
    ] + processors
    Flow(*processors).process()
def london_gva(link):
    Flow(load(link, sheet=3),
         filter_gva,
         unpivot(unpivoting_fields, extra_keys, extra_value),
         remove_duplicates,
         set_format_and_name,
         dump_to_path(),
         printer(num_rows=1)).process()
def test_add_metadata():
    from dataflows import add_metadata
    f = Flow(data, add_metadata(author='Adam Kariv'))
    _, dp, _ = f.results()
    assert dp.descriptor['author'] == 'Adam Kariv'
def test_load_empty_headers():
    from dataflows import Flow, load, printer

    def ensure_type(t):
        def func(row):
            assert isinstance(row['a'], t)
        return func

    results, dp, stats = Flow(load('data/empty_headers.csv'),
                              ensure_type(str)).results()
    assert results[0] == [{'a': 1, 'b': 2}, {'a': 2, 'b': 3},
                          {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]
    assert len(dp.resources[0].schema.fields) == 2

    results, dp, stats = Flow(load('data/empty_headers.csv', validate=True),
                              ensure_type(int)).results()
    assert results[0] == [{'a': 1, 'b': 2}, {'a': 2, 'b': 3},
                          {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]

    results, dp, stats = Flow(load('data/empty_headers.csv', force_strings=True),
                              ensure_type(str)).results()
    assert results[0] == [{'a': '1', 'b': '2'}, {'a': '2', 'b': '3'},
                          {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]
    assert len(dp.resources[0].schema.fields) == 2

    results, dp, stats = Flow(load('data/empty_headers.csv', force_strings=True, validate=True),
                              ensure_type(str)).results()
    assert results[0] == [{'a': '1', 'b': '2'}, {'a': '2', 'b': '3'},
                          {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]
    assert len(dp.resources[0].schema.fields) == 2
def parse_dockerfiles():
    gitlab_repos = {}

    def _parse_gitlab_repos(rows):
        if rows.res.name == 'ckan-cloud-instances':
            for row in rows:
                gitlab_repo = row['gitlab_repo']
                if gitlab_repo in gitlab_repos:
                    gitlab_repos[gitlab_repo]['instances'].append(row)
                else:
                    gitlab_repos[gitlab_repo] = {'instances': [row]}
                yield row
        else:
            yield from rows

    def _get_dockerfile_from(dockerfile):
        if dockerfile:
            return [line.replace('FROM ', '') for line in dockerfile.split('\n')
                    if line.startswith('FROM')][0]
        else:
            return None

    def _parse_ckan_extensions(rows):
        if rows.res.name == 'dockerfiles':
            for row in rows:
                row['ckan_exts'] = []
                if row['dockerfile']:
                    for line in row['dockerfile'].split('\n'):
                        if 'https://github.com/' in line and '.git@' in line and '#egg=' in line:
                            ext = line.split('https://github.com/')[1].split('#egg=')[0].replace('.git@', '@')
                            row['ckan_exts'].append(ext)
                            if 'ckanext-s3filestore' in ext:
                                row['ckanext-s3filestore'] = ext
                yield row
        else:
            yield from rows

    def _get_dockerfile_row(gitlab_repo_name, gitlab_repo):
        try:
            dockerfile = CkanGitlab()._get_file(gitlab_repo_name, 'Dockerfile')
        except Exception:
            dockerfile = None
        return {
            'gitlab_repo': gitlab_repo_name,
            'instances': [i['name'] for i in gitlab_repo['instances']],
            'from': _get_dockerfile_from(dockerfile),
            'dockerfile': dockerfile
        }

    def _parse_dockerfiles(package):
        package.pkg.add_resource({
            'name': 'dockerfiles',
            'path': 'dockerfiles.csv',
            'schema': {
                'fields': [{'name': 'gitlab_repo', 'type': 'string'},
                           {'name': 'instances', 'type': 'array'},
                           {'name': 'from', 'type': 'string'},
                           {'name': 'dockerfile', 'type': 'string'}]
            }
        })
        yield package.pkg
        yield from package
        yield (_get_dockerfile_row(gitlab_repo_name, gitlab_repo)
               for gitlab_repo_name, gitlab_repo in gitlab_repos.items())

    return Flow(
        _parse_gitlab_repos,
        _parse_dockerfiles,
        checkpoint('ckan_images_dockerfiles'),
        add_field('ckan_exts', 'array'),
        add_field('ckanext-s3filestore', 'string'),
        _parse_ckan_extensions,
    )
def main_flow(prefix, operator):
    return Flow(
        load(f'data/{prefix}/resources/datapackage.json',
             resources=['ckan-cloud-instances']),
        add_field('gitlab_repo', 'string'),
        get_gitlab_repo,
        parse_dockerfiles(),
    )


if __name__ == '__main__':
    prefix = os.environ['DATAPACKAGE_PREFIX']
    operator = os.environ.get('CKAN_CLOUD_OPERATOR_BIN', 'ckan-cloud-operator')
    Flow(main_flow(prefix, operator),
         printer(num_rows=1, fields=['name', 'image', 'gitlab_repo', 'from']),
         dump_to_path(f'data/{prefix}/ckan_images')).process()
Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(
        source_name='time_series_19-covid-Confirmed',
        source_key=['Province/State', 'Date'],
        source_delete=True,
        target_name='time_series_19-covid-Deaths',
        target_key=['Province/State', 'Date'],
        fields=dict(Confirmed={'name': 'Case', 'aggregate': 'first'})
    ),
    join(
        source_name='time_series_19-covid-Recovered',
        source_key=['Province/State', 'Date'],
        source_delete=True,
        target_name='time_series_19-covid-Deaths',
        target_key=['Province/State', 'Date'],
        fields=dict(Recovered={'name': 'Case', 'aggregate': 'first'})
    ),
    add_computed_field(
        target={'name': 'Deaths', 'type': 'number'},
        operation='format',
        with_='{Case}'
    ),
    delete_fields(['Case']),
    update_resource('time_series_19-covid-Deaths',
                    name='time-series-19-covid-combined',
                    path='time-series-19-covid-combined.csv'),
    dump_to_path()
).results()[0]
        'image': deployment['spec']['template']['spec']['containers'][0]['image'],
        'datastore': '',
        'db': '',
        'solr': '',
        'storage': '',
        'creationTimestamp': str(deployment['metadata']['creationTimestamp']),
        'generation': '',
    }

    return _processor


def main_flow(prefix, operator):
    return Flow(
        cluster_info(operator),
        update_resource(['res_1'], name='cluster-info', path='cluster-info.csv'),
        checkpoint(f'{prefix}-cluster-info'),
        ckan_cloud_instances(operator),
        update_resource(['res_2'], name='ckan-cloud-instances', path='ckan-cloud-instances.csv'),
    )


if __name__ == '__main__':
    prefix = os.environ['DATAPACKAGE_PREFIX']
    operator = os.environ.get('CKAN_CLOUD_OPERATOR_BIN', 'ckan-cloud-operator')
    Flow(
        main_flow(prefix, operator),
        printer(num_rows=1),
        dump_to_path(f'data/{prefix}/resources')
    ).process()
def flow():
    data_path = 'data{}/'.format('_samples' if os.environ.get('KNESSET_DATA_SAMPLES') else '')
    kns_knessetdates_sorted = []
    mk_individual_factions = {}
    vote_discipline = {}
    all_mk_ids = set()
    aggregates = {}

    def get_vote_discipline_mk_ids(vote_id):
        return vote_discipline.get(vote_id, [set(), set()])

    def process_voted_against_majority(rows):
        for row in rows:
            undisciplined_mk_ids, disciplined_mk_ids = vote_discipline.setdefault(
                row['vote_id'], [set(), set()])
            if row['vote_majority'] in ['against', 'pro']:
                if row['voted_against_majority']:
                    undisciplined_mk_ids.add(row['mk_id'])
                else:
                    disciplined_mk_ids.add(row['mk_id'])

    def process_votes(votes):
        for vote in rows_counter('view_vote_rslts_hdr_approved', votes):
            vote_date = vote['vote_date']
            undisciplined_mk_ids, disciplined_mk_ids = get_vote_discipline_mk_ids(vote['id'])
            for mk_id, faction_id in get_mk_faction_ids(all_mk_ids, mk_individual_factions, vote_date):
                knessetdate = get_knessetdate(kns_knessetdates_sorted, vote_date)
                agg = aggregates.setdefault(knessetdate['knesset'], {})\
                    .setdefault(knessetdate['plenum'], {})\
                    .setdefault(knessetdate['assembly'], {})\
                    .setdefault(knessetdate['pagra'], {})\
                    .setdefault(faction_id, {})\
                    .setdefault(mk_id, defaultdict(int))
                if mk_id in undisciplined_mk_ids:
                    agg['undisciplined_votes'] += 1
                elif mk_id in disciplined_mk_ids:
                    agg['disciplined_votes'] += 1
                agg['total_votes'] += 1

    def get_all_aggregates():
        for knesset, aggs in aggregates.items():
            for plenum, aggs in aggs.items():
                for assembly, aggs in aggs.items():
                    for pagra, aggs in aggs.items():
                        for faction_id, aggs in aggs.items():
                            for mk_id, agg in aggs.items():
                                yield (knesset, plenum, assembly, pagra, faction_id, mk_id), agg

    def get_mk_aggregates():
        for agg_key, agg in get_all_aggregates():
            total_votes = agg.get('total_votes', 0)
            if total_votes > 0:
                undisciplined_votes_percent = int(agg.get('undisciplined_votes', 0) / total_votes * 100)
                disciplined_votes_percent = int(agg.get('disciplined_votes', 0) / total_votes * 100)
                knesset, plenum, assembly, pagra, faction_id, mk_id = agg_key
                yield dict(
                    {
                        'undisciplined_votes': 0,
                        'disciplined_votes': 0,
                        'total_votes': 0,
                    },
                    **agg,
                    undisciplined_votes_percent=undisciplined_votes_percent,
                    disciplined_votes_percent=disciplined_votes_percent,
                    knesset=knesset,
                    plenum=plenum,
                    assembly=assembly,
                    pagra=int(pagra),
                    faction_id=faction_id,
                    mk_id=mk_id)

    def get_aggregates(package: PackageWrapper):
        schema_fields = [
            {'name': 'knesset', 'type': 'integer'},
            {'name': 'plenum', 'type': 'integer'},
            {'name': 'assembly', 'type': 'integer'},
            {'name': 'pagra', 'type': 'integer'},
            {'name': 'faction_id', 'type': 'integer'},
            {'name': 'mk_id', 'type': 'integer'},
            {'name': 'undisciplined_votes', 'type': 'integer'},
            {'name': 'disciplined_votes', 'type': 'integer'},
            {'name': 'total_votes', 'type': 'integer'},
            {'name': 'undisciplined_votes_percent', 'type': 'integer'},
            {'name': 'disciplined_votes_percent', 'type': 'integer'},
        ]
        package.pkg.add_resource({
            'name': 'mk_party_discipline_stats',
            'path': 'mk_party_discipline_stats.csv',
            'schema': {'fields': schema_fields}
        })
        yield package.pkg
        yield from package
        yield get_mk_aggregates()

    return Flow(
        load(data_path + 'members/mk_individual/datapackage.json',
             resources=['mk_individual_names']),
        process_rows_remove_resource(
            'mk_individual_names',
            mk_individual_names_processor(all_mk_ids)),
        load(data_path + 'members/mk_individual/datapackage.json',
             resources=['mk_individual_factions']),
        process_rows_remove_resource(
            'mk_individual_factions',
            mk_individual_factions_processor(mk_individual_factions)),
        load(data_path + 'knesset/kns_knessetdates/datapackage.json',
             resources=['kns_knessetdates']),
        process_rows_remove_resource(
            'kns_knessetdates',
            kns_knessetdates_processor(kns_knessetdates_sorted)),
        load(data_path + 'people/mk_voted_against_majority/datapackage.json',
             resources=['mk_voted_against_majority']),
        process_rows_remove_resource('mk_voted_against_majority',
                                     process_voted_against_majority),
        load(data_path + 'votes/view_vote_rslts_hdr_approved/datapackage.json',
             resources=['view_vote_rslts_hdr_approved']),
        process_rows_remove_resource('view_vote_rslts_hdr_approved',
                                     process_votes),
        get_aggregates,
        dump_to_path('data/people/mk_party_discipline_stats'),
    )
from dataflows import Flow, add_computed_field, printer


def flow(parameters, *args):
    return Flow(
        add_computed_field(
            target=dict(name='geotype',
                        type='string',
                        constraints=dict(enum=["state", "nation"])),
            operation=lambda row: (row['geo'] == 'Germany') and 'nation' or 'state',
            resources=parameters["resources"]))


# Entrypoint for running the flow directly, without Datapackage Pipelines
if __name__ == '__main__':
    # Add a printer step and run the flow.
    # resources=None applies the step to all resources; when run under Datapackage
    # Pipelines the value comes from the pipeline spec, so this is only a placeholder.
    Flow(flow({'resources': None}), printer(num_rows=1, tablefmt='html')).process()
import os

from dataflows import Flow
from dataflows_xlsx import dump_to_path


def get_data():
    for i in range(10):
        yield {'i': i, 'foo': 'bar{}'.format(i)}


Flow([{'i': i, 'foo': 'bar{}'.format(i)} for i in range(10)],
     dump_to_path('tests/data/test_dump_to_xlsx', format='xlsx')).process()

assert os.path.isfile('tests/data/test_dump_to_xlsx/res_1.xlsx')
assert os.path.isfile('tests/data/test_dump_to_xlsx/datapackage.json')
assert os.path.getsize('tests/data/test_dump_to_xlsx/datapackage.json') > 200
assert os.path.getsize('tests/data/test_dump_to_xlsx/res_1.xlsx') > 2000
print('OK')
OIL_PRICES = Flow(
    add_metadata(
        name="oil-prices",
        title="Brent and WTI Spot Prices",
        description=(
            "A variety of temporal granularities for Europe Brent and WTI "
            "(West Texas Intermediate) Spot Prices."),
        sources=[
            {"name": "Daily Europe Brent Spot Price",
             "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEd.xls",
             "title": "Daily Europe Brent Spot Price"},
            {"name": "Weekly Europe Brent Spot Price",
             "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEw.xls",
             "title": "Weekly Europe Brent Spot Price"},
            {"name": "Monthly Europe Brent Spot Price",
             "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEm.xls",
             "title": "Monthly Europe Brent Spot Price"},
            {"name": "Annual Europe Brent Spot Price",
             "path": "https://www.eia.gov/dnav/pet/hist_xls/RBRTEa.xls",
             "title": "Annual Europe Brent Spot Price"},
            {"name": "Daily Cushing, OK WTI Spot Price",
             "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCd.xls",
             "title": "Daily Cushing, OK WTI Spot Price"},
            {"name": "Weekly Cushing, OK WTI Spot Price",
             "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCw.xls",
             "title": "Weekly Cushing, OK WTI Spot Price"},
            {"name": "Monthly Cushing, OK WTI Spot Price",
             "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCm.xls",
             "title": "Monthly Cushing, OK WTI Spot Price"},
            {"name": "Annual Cushing, OK WTI Spot Price",
             "path": "http://www.eia.gov/dnav/pet/hist_xls/RWTCa.xls",
             "title": "Annual Cushing, OK WTI Spot Price"},
        ],
        licenses=[{
            "name": "ODC-PDDL-1.0",
            "path": "http://opendatacommons.org/licenses/pddl/",
            "title": "Open Data Commons Public Domain Dedication and License v1.0",
        }],
        keywords=["Oil", "Brent", "WTI", "Oil Prices", "eia", "oil eia"],
        views=[{
            "name": "graph",
            "title": "Europe Brent Spot Price FOB (Dollars per Barrel)",
            "resourceName": "brent-day",
            "specType": "simple",
            "spec": {"type": "line", "group": "Date", "series": ["Brent Spot Price"]},
        }],
    ),
    load(load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEd.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEw.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEm.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="https://www.eia.gov/dnav/pet/hist_xls/RBRTEa.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCd.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCw.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCm.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    load(load_source="http://www.eia.gov/dnav/pet/hist_xls/RWTCa.xls",
         format="xls", sheet=2, skip_rows=[1, 2, 3], headers=["Date", "Price"]),
    rename_resources,
    set_type("Date", resources=None, type="date", format="any"),
    validate(),
    printer(),
    filter_out_empty_rows,
    dump_to_path(),
)
def postflow(self):
    return Flow(self.work())
Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_covid19_confirmed_global',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_covid19_deaths_global',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Confirmed={'name': 'Case', 'aggregate': 'first'})),
    join(source_name='time_series_19-covid-Recovered',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_covid19_deaths_global',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Recovered={'name': 'Case', 'aggregate': 'first'})),
    add_computed_field(target={'name': 'Deaths', 'type': 'number'},
                       operation='format',
                       with_='{Case}'),
    delete_fields(['Case']),
    update_resource('time_series_covid19_deaths_global',
                    name='time-series-19-covid-combined',
                    path='data/time-series-19-covid-combined.csv'),
    update_schema('time-series-19-covid-combined',
                  missingValues=['None', ''],
                  fields=[
                      {"format": "%Y-%m-%d", "name": "Date", "type": "date"},
                      {"format": "default", "name": "Country/Region", "type": "string"},
                      {"format": "default", "name": "Province/State", "type": "string"},
                      {"decimalChar": ".", "format": "default", "groupChar": "", "name": "Lat", "type": "number"},
                      {"decimalChar": ".", "format": "default", "groupChar": "", "name": "Long", "type": "number"},
                      {"format": "default", "groupChar": "", "name": "Confirmed",
                       "title": "Cumulative total confirmed cases to date", "type": "integer"},
                      {"format": "default", "groupChar": "", "name": "Recovered",
                       "title": "Cumulative total recovered cases to date", "type": "integer"},
                      {"format": "default", "groupChar": "", "name": "Deaths",
                       "title": "Cumulative total deaths to date", "type": "integer"},
                  ]),
    checkpoint('processed_data'),

    # Duplicate the stream to create aggregated data
    duplicate(source='time-series-19-covid-combined',
              target_name='worldwide-aggregated',
              target_path='data/worldwide-aggregated.csv'),
    join_with_self(resource_name='worldwide-aggregated',
                   join_key=['Date'],
                   fields=dict(Date={'name': 'Date'},
                               Confirmed={'name': 'Confirmed', 'aggregate': 'sum'},
                               Recovered={'name': 'Recovered', 'aggregate': 'sum'},
                               Deaths={'name': 'Deaths', 'aggregate': 'sum'})),
    update_schema('worldwide-aggregated',
                  missingValues=['None', ''],
                  fields=[
                      {"format": "%Y-%m-%d", "name": "Date", "type": "date"},
                      {"format": "default", "groupChar": "", "name": "Confirmed",
                       "title": "Cumulative total confirmed cases to date", "type": "integer"},
                      {"format": "default", "groupChar": "", "name": "Recovered",
                       "title": "Cumulative total recovered cases to date", "type": "integer"},
                      {"format": "default", "groupChar": "", "name": "Deaths",
                       "title": "Cumulative total deaths to date", "type": "integer"},
                  ]),
    checkpoint('processed_worldwide_data'),

    # Add daily increase rate field in the worldwide data
    calculate_increase_rate,

    # Create another resource with key countries pivoted
    duplicate(source='time-series-19-covid-combined',
              target_name='key-countries-pivoted',
              target_path='data/key-countries-pivoted.csv'),
    join_with_self(resource_name='key-countries-pivoted',
                   join_key=['Date', 'Country/Region'],
                   fields=dict(Date={'name': 'Date'},
                               Country={'name': 'Country/Region'},
                               Confirmed={'name': 'Confirmed', 'aggregate': 'sum'},
                               Recovered={'name': 'Recovered', 'aggregate': 'sum'},
                               Deaths={'name': 'Deaths', 'aggregate': 'sum'})),
    update_schema('key-countries-pivoted',
                  missingValues=['None', ''],
                  fields=[
                      {"format": "%Y-%m-%d", "name": "Date", "type": "date"},
                      {"format": "default", "name": "Country", "type": "string"},
                      {"format": "default", "groupChar": "", "name": "Confirmed",
                       "title": "Cumulative total confirmed cases to date", "type": "integer"},
                      {"format": "default", "groupChar": "", "name": "Recovered",
                       "title": "Cumulative total recovered cases to date", "type": "integer"},
                      {"format": "default", "groupChar": "", "name": "Deaths",
                       "title": "Cumulative total deaths to date", "type": "integer"},
                  ]),
    checkpoint('processed_country_data'),

    # All countries aggregated
    duplicate(source='key-countries-pivoted',
              target_name='countries-aggregated',
              target_path='data/countries-aggregated.csv'),
    pivot_key_countries,
    delete_fields(['Country', 'Confirmed', 'Recovered', 'Deaths'],
                  resources='key-countries-pivoted'),

    # Prepare data package (name, title) and add views
    update_package(
        name='covid-19',
        title='Novel Coronavirus 2019',
        views=[{
            "title": "Total world to date",
            "resources": ["worldwide-aggregated"],
            "specType": "simple",
            "spec": {"group": "Date",
                     "series": ["Confirmed", "Recovered", "Deaths"],
                     "type": "line"}
        }, {
            "title": "Number of confirmed cases in key countries",
            "resources": ["key-countries-pivoted"],
            "specType": "simple",
            "spec": {"group": "Date",
                     "series": ["China", "US", "United_Kingdom", "Italy", "France",
                                "Germany", "Spain", "Iran"],
                     "type": "line"}
        }, {
            "title": "Mortality rate in percentage",
            "resources": [{
                "name": "worldwide-aggregated",
                "transform": [{
                    "type": "formula",
                    "expressions": ["data['Deaths'] / data['Confirmed'] * 100 + '%'"],
                    "asFields": ["Mortality rate"]
                }]
            }],
            "specType": "simple",
            "spec": {"group": "Date", "series": ["Mortality rate"], "type": "bar"}
        }, {
            "title": "Increase rate from previous day in confirmed cases worldwide",
            "resources": ["worldwide-aggregated"],
            "specType": "simple",
            "spec": {"group": "Date", "series": ["Increase rate"], "type": "bar"}
        }]),
    dump_to_path()
).results()[0]
def flow(parameters):
    return Flow(
        add_computed_field(parameters.get('fields', []),
                           resources=parameters.get('resources')),
    )
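# Usage sketch (not from the original source): the inline rows and the computed 'total'
# field are hypothetical; each 'fields' entry follows the dataflows add_computed_field
# convention of target/operation/source.
if __name__ == '__main__':
    from dataflows import Flow, printer
    Flow(
        [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
        flow({'fields': [{'target': 'total', 'operation': 'sum', 'source': ['a', 'b']}]}),
        printer(),
    ).process()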
Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_19-covid-Confirmed',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Confirmed={'name': 'Case', 'aggregate': 'first'})),
    join(source_name='time_series_19-covid-Recovered',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Recovered={'name': 'Case', 'aggregate': 'first'})),
    add_computed_field(target={'name': 'Deaths', 'type': 'number'},
                       operation='format',
                       with_='{Case}'),
    delete_fields(['Case']),
    update_resource('time_series_19-covid-Deaths',
                    name='time-series-19-covid-combined',
                    path='data/time-series-19-covid-combined.csv'),
    update_package(name='covid-19', title='Novel Coronavirus 2019'),
    dump_to_path(),
    checkpoint('processed_data'),

    # Duplicate the stream to create aggregated data
    duplicate(source='time-series-19-covid-combined',
              target_name='worldwide-aggregated',
              target_path='worldwide-aggregated.csv'),
    join_with_self(resource_name='worldwide-aggregated',
                   join_key=['Date'],
                   fields=dict(Date={'name': 'Date'},
                               Confirmed={'name': 'Confirmed', 'aggregate': 'sum'},
                               Recovered={'name': 'Recovered', 'aggregate': 'sum'},
                               Deaths={'name': 'Deaths', 'aggregate': 'sum'})),
    dump_to_path()
).results()[0]
def flow(parameters):
    return Flow(update_package(**parameters))
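# Usage sketch (not from the original source): the parameters dict is passed straight
# through to update_package, so any datapackage properties work; these values are hypothetical.
if __name__ == '__main__':
    from dataflows import Flow, printer
    Flow(
        [{'a': 1}],
        flow({'name': 'example-package', 'title': 'Example Package'}),
        printer(),
    ).process()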
    doc = row['document']
    for (k, v) in RENAME_FIELDS.items():
        doc[v] = doc.get(k, [])
    yield row


def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields,
        add_fields(FIELDS, 'string'),
        add_fields(ADDITIONAL_FIELDS, 'string'),
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files',
            'parser_version', 'source', 's3_object_name'
        ]),
    )


if __name__ == '__main__':
    csv.field_size_limit(512 * 1024)
    Flow(
        load('/var/datapackages/maya/maya_complete_notification_list/datapackage.json'),
        flow(),
        printer(),
    ).process()
def postflow(self):
    return Flow(self.address_fixer())
def flow(parameters):
    return Flow(
        update_resource(parameters['source'], name=parameters['target']))
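# Usage sketch (not from the original source): 'res_1' is the default name dataflows gives
# an inline data resource; the target name is hypothetical.
if __name__ == '__main__':
    from dataflows import Flow, printer
    Flow(
        [{'a': 1}],
        flow({'source': 'res_1', 'target': 'renamed-resource'}),
        printer(),
    ).process()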
    'קולות קוראים': 'קול קורא',
    'תמיכות': 'מבחן תמיכה',
}


def process_kind(row):
    row['tender_type'] = KIND_MAPPING.get(row['tender_type_he'], row['tender_type_he'])
    row['tender_type_he'] = KIND_HE_MAPPING.get(row['tender_type_he'], row['tender_type_he'])


def flow(*_):
    return Flow(
        fetch_results(),
        set_type('start_date', type='date', format='%d.%m.%Y'),
        set_type('tender_id', type='string'),
        set_type('tender_type', type='string'),
        process_kind,
        calculate_publication_id(2),
        set_primary_key(['publication_id']),
        update_resource(-1, name='jobiz', **{PROP_STREAMING: True}),
    )


if __name__ == '__main__':
    Flow(
        flow(),
        printer(),
    ).process()
def batch_flow(parameters):
    return Flow(*[flow(p) for p in parameters['batch']])
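# Usage sketch (not from the original source): 'batch' holds one parameters dict per
# sub-flow; the keys shown assume flow(parameters) is the update_resource wrapper shown
# earlier, and the resource/target names are hypothetical.
if __name__ == '__main__':
    from dataflows import Flow, printer
    Flow(
        [{'a': 1}],
        [{'b': 2}],
        batch_flow({'batch': [
            {'source': 'res_1', 'target': 'first-renamed'},
            {'source': 'res_2', 'target': 'second-renamed'},
        ]}),
        printer(),
    ).process()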
from pprint import pprint

from dataflows import Flow, load
from processors.pivot import pivot

# Run flow
flow = Flow(
    load('layouts/long.csv'),
    pivot(join_field='name', key_field='treatment', value_field='result'),
)
results, package, stats = flow.results()

print('[Data]\n')
pprint(results[0])
print('\n[Meta]\n')
pprint(package.descriptor)