def test_filter_rows():
    from dataflows import Flow, filter_rows
    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 1, 'b': 4},
            {'a': 2, 'b': 4},
        ],
        filter_rows(equals=[dict(a=1)]),
        filter_rows(not_equals=[dict(b=3)]),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=1, b=4)
    assert len(results[0]) == 1
    assert len(results) == 1
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):
        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg
        # Now iterate on all resources
        resources = iter(package)
        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy))
        )
        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(
            lambda row: row['Winner'] and row['Name'] in emmy_winners,
            academy
        )

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners')
    )
    _ = f.process()
def test_example_5():
    from dataflows import Flow, set_type, dump_to_path
    f = Flow(
        country_population(),
        set_type('population', type='number', groupChar=','),
        dump_to_path('out/country_population')
    )
    _ = f.process()
def test_load_duplicate_headers_with_deduplicate_headers_flag():
    from dataflows import Flow, load
    flow = Flow(
        load('data/duplicate_headers.csv', deduplicate_headers=True),
    )
    data, package, stats = flow.results()
    assert package.descriptor['resources'][0]['schema']['fields'] == [
        {'name': 'header1', 'type': 'string', 'format': 'default'},
        {'name': 'header2 (1)', 'type': 'string', 'format': 'default'},
        {'name': 'header2 (2)', 'type': 'string', 'format': 'default'},
    ]
    assert data == [[
        {'header1': 'value1', 'header2 (1)': 'value2', 'header2 (2)': 'value3'},
    ]]
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):
        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg
        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists')
    )
    _ = f.process()
def test_filter_rows_callable():
    from dataflows import Flow, filter_rows
    f = Flow(
        [
            {'a': 1, 'b': 3},
            {'a': 2, 'b': 3},
            {'a': 1, 'b': 4},
            {'a': 2, 'b': 4},
        ],
        filter_rows(condition=lambda row: row['a'] > 1 and row['b'] < 4),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=2, b=3)
    assert len(results[0]) == 1
    assert len(results) == 1
def test_concatenate():
    from dataflows import Flow, concatenate
    f = Flow(
        [
            {'a': 1, 'b': 2},
            {'a': 2, 'b': 3},
            {'a': 3, 'b': 4},
        ],
        [
            {'c': 4, 'd': 5},
            {'c': 5, 'd': 6},
            {'c': 6, 'd': 7},
        ],
        concatenate({
            'f1': ['a'],
            'f2': ['b', 'c'],
            'f3': ['d']
        })
    )
    results, _, _ = f.results()
    assert results[0] == [
        {'f1': 1, 'f2': 2, 'f3': None},
        {'f1': 2, 'f2': 3, 'f3': None},
        {'f1': 3, 'f2': 4, 'f3': None},
        {'f1': None, 'f2': 4, 'f3': 5},
        {'f1': None, 'f2': 5, 'f3': 6},
        {'f1': None, 'f2': 6, 'f3': 7},
    ]
def test_example_75():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column_to_schema(package):
        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(dict(
            name='is_guitarist',
            type='boolean'
        ))
        # Must yield the modified datapackage
        yield package.pkg
        yield from package

    def add_is_guitarist_column(row):
        row['is_guitarist'] = row['instrument'] == 'guitar'
        return row

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column_to_schema,
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists2')
    )
    _ = f.process()
def test_example_3():
    from dataflows import Flow
    f = Flow(
        country_population(),
    )
    data, *_ = f.results()
def store_destination_output_package(destination_output, csv_temp_files):
    logging.info("Storing destination output package")
    os.makedirs(destination_output, exist_ok=True)
    logging.info("Writing to destination_output dir: " + destination_output)
    last_package = {}
    if os.path.exists(os.path.join(destination_output, "datapackage.json")):
        # Load the previously dumped package so unchanged files keep their mtime

        def _load_last_package(row):
            last_package[row['name']] = row
            yield row

        Flow(
            load(os.path.join(destination_output, "datapackage.json")),
            _load_last_package
        ).process()

    def _files_list():
        # Move each temp CSV into the destination dir and describe it as a row
        for temp_filepath, name in csv_temp_files.items():
            target_filepath = os.path.join(destination_output, name)
            shutil.move(temp_filepath, target_filepath)
            os.chmod(target_filepath, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
            size = os.path.getsize(target_filepath)
            hash = get_hash(target_filepath)
            last_row = last_package.get(name)
            if last_row and hash == last_row.get('hash') and size == last_row['size']:
                # File content is unchanged - keep the previous mtime
                mtime = last_row['mtime']
            else:
                mtime = datetime.datetime.fromtimestamp(os.path.getmtime(target_filepath))
            yield {"name": name, "size": size, "mtime": mtime, "hash": hash}

    Flow(
        _files_list(),
        update_resource(-1, name='files_list', path='files_list.csv'),
        dump_to_path(destination_output),
    ).process()
def test_duplicate():
    from dataflows import Flow, duplicate
    a = [
        {'a': 1, 'b': 3},
        {'a': 2, 'b': 3},
        {'a': 3, 'b': 1},
        {'a': 4, 'b': 1},
    ]
    f = Flow(
        a,
        duplicate(),
    )
    results, _, _ = f.results()
    assert list(results[0]) == a
    assert list(results[1]) == a
def flow(self):
    from dataflows import Flow
    if self.flows:
        return Flow(self.flows[1])
    elif self.analyzers:
        return super().flow()
    else:
        return Flow()
def test_example_4():
    from dataflows import Flow, set_type
    f = Flow(
        country_population(),
        set_type('population', type='number', groupChar=',')
    )
    data, dp, _ = f.results()
    print(data[0][:10])
def test_select_field():
    from dataflows import Flow, select_fields
    f = Flow(
        data,
        select_fields(['y'])
    )
    results, dp, _ = f.results()
    for i in results[0]:
        assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]
def test_add_metadata():
    from dataflows import Flow, add_metadata
    f = Flow(
        data,
        add_metadata(author='Adam Kariv')
    )
    _, dp, _ = f.results()
    assert dp.descriptor['author'] == 'Adam Kariv'
def test_example_2():
    from dataflows import Flow, load

    def titleName(row):
        row['name'] = row['name'].title()

    f = Flow(
        load('data/beatles.csv'),
        titleName
    )
    data, *_ = f.results()
def test_rename_resource2():
    from dataflows import Flow, printer, update_resource
    f = Flow(
        ({'a': x} for x in range(10)),
        update_resource(None, name='renamed'),
        printer()
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['name'] == 'renamed'
def test_load_from_package():
    from dataflows import Flow, dump_to_path, load

    Flow(
        [{'foo': 'bar'}],
        dump_to_path('data/load_from_package')
    ).process()

    ds = Flow(
        load('data/load_from_package/datapackage.json')
    ).datastream()

    assert len(ds.dp.resources) == 1
    assert [list(res) for res in ds.res_iter] == [[{'foo': 'bar'}]]
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s',
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()

        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)
        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]
        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s',
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )

        logging.info('Running Flow')
        _, stats = flow.process()
        logging.info('Success')
        return stats
def test_unpivot_any_resources():
    from dataflows import Flow, unpivot, validate
    data1 = [
        dict([('name', 'ike{}'.format(i))] +
             [(str(year), year + i) for year in range(1990, 2020, 10)])
        for i in range(5)
    ]
    data2 = [
        dict([('city', 'mike{}'.format(i))] +
             [(str(year), year + i) for year in range(2050, 2080, 10)])
        for i in range(5)
    ]
    f = Flow(
        data1,
        data2,
        unpivot(
            [dict(name='([0-9]+)', keys=dict(year='\\1'))],
            [dict(name='year', type='integer')],
            dict(name='amount', type='integer')
        ),
        validate()
    )
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['name', 'year', 'amount'], r))
        for r in [
            ['ike0', 1990, 1990], ['ike0', 2000, 2000], ['ike0', 2010, 2010],
            ['ike1', 1990, 1991], ['ike1', 2000, 2001], ['ike1', 2010, 2011],
            ['ike2', 1990, 1992], ['ike2', 2000, 2002], ['ike2', 2010, 2012],
            ['ike3', 1990, 1993], ['ike3', 2000, 2003], ['ike3', 2010, 2013],
            ['ike4', 1990, 1994], ['ike4', 2000, 2004], ['ike4', 2010, 2014],
        ]
    ]
    assert results[1] == [
        dict(zip(['city', 'year', 'amount'], r))
        for r in [
            ['mike0', 2050, 2050], ['mike0', 2060, 2060], ['mike0', 2070, 2070],
            ['mike1', 2050, 2051], ['mike1', 2060, 2061], ['mike1', 2070, 2071],
            ['mike2', 2050, 2052], ['mike2', 2060, 2062], ['mike2', 2070, 2072],
            ['mike3', 2050, 2053], ['mike3', 2060, 2063], ['mike3', 2070, 2073],
            ['mike4', 2050, 2054], ['mike4', 2060, 2064], ['mike4', 2070, 2074],
        ]
    ]
def test_load_override_schema_and_fields():
    from dataflows import Flow, load
    flow = Flow(
        load('data/beatles_age.csv',
             override_schema={
                 'title': 'title',
                 'missingValues': ['ringo'],
             },
             override_fields={
                 'age': {'type': 'string'},
             }),
    )
    data, package, stats = flow.results()
    assert package.descriptor == {
        'profile': 'data-package',
        'resources': [{
            'format': 'csv',
            'name': 'beatles_age',
            'path': 'beatles_age.csv',
            'profile': 'tabular-data-resource',
            'schema': {
                'fields': [
                    {'format': 'default', 'name': 'name', 'type': 'string'},
                    {'format': 'default', 'name': 'age', 'type': 'string'},
                ],
                'missingValues': ['ringo'],
                'title': 'title',
            }
        }]
    }
    assert data == [[
        {'name': 'john', 'age': '18'},
        {'name': 'paul', 'age': '16'},
        {'name': 'george', 'age': '17'},
        {'name': None, 'age': '22'},
    ]]
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]
        ),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
def decp_processing():
    flow = Flow(
        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare the creation of donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be dropped
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts, without data about the award holders
        print("Creating the table dedicated to contracts..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id",
            "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format to extract the new data
        # print("Downloading the previous tabular data..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # # print("Merging the previous tabular data with today's data..."),
        # concatenate({}, target={"name": "decp-titulaires", "path": "decp-titulaires.csv"}, resources=["decp", "previous-decp"]),

        # Load the previous data dedicated to the award holders
        print("Loading the titulaires data..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Saving the data to disk..."),
        dump_to_path("decp")
    )
    flow.process()
def test_example_1():
    from dataflows import Flow

    data = [{'data': 'Hello'}, {'data': 'World'}]

    def lowerData(row):
        row['data'] = row['data'].lower()

    f = Flow(
        data,
        lowerData
    )
    data, *_ = f.results()
def spew_flow(flow, ctx: ProcessorContext):
    # Wrap the given flow with the context's datapackage and resource iterator,
    # then expose the resulting datastream back on the context
    flow = Flow(
        update_package(**ctx.datapackage),
        load((ctx.datapackage, ctx.resource_iterator)),
        flow,
    )
    datastream = flow.datastream()
    ctx.datapackage = datastream.dp.descriptor
    ctx.resource_iterator = datastream.res_iter
    ctx.stats = MergeableStats(datastream.stats, ctx.stats)
def main(request_times_api_url):
    metadata = {}
    stats = collections.defaultdict(int)
    instance_stats = collections.defaultdict(int)
    # Aggregate the build request times per instance and dump them
    Flow(
        get_builds(request_times_api_url, stats),
        aggregate_instance_stats(instance_stats, metadata),
        dump_to_path('data/aggregate_request_times')
    ).process()
    # Dump the aggregated per-instance statistics
    Flow(
        get_instance_stats_data(instance_stats, metadata),
        dump_to_path('data/aggregate_request_times_stats'),
        printer(num_rows=1)
    ).process()
def test_expected_contact_with_patient():
    print("test_expected_contact_with_patient")
    back_from_abroad_db = [169603, 169632, 169813]
    contact_with_patient_db = [10722, 10715, 10697]
    Flow(
        load_from_db.flow({
            "where": "id in (%s)" % ", ".join(map(str, back_from_abroad_db + contact_with_patient_db))
        }),
        add_gps_coordinates.flow({
            "source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["source_fields"],
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34), random.uniform(34, 36), int(street != city))
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output"
        }),
    ).process()
    contact_with_patient_key = values_to_convert['insulation_status']['contact-with-patient']
    back_from_abroad_key = values_to_convert['insulation_status']['back-from-abroad']
    contact_with_patient_array = []
    back_from_abroad_array = []
    counts = {"contact_with_patient": 0, "back_from_abroad": 0}

    def _test(row):
        if int(row["isolation"]) == contact_with_patient_key:
            counts["contact_with_patient"] += 1
            contact_with_patient_array.append(int(row["id"]))
        if int(row["isolation"]) == back_from_abroad_key:
            assert int(row["id"]) in back_from_abroad_db
            counts["back_from_abroad"] += 1
            back_from_abroad_array.append(int(row["id"]))

    Flow(
        load('data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv'),
        load('data/corona_data_collector/destination_output/corona_bot_answers_22_3_2020_with_coords.csv'),
        _test,
    ).process()
    assert 3 == counts["contact_with_patient"], str(counts)
    assert 3 == counts["back_from_abroad"], str(counts)
    assert set(back_from_abroad_array) == set(back_from_abroad_db)
    assert set(contact_with_patient_array) == set(contact_with_patient_db)
    print("OK")
def test_load_from_env_var():
    import os
    from dataflows import Flow, load, dump_to_path

    Flow(
        [{'foo': 'bar'}],
        dump_to_path('out/load_from_env_var')
    ).process()

    os.environ['MY_DATAPACKAGE'] = 'out/load_from_env_var/datapackage.json'
    results, dp, _ = Flow(
        load('env://MY_DATAPACKAGE')
    ).results()
    assert len(dp.resources) == 1
    assert results == [[{'foo': 'bar'}]]
def test_sort_reverse_many_rows():
    from dataflows import Flow, sort_rows
    f = Flow(
        ({'a': i, 'b': i % 5} for i in range(1000)),
        sort_rows(key='{b}{a}', reverse=True, batch_size=0),
    )
    results, _, _ = f.results()
    results = results[0]
    assert results[0:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
    assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]
def test_update_schema():
    from dataflows import Flow, printer, update_schema, validate
    f = Flow(
        [['a', '-'], ['a', 0]],
        update_schema(-1, missingValues=['-']),
        validate(),
        printer()
    )
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0] == [
        dict(col0='a', col1=None),
        dict(col0='a', col1=0),
    ]
bond_uk = Flow(
    add_metadata(
        name="bond-yields-uk-10y",
        title="10y UK Government Bond Yields (long-term interest rate)",
        sources=[
            {
                "name": "Bank of England",
                "path": "http://www.bankofengland.co.uk/boeapps/iadb/index.asp?Travel=NIxIRx&levels=1&XNotes=Y&C=DUS&G0Xtop.x=51&G0Xtop.y=7&XNotes2=Y&Nodes=X41514X41515X41516X41517X55047X76909X4051X4052X4128X33880X4053X4058&SectionRequired=I&HideNums=-1&ExtraInfo=true#BM",
                "title": "Bank of England"
            }
        ],
        licenses=[
            {
                "id": "odc-pddl",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "name": "public_domain_dedication_and_license"
            }
        ],
        views=[
            {
                "name": "graph",
                "title": "Average yield from British Government Securities, 10 year Nominal Par Yield",
                "specType": "simple",
                "spec": {"type": "line", "group": "Date", "series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Date', 'Rate'],
        format='csv',
        name='quarterly'
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Year', 'Rate'],
        format='csv',
        name='annual'
    ),
    set_type('Date', resources='quarterly', type='date', format='any'),
    set_type('Rate', resources='quarterly', type='number',
             description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number',
             description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path': 'data/quarterly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path': 'data/annual.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)
finance_vix = Flow(
    add_metadata(
        name="finance-vix",
        title="VIX - CBOE Volatility Index",
        homepage='http://www.cboe.com/micro/VIX/',
        sources=[
            {
                "name": "CBOE VIX Page",
                "path": "http://www.cboe.com/micro/vix/historical.aspx",
                "title": "CBOE VIX Page"
            }
        ],
        licenses=[
            {
                "id": "odc-pddl",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0",
                "name": "open_data_commons_public_domain_dedication_and_license_v1.0"
            }
        ],
        version="0.2.0",
        views=[
            {
                "name": "graph",
                "title": "VIX - CBOE Volatility Index",
                "specType": "simple",
                "spec": {"type": "line", "group": "Date", "series": ["VIX Close"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path': 'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)
bond_us = Flow(
    add_metadata(
        name="bond-yields-us-10y",
        title="10 year US Government Bond Yields (long-term interest rate)",
        version="0.2.0",
        sources=[
            {
                "name": "Federal Reserve (Release H.15)",
                "path": "http://www.federalreserve.gov/releases/h15/data.htm",
                "title": "Federal Reserve (Release H.15)"
            }
        ],
        licenses=[
            {
                "id": "odc-pddl",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0",
                "name": "open_data_commons_public_domain_dedication_and_license_v1.0"
            }
        ],
        views=[
            {
                "name": "graph",
                "title": "10 year US Government Bond Yields (Monthly granularity)",
                "specType": "simple",
                "spec": {"type": "line", "group": "Date", "series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i + 1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path': 'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)
gold_price_flow = Flow(
    add_metadata(
        name="gold-prices",
        title="Gold Prices",
        homepage='http://www.bundesbank.de',
        licenses=[
            {
                "id": "odc-pddl",
                "name": "public_domain_dedication_and_license",
                "version": "1.0",
                "url": "http://opendatacommons.org/licenses/pddl/1.0/"
            }
        ],
        sources=[
            {
                "name": "bundesbank-gold-prices",
                "path": "http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its",
                "title": "Bundesbank gold prices"
            }
        ],
        views=[
            {
                "name": "graph",
                "title": "Gold Prices (Monthly in USD)",
                "specType": "simple",
                "spec": {
                    "type": "lines-and-points",
                    "group": "Date",
                    "series": ["Price"]
                }
            }
        ],
        related=[
            {
                "title": "Oil prices",
                "path": "/core/oil-prices",
                "publisher": "core",
                "formats": ["CSV", "JSON"]
            },
            {
                "title": "Natural gas",
                "path": "/core/natural-gas",
                "publisher": "core",
                "formats": ["CSV", "JSON"]
            }
        ],
        version="0.2.0"
    ),
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path': 'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path': 'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)
)