def test_change_acl_on_s3_no_path_provided(s3_client, bucket):
    # Prepare paths (note: each path is a separate list item)
    paths = [
        'my/private/datasets/file_1.csv',
        'my/private/datasets/file_2.csv',
    ]

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
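# The S3 tests in this file rely on `s3_client` and `bucket` pytest fixtures that
# are not shown here. Below is a minimal, hypothetical sketch of what they might
# look like, assuming an S3-compatible server (e.g. minio or a moto server) is
# reachable at S3_ENDPOINT_URL and credentials come from the usual AWS
# environment variables. The fixture names match the test signatures; everything
# else is an assumption, not the original implementation.
import os
import uuid

import boto3
import pytest


@pytest.fixture
def s3_client():
    # Plain boto3 client pointed at the local S3-compatible endpoint
    return boto3.client('s3', endpoint_url=os.environ['S3_ENDPOINT_URL'])


@pytest.fixture
def bucket(s3_client):
    # Fresh bucket per test; cleanup omitted for brevity
    name = 'dataflows-test-{}'.format(uuid.uuid4().hex)
    s3_client.create_bucket(Bucket=name)
    return name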
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            description=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
def test_dump_to_s3_non_existent_bucket(s3_client, bucket):
    # Delete bucket
    s3_client.delete_bucket(Bucket=bucket)

    # Dump to S3 using the processor
    flow = Flow(
        load('data/data.csv'),
        dump_to_s3(
            bucket=bucket,
            acl='private',
            path='my/datapackage',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Check datapackage.json content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/datapackage.json')
    descriptor = json.loads(response['Body'].read().decode('utf-8'))
    assert descriptor['resources'][0]['schema']['fields'][0]['name'] == 'id'
    assert descriptor['resources'][0]['schema']['fields'][1]['name'] == 'name'

    # Check data.csv content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/data.csv')
    contents = response['Body'].read().decode('utf-8')
    assert contents == 'id,name\r\n1,english\r\n2,中国人\r\n'
def test_change_acl_on_s3_handles_more_than_1000_files(s3_client, bucket):
    # Prepare paths
    paths = []
    for index in range(1, 1101):
        path = 'my/private/datasets/file_%s.csv' % index
        paths.append(path)

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
def test_change_acl_on_s3(s3_client, bucket):
    # Prepare paths
    paths = [
        'my/private/datasets/README.md',
        'my/private/datasets/datapackage.json',
        'my/private/datasets/data/mydata.csv',
        'my/public/datasets/data/mydata.csv',
    ]

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Assert all contents are public by default
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 200

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert only public contents are public
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == (200 if 'public' in path else 403)
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
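# `not_empty_groupcol` is used as a step above but is not defined in this
# snippet. A hypothetical sketch of a rows processor with the assumed intent of
# dropping rows whose group column is empty; the column name 'Alle Bedürfnisse'
# is taken from the view spec and is an assumption, as is the idea that the real
# step is restricted to the 'analysis' resource.
def not_empty_groupcol(rows):
    for row in rows:
        # Keep only rows where the group column has a value
        if row.get('Alle Bedürfnisse'):
            yield row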
def decp_processing():
    flow = Flow(
        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare the creation of donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without contract-holder data
        print("Creating the contracts-only table..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id",
            "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Downloading the previous tabular data..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # # print("Merging the previous tabular data with today's data..."),
        # concatenate({}, target={"name": "decp-titulaires", "path": "decp-titulaires.csv"}, resources=["decp", "previous-decp"]),

        # Load the previous contract-holder data
        print("Loading the contract-holder data..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),

        print("Saving the data to disk..."),
        dump_to_path("decp"))
    flow.process()
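# `donnees_actuelles` is referenced above but not defined in this snippet. A
# hypothetical sketch, assuming the intent is to flag the most recent version of
# each contract: after the descending {rootId}:{seq} sort, the first row seen for
# a given rootId is the current one, so it gets donneesActuelles=True. This is an
# illustrative guess at the logic, not the original implementation.
def donnees_actuelles(rows):
    seen = set()
    for row in rows:
        # First occurrence of a rootId is the latest version after the sort
        row['donneesActuelles'] = row['rootId'] not in seen
        seen.add(row['rootId'])
        yield row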
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {'product': 'aFRR_DownActivated'}
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {'product': 'aFRR_UpActivated'}
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 1,000 rows:
        load('https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
             format="json",
             property="result.records",
             name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant', with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
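# `add_price` is used above (and again in Elspot_Prices_Data below) but is not
# defined in this snippet. A hypothetical sketch of a row processor that fills
# the dummy PriceDKK/PriceEUR columns from the direction-specific price fields
# before they are deleted; the real helper presumably also covers the Elspot
# case (SpotPriceDKK/SpotPriceEUR). Assumed logic, not the original code.
def add_price(row):
    if row.get('product') == 'aFRR_DownActivated':
        row['PriceDKK'] = row.get('aFRR_DownPriceDKK')
        row['PriceEUR'] = row.get('aFRR_DownPriceEUR')
    elif row.get('product') == 'aFRR_UpActivated':
        row['PriceDKK'] = row.get('aFRR_UpPriceDKK')
        row['PriceEUR'] = row.get('aFRR_UpPriceEUR')
    return row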
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError, exceptions

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)),
             set_type('a', type='integer'),
             adder,
             validate(),
             printer())
    with pytest.raises(exceptions.ProcessorError) as excinfo:
        f.process()
    assert isinstance(excinfo.value.cause, ValidationError)
def test_change_acl_on_s3_handles_non_existing_keys(s3_client, bucket):
    # Set private ACL using the processor and assert it does not fail:
    # with no matching keys, the step is a no-op
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/non-existing/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series',
                     title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
def test_dump_to_sql():
    from dataflows import Flow, printer, dump_to_sql
    from sqlalchemy import create_engine

    f = Flow(
        data,
        printer(),
        dump_to_sql(dict(output_table={'resource-name': 'res_1'}),
                    engine='sqlite:///out/test.db'))
    f.process()

    # Check validity
    engine = create_engine('sqlite:///out/test.db')
    result = list(dict(x) for x in engine.execute('select * from output_table'))
    assert result == data
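# The test above relies on a module-level `data` fixture that is not shown here.
# A minimal sketch of the shape it needs - a list of plain dicts that survives a
# round trip through SQLite unchanged. The values below are hypothetical sample
# rows, not the original fixture.
data = [
    dict(x=1, y='a'),
    dict(x=2, y='b'),
    dict(x=3, y='c'),
]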
def test_example_5():
    from dataflows import Flow, set_type, dump_to_path

    f = Flow(country_population(),
             set_type('population', type='number', groupChar=','),
             dump_to_path('out/country_population'))
    _ = f.process()
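# `country_population()` is assumed to be defined elsewhere (in the dataflows
# tutorial it scrapes a Wikipedia table). A minimal stand-in that yields the
# shape the flow above expects - population as a comma-grouped string, which is
# why set_type(..., groupChar=',') is needed. The sample values are hypothetical.
def country_population():
    yield {'name': 'China', 'population': '1,394,640,000'}
    yield {'name': 'India', 'population': '1,338,480,000'}
    yield {'name': 'United States', 'population': '328,239,523'}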
def test_example_75():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column_to_schema(package):
        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(dict(
            name='is_guitarist',
            type='boolean'
        ))
        # Must yield the modified datapackage
        yield package.pkg
        yield from package

    def add_is_guitarist_column(row):
        row['is_guitarist'] = row['instrument'] == 'guitar'
        return row

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column_to_schema,
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists2')
    )
    _ = f.process()
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):
        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)

        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))

        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):
        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
def main():
    config = Config(sys.argv[1] if len(sys.argv) > 1 else 'dgp.yaml')
    taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
    context = Context(config, taxonomy_registry)
    dgp = SimpleDGP(config, context)
    ret = dgp.analyze()
    if not ret:
        print('Errors:', '\n\t - '.join([str(x) for x in dgp.errors]))
        sys.exit(1)
    flow = dgp.flow()
    flow = Flow(flow, dump_to_path('output'))
    flow.process()
    print('----')
    print('Success:', ret)
def WERKVERZEICHNIS_csv():
    flow = Flow(
        # Load inputs
        # load('input/WERKVERZEICHNIS_ohne W.xlsx', format='xlsx'),
        load(
            'input/WERKVERZEICHNIS.csv',
            format='csv',
        ),
        # Process them (if necessary)
        # ...
        # Save the results
        add_metadata(name='Werkverzeichnis-JStraumann',
                     title='''Werkverzeichnis Jürg Straumann'''),
        # printer(),
        dump_to_path('data'),
    )
    flow.process()
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s',
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()

        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)
        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]

        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s',
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )

        logging.info('Running Flow')
        _, stats = flow.process()
        logging.info('Success')
        return stats
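# `set_dots` is referenced in `operator` above but not defined in this snippet.
# A minimal sketch of a helper with the assumed behaviour - writing a value into
# a nested dict using a dotted key such as 'extra.metadata.revision'. This is a
# hypothetical implementation, not the original.
def set_dots(target, dotted_key, value):
    *parents, leaf = dotted_key.split('.')
    for part in parents:
        # Create intermediate dicts as needed
        target = target.setdefault(part, {})
    target[leaf] = value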
def Elspot_Prices_Data():
    # field_metadata = get_metadata('c86859d2-942e-4029-aec1-32d56f1a2e5d')
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 100 rows:
        load('https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20elspotprices%20order%20by%20"HourUTC"%20desc%20limit%20100',
             format="json",
             property="result.records",
             name="fact_elspot_prices"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('load_data'),
        # Add product:
        add_computed_field([
            dict(target=dict(name='product', type='string'),
                 operation='constant', with_='Elspot'),
            dict(target=dict(name='amount', type='number'),
                 operation='constant', with_=1),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_=-1),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_=-1)
        ]),
        add_price,
        delete_fields(fields=['SpotPriceDKK', 'SpotPriceEUR']),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_elspot_prices',
            title='Elspot Prices Data',
            source='https://www.energidataservice.dk/dataset/elspotprices/resource_extract/c86859d2-942e-4029-aec1-32d56f1a2e5d'
        ),
        printer(),
        dump_to_path('elspot_prices_data'),
        # dump_to_sql(tables={'elspot': {'resource-name': 'Elspot_Prices_Data', 'mode': 'append'}},
        #             engine='postgresql://*****:*****@localhost/cubes')
    )
    flow.process()
def test_dump_to_postgis():
    from dataflows import Flow, load
    from lib import dump_to_postgis
    from sqlalchemy import create_engine
    import os

    table_name = 'nycha_policeservice'
    url = 'https://data.cityofnewyork.us/api/views/bvi6-r9nk/rows.csv?accessType=DOWNLOAD'

    f = Flow(
        load(url, name=table_name, format='csv', force_strings=True),
        dump_to_postgis(engine='env://DATAFLOWS_DB_ENGINE')
    )
    f.process()

    engine = create_engine(os.environ.get('DATAFLOWS_DB_ENGINE'))
    result = list(dict(x) for x in engine.execute(f'select wkb_geometry from {table_name} limit 1'))
    print(result)
    assert result == [{'wkb_geometry': '0101000020E61000001FCF149C617F52C0D7CE5B5AD9494440'}]
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)),
             set_type('a', type='integer'),
             adder,
             validate(),
             printer())
    try:
        _ = f.process()
        assert False
    except ValidationError:
        pass
def test_example_6():
    from dataflows import Flow, set_type, dump_to_path

    def all_triplets():
        for a in range(1, 21):
            for b in range(a, 21):
                for c in range(b + 1, 21):
                    yield dict(a=a, b=b, c=c)

    def filter_pythagorean_triplets(rows):
        for row in rows:
            if row['a']**2 + row['b']**2 == row['c']**2:
                yield row

    f = Flow(all_triplets(),
             set_type('a', type='integer'),
             set_type('b', type='integer'),
             set_type('c', type='integer'),
             filter_pythagorean_triplets,
             dump_to_path('out/pythagorean_triplets'))
    _ = f.process()
def test_example_9():
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(dict(emmy_nominee=['nominee']),
                    dict(name='emmies_filtered'),
                    resources='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        join(
            'emmies_filtered', ['emmy_nominee'],  # Source resource
            'oscars', ['Name'],                   # Target resource
            full=False                            # Don't add new fields, remove unmatched rows
        ),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'))
    _ = f.process()
    set_type(
        'Rate',
        resources='quarterly',
        type='number',
        description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'
    ),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type(
        'Rate',
        resources='annual',
        type='number',
        description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'
    ),
    update_resource('quarterly', **{
        'path': 'data/quarterly.csv',
        'dpp:streaming': True
    }),
    update_resource('annual', **{
        'path': 'data/annual.csv',
        'dpp:streaming': True
    }),
    validate(),
    dump_to_path())


def flow(parameters, datapackage, resources, stats):
    return bond_uk


if __name__ == '__main__':
    bond_uk.process()
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Date', 'Rate'],
        format='csv',
        name='quarterly'
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Year', 'Rate'],
        format='csv',
        name='annual'
    ),
    set_type('Date', resources='quarterly', type='date', format='any'),
    set_type('Rate', resources='quarterly', type='number',
             description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number',
             description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path': 'data/quarterly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path': 'data/annual.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_uk


if __name__ == '__main__':
    bond_uk.process()
        }
    ],
    version="0.2.0",
    views=[
        {
            "name": "graph",
            "title": "VIX - CBOE Volatility Index",
            "specType": "simple",
            "spec": {"type": "line", "group": "Date", "series": ["VIX Close"]}
        }
    ],
    readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path': 'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)


def flow(parameters, datapackage, resources, stats):
    return finance_vix


if __name__ == '__main__':
    finance_vix.process()
            ' ', '_', 'Country', '2017', '2018', '2019', '-', 'Q417', '1Q18',
            '2Q18', '3Q18', '4Q18', '1Q19', '2Q19', '3Q19', '4Q19'
        ]),
    load(
        load_source='https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=5,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2016', '2017', '2018', '3Q19', '4Q19', '2019',
            'Change 19/18', '1Q20', '2Q20', '3Q20', '4Q20', 'Change 20/19'
        ]),
    load(
        load_source='https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=6,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2017', '2018', '2019', 'Change 19/18', '2Q19',
            '3Q19', '4Q19', '1Q20', 'Feb20', 'Mar20', 'Change Mar/Feb'
        ]),
    rename_resources,
    validate(),
    printer(),
    dump_to_path('opec'),
)
oil_prices.process()
        {
            "name": "graph",
            "title": "10 year US Government Bond Yields (Monthly granularity)",
            "specType": "simple",
            "spec": {"type": "line", "group": "Date", "series": ["Rate"]}
        }
    ],
    readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i + 1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path': 'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us


if __name__ == '__main__':
    bond_us.process()


            "with": "{Region, subregion, country or area *}"
        },
        {
            "operation": "format",
            "target": "Country Code",
            "with": "{Country code}"
        },
        {
            "operation": "format",
            "target": "Year",
            "with": "{year}"
        },
        {
            "operation": "format",
            "target": "Population",
            "with": "{population}"
        }
    ]),
    delete_fields(fields=[
        'Type', 'Parent code', 'Region, subregion, country or area *',
        'Country code', 'year', 'population'
    ], regex=False),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return population_estimates


if __name__ == '__main__':
    population_estimates.process()
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path': 'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path': 'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)
)


def flow(parameters, datapackage, resources, stats):
    return gold_price_flow


if __name__ == '__main__':
    gold_price_flow.process()