def test_add_computed_field_func():
    from dataflows import Flow, add_computed_field
    data = [dict(x=i) for i in range(3)]
    f = Flow(
        data,
        add_computed_field([
            dict(target=dict(name='sq', type='integer'),
                 operation=lambda row: row['x'] ** 2),
            dict(target='f', operation='format', with_='{x} - {x}')
        ])
    )
    results, *_ = f.results()
    results = list(results[0])
    assert results == [
        dict(x=0, sq=0, f='0 - 0'),
        dict(x=1, sq=1, f='1 - 1'),
        dict(x=2, sq=4, f='2 - 2'),
    ]
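# A minimal sketch of the three operation styles that recur throughout these
# examples: a 'format' string, a 'constant' value, and a plain callable. The
# field names (label, marker, double) are illustrative only.
from dataflows import Flow, add_computed_field

rows, *_ = Flow(
    [dict(x=1), dict(x=2)],
    add_computed_field([
        dict(target='label', operation='format', with_='x is {x}'),
        dict(target='marker', operation='constant', with_='fixed'),
        dict(target=dict(name='double', type='integer'),
             operation=lambda row: row['x'] * 2),
    ]),
).results()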
    unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}],
    extra_keys=[{'name': 'year', 'type': 'year'}],
    extra_value={'name': 'population', 'type': 'number'},
    resources=resource_names[1:]
),
add_computed_field([
    {
        "operation": "format",
        "target": "Region",
        "with": "{Region, subregion, country or area *}"
    },
    {
        "operation": "format",
        "target": "Country Code",
        "with": "{Country code}"
    },
    {
        "operation": "format",
        "target": "Year",
        "with": "{year}"
    },
    {
        "operation": "format",
        "target": "Population",
        "with": "{population}"
    }
]),
delete_fields(fields=[
    'Type', 'Parent code', 'Region, subregion, country or area *',
    'Country code', 'year', 'population'
], regex=False),
validate(),
dump_to_path()
from datetime import datetime


def flow(parameters, *args):
    # The original lambda referenced an undefined name `date`; assuming the
    # intent was to stamp each row with the current date.
    return Flow(
        add_computed_field(
            target=dict(name='date', type='date'),
            operation=lambda row: datetime.now().strftime('%Y-%m-%d'),
            resources=parameters["resources"]))
set_type('Date', type='date', format='%d-%m-%y', resources=None),
set_type('Case', type='number', resources=None),
join(source_name='time_series_19-covid-Confirmed',
     source_key=['Province/State', 'Country/Region', 'Date'],
     source_delete=True,
     target_name='time_series_19-covid-Deaths',
     target_key=['Province/State', 'Country/Region', 'Date'],
     fields=dict(Confirmed={
         'name': 'Case',
         'aggregate': 'first'
     })),
join(source_name='time_series_19-covid-Recovered',
     source_key=['Province/State', 'Country/Region', 'Date'],
     source_delete=True,
     target_name='time_series_19-covid-Deaths',
     target_key=['Province/State', 'Country/Region', 'Date'],
     fields=dict(Recovered={
         'name': 'Case',
         'aggregate': 'first'
     })),
add_computed_field(target={
    'name': 'Deaths',
    'type': 'number'
}, operation='format', with_='{Case}'),
delete_fields(['Case']),
update_resource('time_series_19-covid-Deaths',
                name='time-series-19-covid-combined',
                path='time-series-19-covid-combined.csv'),
dump_to_path()).results()[0]
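# The add_computed_field / delete_fields pair above is effectively a column
# rename: 'format' copies Case into Deaths, then Case is dropped. A minimal
# standalone sketch of the same idiom (data and field names illustrative):
from dataflows import Flow, add_computed_field, delete_fields

renamed, *_ = Flow(
    [dict(Case=7)],
    add_computed_field(target={'name': 'Deaths', 'type': 'number'},
                       operation='format', with_='{Case}'),
    delete_fields(['Case']),
).results()
# Note: 'format' produces a string value; whether it is re-cast to the
# declared 'number' type depends on downstream set_type/validate steps.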
     fields=dict(Confirmed={
         'name': 'Case',
         'aggregate': 'first'
     })),
join(source_name='time_series_19-covid-Recovered',
     source_key=['Province/State', 'Date'],
     source_delete=True,
     target_name='time_series_19-covid-Deaths',
     target_key=['Province/State', 'Date'],
     fields=dict(Recovered={
         'name': 'Case',
         'aggregate': 'first'
     })),
add_computed_field(target={
    'name': 'Deaths',
    'type': 'number'
}, operation='format', with_='{Case}'),
add_computed_field(target={
    'name': 'Country',
    'type': 'string'
}, operation='format', with_='{Country/Region}'),
add_computed_field(target={
    'name': 'Province',
    'type': 'string'
}, operation='format', with_='{Province/State}'),
delete_fields(['Case', 'Country/Region', 'Province/State']),
def add_fields(names, type):
    return add_computed_field([
        dict(target=name, type=type, operation=(lambda row: None))
        for name in names
    ])
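# Hypothetical usage of the helper above: declare several placeholder columns
# of one type in a single step, to be filled in by later processors.
Flow(
    [dict(a=1), dict(a=2)],
    add_fields(['b', 'c'], 'string'),
).results()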
target_name="time_series_covid19_deaths_global", target_key=["Province/State", "Country/Region", "Date"], fields=dict(Recovered={ "name": "Case", "aggregate": "first" }), mode="full-outer", ), # Add missing columns, e.g., after 'full-outer' join, the rows structure # is inconsistent fix_canada_recovered_data, add_computed_field( target={ "name": "Deaths", "type": "number" }, operation="format", with_="{Case}", resources=["time_series_covid19_deaths_global"], ), delete_fields(["Case"], resources=["time_series_covid19_deaths_global"]), update_resource( "time_series_covid19_deaths_global", name="time-series-19-covid-combined", path="data/time-series-19-covid-combined.csv", ), update_resource( "time_series_covid19_confirmed_US", name="us_confirmed", path="data/us_confirmed.csv", ),
def main_flow(prefix=''):
    source_url = '{}data/publications_for_es/datapackage.json'.format(prefix)
    package = Package(source_url)
    all_fields = set(
        field.name
        for resource in package.resources
        for field in resource.schema.fields
    )
    all_fields = dict((field_name, []) for field_name in all_fields)
    return Flow(
        load(source_url),
        lambda row: dict(row, json='{}'),
        concatenate(all_fields,
                    target=dict(name='publications', path='publications.csv')),
        delete_fields(['json']),
        prefer_gd('title'),
        prefer_gd('notes'),
        prefer_gd('publisher'),
        prefer_gd('tags'),
        prefer_gd('language_code'),
        prefer_gd('pubyear'),
        split_keyword_list('item_kind', 'gd_Item Type'),
        split_keyword_list('life_areas', 'gd_Life Domains'),
        split_keyword_list('source_kind', 'gd_Resource Type'),
        split_keyword_list('languages', 'language_code', ' '),
        split_keyword_list('tags', 'tags'),
        load('data/zotero/datapackage.json'),
        concatenate(dict(
            title=[],
            pubyear=[],
            publisher=[],
            authors=[],
            life_areas=[],
            notes=[],
            languages=[],
            tags=[],
            url=[],
            migdar_id=[],
            item_kind=[],
            source_kind=[],
            isbn=[],
            physical_description=[],
            publication_distribution_details=[],
            doc_id=[],
        ), target=dict(name='publications', path='publications.csv')),
        set_type('title', **{'es:title': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:keyword': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True),
        split_and_translate('life_areas', 'life_areas', keyword=True),
        split_and_translate('languages', 'languages', keyword=True),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        printer(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id',
             'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title',
             'with': PAGE_TITLE_PATTERN},
        ]),
        add_computed_field([]),
    )
"path": "http://opendatacommons.org/licenses/pddl/", "title": "Open Data Commons Public Domain Dedication and License v1.0", 'name': "open_data_commons_public_domain_dedication_and_license_v1.0" }], sources=[{ "name": "Our Airports", "path": "http://ourairports.com/data/", "title": "Our Airports" }], readme=readme()), add_computed_field([{ "operation": "format", "target": "coordinates", "with": "{latitude_deg}, {longitude_deg}" }]), delete_fields(fields=[ "id", "longitude_deg", "latitude_deg", "scheduled_service", "home_link", "wikipedia_link", "keywords" ]), update_resource('airport-codes', **{'path': 'data/airport-codes.csv'}), validate(), dump_to_path()) def flow(parameters, datapackage, resources, stats): return dialing_info_cldr if __name__ == '__main__': dialing_info_cldr.process()
def flow(parameters):
    return Flow(
        add_computed_field(
            parameters.get('fields', []),
            parameters.get('resources')
        ),
    )
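# A hypothetical `parameters` payload for the wrapper above, using the same
# field-spec keys ('operation', 'target', 'with') as the other examples here;
# resources=None applies the step to every resource:
parameters = {
    'fields': [
        {'operation': 'format', 'target': 'full_name',
         'with': '{first_name} {last_name}'},
    ],
    'resources': None,
}
flow(parameters)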
def base_flow():
    sources, *_ = Flow(
        list_gdrive(),
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title'],
                'bib_title': [],
                'bib_related_parts': [],
                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],
            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        set_type('title', **{'es:title': True}),
        set_type('authors', **{'es:boost': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:boost': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'),
        verify_migdar_id(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title', 'with': PAGE_TITLE_PATTERN},
        ]),
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )
def flow(self):
    taxonomy = self.context.taxonomy
    txn_config = taxonomy.config
    fmt_str = [taxonomy.title + ' for:']
    fields = txn_config['key-fields']
    for f in fields:
        for ct in taxonomy.column_types:
            if ct['name'] == f:
                fmt_str.append(
                    '%s: "{%s}",' % (ct['title'], f.replace(':', '-'))
                )
                break
    fmt_str = ' '.join(fmt_str)
    fields = [
        ct.replace(':', '-')
        for ct in fields
    ]
    all_fields = ['_source'] + fields
    TARGET = 'configurations'
    saved_config = self.config._unflatten()
    saved_config.setdefault('publish', {})['allowed'] = False
    return Flow(
        duplicate(RESOURCE_NAME, TARGET),
        join_with_self(
            TARGET,
            all_fields,
            dict((f, {}) for f in all_fields),
        ),
        add_computed_field(
            [
                dict(
                    operation='format',
                    target='snippets',
                    with_=fmt_str
                ),
                dict(
                    operation='constant',
                    target='key_values',
                    with_=None
                ),
            ],
            resources=TARGET
        ),
        add_field('config', 'object', saved_config, resources=TARGET),
        add_field('fields', type='object',
                  default=self.collate_values(fields), resources=TARGET),
        join_with_self(
            TARGET,
            ['_source'],
            dict(
                source=dict(name='_source'),
                config={},
                key_values=dict(aggregate='array'),
                snippets=dict(aggregate='array'),
            )
        ),
        set_type('source', type='string'),
        set_type('config', type='object'),
        set_type('key_values', type='array'),
        set_type('snippets', type='array'),
        set_primary_key(['source']),
        dump_to_sql(
            dict([
                (TARGET, {
                    'resource-name': TARGET,
                    'mode': 'update'
                })
            ]),
            engine=self.lazy_engine(),
        ),
    )
def data_pull_csv():
    unpivoting_fields = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {
            "Date": r"\1"
        }
    }]
    extra_keys = [{"name": "Date", "type": "string"}]
    extra_value = {"name": "Case", "type": "number"}
    Flow(
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(unpivoting_fields, extra_keys, extra_value),
        find_replace([{
            "name": "Date",
            "patterns": [{
                "find": "/",
                "replace": "-"
            }]
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Confirmed={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Recovered={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        add_computed_field(
            target={
                "name": "Deaths",
                "type": "number"
            },
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ).results()[0]
def Olap_Datapackage():
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        }, target={
            'name': 'fact',
            'path': 'data/fact.csv'
        }),
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount',
            'Price_DKK', 'Price_EUR'
        ], resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d')),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m')),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B')),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y')),
        ], resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area', source_key=['Area'],
                  target_name='area', fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact', target_name='product', target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product', source_key=['Product'],
                  target_name='product', fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
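# The four date-part lambdas above repeat the same strptime call. A
# hypothetical helper that would produce equivalent field specs:
def date_part(name, type_, fmt, ts_format='%Y-%m-%dT%H:%M:%S+00:00'):
    return dict(
        target=dict(name=name, type=type_),
        operation=lambda row: datetime.strptime(
            row['Timestamp'], ts_format).strftime(fmt))

# e.g. add_computed_field([date_part('day', 'string', '%d'),
#                          date_part('year', 'year', '%Y')], resources=['time'])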
     source_key=['Province/State', 'Country/Region', 'Date'],
     source_delete=True,
     target_name='time_series_covid19_deaths_global',
     target_key=['Province/State', 'Country/Region', 'Date'],
     fields=dict(Recovered={
         'name': 'Case',
         'aggregate': 'first'
     }),
     mode='full-outer'),
# Add missing columns, e.g., after 'full-outer' join, the rows structure
# is inconsistent
fix_canada_recovered_data,
add_computed_field(target={
    'name': 'Deaths',
    'type': 'number'
}, operation='format', with_='{Case}',
    resources=['time_series_covid19_deaths_global']),
delete_fields(['Case'], resources=['time_series_covid19_deaths_global']),
delete_fields(
    ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'],
    resources=[
        'time_series_covid19_confirmed_US',
        'time_series_covid19_deaths_US'
    ]),
update_resource('time_series_covid19_deaths_global',
                name='time-series-19-covid-combined',
                path='data/time-series-19-covid-combined.csv'),
update_resource('time_series_covid19_confirmed_US',
                name='us_confirmed',
                path='data/us_confirmed.csv'),
DF.delete_fields(['alt_name[1-5]']),
*[
    split_and_translate(
        f, f, delimiter=',',
        keyword=f in ('org_kind', 'life_areas', 'languages',
                      'tags', 'compact_services')
    )
    for f in ('languages', 'life_areas', 'tags', 'regions', 'org_kind',
              'specialties', 'provided_services', 'target_audiences',
              'compact_services')
],
DF.add_field('title_kw', 'string',
             default=lambda row: row.get('org_name'),
             **{'es:keyword': True}),
DF.add_computed_field(
    target='doc_id',
    operation='format',
    with_='org/{entity_id}'
),
fix_doc_id,
fix_links('objective'),
fix_links('objective__en'),
fix_links('objective__ar'),
DF.add_field('year', 'integer', default=cur_year),
DF.set_type('org_name', **{'es:title': True}),
DF.set_type('org_name__ar', **{'es:title': True}),
DF.set_type('alt_names', **{'es:itemType': 'string', 'es:title': True}),
*[
    DF.set_type(f, **{'es:index': False})
    for f in [
        'org_website', 'org_facebook', 'org_phone_number',
set_defaults,
extrapulate_years,
fix_values,
DF.set_type('value', groupChar=',', bareNumber=True),
fix_units,
DF.set_type('extrapulation_years', type='array', **{'es:itemType': 'string'}),
DF.validate(),
DF.add_computed_field([
    dict(target=dict(name='life_areas', type='array', **{
             'es:itemType': 'string',
             'es:keyword': True
         }),
         operation=lambda row: [
             x for x in
             [row.get('life_area{}'.format(i)) for i in range(1, 4)]
             if x is not None
         ])
]),
DF.delete_fields(['life_area{}'.format(i) for i in range(1, 4)]),
DF.join_self(
    'out', ['chart_title', 'series_title'], 'out',
    dict([(k, None) for k in CHART_FIELDS + SERIES_FIELDS] +
         [(k, dict(aggregate='array')) for k in ['year', 'value']] +
         [('max_year', dict(name='year', aggregate='max'))])),
verify_percents,
DF.add_computed_field([
    dict(target=dict(name='dataset', type='array'),