def join_unique_records(*args):
    """Merge the google-drive search-import fields into the unique-records
    dataset, keyed on ``migdar_id``, and dump the joined result.

    Extra positional args indicate the function is running under dpp
    (more than 3 args), which switches the printer to plain output.
    """
    running_under_dpp = len(args) > 3
    # Every imported field lands on the target prefixed with 'gd_'.
    gdrive_fields = {
        f'gd_{field}': {'name': field}
        for field in SEARCH_IMPORT_FIELD_NAMES
    }
    steps = [
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id', type='string',
                 resources=['unique_records', 'search_import']),
        # Inner join (full=False): unmatched unique_records rows are dropped.
        join(source_name='search_import', source_key=['migdar_id'],
             target_name='unique_records', target_key=['migdar_id'],
             fields=gdrive_fields, full=False),
        printer(tablefmt='plain' if running_under_dpp else 'html',
                num_rows=1, fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}),
    ]
    return Flow(*steps)
def judges_flow(out_path):
    """Build the judges dataset: tribunals joined with the judges list,
    enriched with per-judge details, and dumped to *out_path*.
    """
    # Tribunal attributes copied onto each judge row (some renamed).
    tribunal_fields = {
        'Tribunal_Type_Code': {},
        'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
        'Tribunal_District_Code': {'name': 'District_Code'},
        'Tribunal_Name': {'name': 'Name'},
    }
    steps = (
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'],
             'judges_list', ['Tribunal_Code'],
             fields=tribunal_fields),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1),
    )
    return Flow(*steps)
def flow(parameters):
    """Generic join step driven by a pipeline *parameters* dict.

    Joins ``parameters['source']`` into ``parameters['target']`` on the
    configured keys and marks the target resource as streaming.
    """
    source = parameters['source']
    target = parameters['target']
    join_step = join(
        source['name'], source['key'],
        target['name'], target['key'],
        parameters['fields'],
        parameters.get('full', True),       # default: full (outer-ish) join
        source.get('delete', False),        # default: keep the source resource
    )
    return Flow(
        load_lazy_json(source['name']),
        join_step,
        update_resource(target['name'], **{PROP_STREAMING: True}),
    )
def test_fullouter_join_dump_different_keys():
    """Full-outer join on differently named key columns, followed by a dump."""
    from dataflows import Flow, join, dump_to_path
    from decimal import Decimal

    left_rows = [
        {"col1": 1.531, "col2": "hello"},
        {"col1": 1.132, "col2": "goodbye"},
    ]
    right_rows = [
        {"colA": 1.531, "colB": "123"},
        {"colA": 1.132, "colB": 1.132},
    ]
    flow = Flow(
        left_rows,
        right_rows,
        join(
            "res_1", ["col1"],
            "res_2", ["colA"],
            {"col2": {"name": "col2", "aggregate": "first"}},
            mode="full-outer",
        ),
        dump_to_path(out_path='out/test_join_dump'),
    )
    joined = flow.results()[0][0]
    expected = [
        {'colA': Decimal('1.531'), 'col2': 'hello', 'colB': '123'},
        {'colA': Decimal('1.132'), 'col2': 'goodbye', 'colB': 1.132},
    ]
    assert joined == expected
def conditional(self):
    """Return a Flow that joins the reference fields into RESOURCE_NAME and
    applies the configured column types to the joined fields.
    """
    # Column-type ids double as field names, with ':' made filesystem-safe.
    target_field_names = [
        ct.replace(':', '-') for ct in self.TARGET_FIELD_COLUMNTYPES
    ]
    fetch_map = {
        target_name: dict(name=ref_field)
        for target_name, ref_field in zip(target_field_names,
                                          self.REF_FETCH_FIELDS)
    }
    steps = [
        join(self.key, self.REF_KEY_FIELDS,
             RESOURCE_NAME, self.SOURCE_KEY_FIELDS,
             fetch_map),
    ]
    for target_name, columntype in zip(target_field_names,
                                       self.TARGET_FIELD_COLUMNTYPES):
        steps.append(set_type(target_name, resources=RESOURCE_NAME,
                              columnType=columntype))
    return Flow(*steps)
def test_join_full_outer():
    """Full-outer join keeps rows unmatched on either side, filling the
    missing fields with None.
    """
    # BUG FIX: `Flow` was used below but missing from the local import
    # (`from dataflows import load, set_type, join`), which raises NameError
    # unless Flow happens to be imported at module level. The unused
    # `set_type` import was dropped at the same time.
    from dataflows import Flow, load, join
    flow = Flow(
        load('data/population.csv'),
        load('data/cities.csv'),
        join(
            source_name='population',
            source_key=['id'],
            target_name='cities',
            target_key=['id'],
            fields={'population': {
                'name': 'population'
            }},
            mode='full-outer',
        ),
    )
    data = flow.results()[0]
    assert data == [[
        {
            'id': 1,
            'city': 'london',
            'population': 8
        },
        {
            'id': 2,
            'city': 'paris',
            'population': 2
        },
        {
            'id': 3,
            'city': 'rome',
            'population': None
        },
        {
            'id': 4,
            'city': None,
            'population': 3
        },
    ]]
def test_example_9():
    """People who won both an Emmy and an Academy Award."""
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows

    pipeline = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(
            dict(emmy_nominee=['nominee'], ),
            dict(name='emmies_filtered'),
            resources='emmies',
        ),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        # Keep only oscar rows whose Name appears among emmy winners
        join(
            'emmies_filtered', ['emmy_nominee'],    # Source resource
            'oscars', ['Name'],                     # Target resource
            full=False                              # Don't add new fields, remove unmatched rows
        ),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'),
    )
    _ = pipeline.process()
def join_search_app_records(*args):
    """Stream search-app records from the internal search service, merge in
    the google-drive search-import fields (keyed on ``migdar_id``) and dump
    the joined result.

    Extra positional args indicate the function runs under dpp (more than
    3 args) — presumably dpp's runner signature; TODO confirm.
    """
    is_dpp = len(args) > 3

    def load_search_app_data():
        # The index CSV lists one row per search; column 4 holds the search id.
        with Stream(
                'https://migdar-internal-search.odata.org.il/__data/search_app/index.csv',
                http_session=get_migdar_session()) as index_stream:
            for i, row in enumerate(index_stream.iter()):
                search_id = row[4]
                print(f"#{i}. {search_id} ({row[0]}/{row[1]})")
                url = f'https://migdar-internal-search.odata.org.il/__data/search_app/{search_id}/records.csv'
                # Stream that search's records, tagging each row with a
                # synthetic unique id '<search_id>-<row number>'.
                with Stream(url, headers=1,
                            http_session=get_migdar_session()) as data_stream:
                    for rownum, row in enumerate(data_stream.iter(keyed=True)):
                        row['migdar_id'] = f'{search_id}-{rownum}'
                        yield row

    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load_search_app_data(),
        # The generator is the 2nd resource added, hence 'res_2'.
        update_resource('res_2', name='search_app_records',
                        path='search_app_records.csv'),
        # Inner join (full=False): imported fields land prefixed with 'gd_'.
        join(source_name='search_import', source_key=['migdar_id'],
             target_name='search_app_records', target_key=['migdar_id'],
             fields={
                 f'gd_{field}': {
                     'name': field
                 }
                 for field in SEARCH_IMPORT_FIELD_NAMES
             },
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/app_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
def test_join():
    """Exercise join() aggregations (max/avg/last/count/counters) and
    join_with_self() with a '*' wildcard aggregation.
    """
    from dataflows import Flow, join, join_with_self, set_type, sort_rows
    from decimal import Decimal
    characters = [
        {'first_name': 'Jaime', 'house': 'Lannister', 'last_name': 'Lannister', 'age': 34},
        {'first_name': 'Tyrion', 'house': 'Lannister', 'last_name': 'Lannister', 'age': 27},
        {'first_name': 'Cersei', 'house': 'Lannister', 'last_name': 'Lannister', 'age': 34},
        {'first_name': 'Jon', 'house': 'Stark', 'last_name': 'Snow', 'age': 17},
        {'first_name': 'Sansa', 'house': 'Stark', 'last_name': 'Stark', 'age': 14},
        {'first_name': 'Rickon', 'house': 'Stark', 'last_name': 'Stark', 'age': 5},
        {'first_name': 'Arya', 'house': 'Stark', 'last_name': 'Stark', 'age': 11},
        {'first_name': 'Bran', 'house': 'Stark', 'last_name': 'Stark', 'age': 10},
        {'first_name': 'Daenerys', 'house': 'Targaryen', 'last_name': 'Targaryen', 'age': 16},
    ]
    houses = [
        {'house': 'House of Lannister'},
        {'house': 'House of Greyjoy'},
        {'house': 'House of Stark'},
        {'house': 'House of Targaryen'},
        {'house': 'House of Martell'},
        {'house': 'House of Tyrell'},
    ]
    # Join characters (res_1) into houses (res_2) using key templates:
    # 'House of {house}' on the source matches '{house}' on the target.
    # full=False drops houses with no characters; source_delete removes res_1.
    res, _, _ = Flow(
        characters,
        set_type('age', type='number'),
        houses,
        join('res_1', 'House of {house}', 'res_2', '{house}',
             dict(max_age={
                 'name': 'age',
                 'aggregate': 'max'
             },
                 avg_age={
                     'name': 'age',
                     'aggregate': 'avg'
                 },
                 representative={
                     'name': 'first_name',
                     'aggregate': 'last'
                 },
                 representative_age={'name': 'age'},
                 number_of_characters={'aggregate': 'count'},
                 last_names={
                     'name': 'last_name',
                     'aggregate': 'counters'
                 }),
             full=False,
             source_delete=True)).results()
    assert res[0] == [
        {
            'avg_age': Decimal('31.66666666666666666666666667'),
            'house': 'House of Lannister',
            'max_age': Decimal(34),
            'number_of_characters': 3,
            'representative': 'Cersei',
            'representative_age': Decimal(34),
            'last_names': [('Lannister', 3)]
        },
        {
            'avg_age': Decimal('11.4'),
            'house': 'House of Stark',
            'max_age': Decimal(17),
            'number_of_characters': 5,
            'representative': 'Bran',
            'representative_age': Decimal(10),
            'last_names': [('Stark', 4), ('Snow', 1)]
        },
        {
            'avg_age': Decimal(16),
            'house': 'House of Targaryen',
            'max_age': Decimal(16),
            'number_of_characters': 1,
            'representative': 'Daenerys',
            'representative_age': Decimal(16),
            'last_names': [('Targaryen', 1)]
        },
    ]
    # Find youngest of each house: sort ascending by age, then '*' with
    # aggregate 'first' keeps the first (youngest) row per house key.
    res, _, _ = Flow(
        characters,
        set_type('age', type='number'),
        sort_rows('{age:02}'),
        join_with_self('res_1', '{house}', {
            'the_house': {
                'name': 'house'
            },
            '*': {
                'aggregate': 'first'
            },
        }),
        sort_rows('{the_house}')).results()
    assert res[0] == [{
        'the_house': 'Lannister',
        'first_name': 'Tyrion',
        'last_name': 'Lannister',
        'age': Decimal('27')
    }, {
        'the_house': 'Stark',
        'first_name': 'Rickon',
        'last_name': 'Stark',
        'age': Decimal('5')
    }, {
        'the_house': 'Targaryen',
        'first_name': 'Daenerys',
        'last_name': 'Targaryen',
        'age': Decimal('16')
    }]
load(f'{BASE_URL}{DEATH}'), unpivot(unpivoting_fields, extra_keys, extra_value), find_replace([{ 'name': 'Date', 'patterns': [{ 'find': '/', 'replace': '-' }] }]), to_normal_date, set_type('Date', type='date', format='%d-%m-%y', resources=None), set_type('Case', type='number', resources=None), join(source_name='time_series_19-covid-Confirmed', source_key=['Province/State', 'Country/Region', 'Date'], source_delete=True, target_name='time_series_19-covid-Deaths', target_key=['Province/State', 'Country/Region', 'Date'], fields=dict(Confirmed={ 'name': 'Case', 'aggregate': 'first' })), join(source_name='time_series_19-covid-Recovered', source_key=['Province/State', 'Country/Region', 'Date'], source_delete=True, target_name='time_series_19-covid-Deaths', target_key=['Province/State', 'Country/Region', 'Date'], fields=dict(Recovered={ 'name': 'Case', 'aggregate': 'first' })), add_computed_field(target={ 'name': 'Deaths',
find_replace([{ "name": "Date", "patterns": [{ "find": "/", "replace": "-" }] }]), to_normal_date, set_type("Date", type="date", format="%d-%m-%y", resources=None), set_type("Case", type="number", resources=None), join( source_name="time_series_covid19_confirmed_global", source_key=["Province/State", "Country/Region", "Date"], source_delete=True, target_name="time_series_covid19_deaths_global", target_key=["Province/State", "Country/Region", "Date"], fields=dict(Confirmed={ "name": "Case", "aggregate": "first" }), ), join( source_name="time_series_covid19_recovered_global", source_key=["Province/State", "Country/Region", "Date"], source_delete=True, target_name="time_series_covid19_deaths_global", target_key=["Province/State", "Country/Region", "Date"], fields=dict(Recovered={ "name": "Case", "aggregate": "first" }),
def process_stack_demand(stack):
    """Populate *stack* with demand-survey score cards for its category.

    Builds (once, then caches under 'stack:demand') a list of per-category
    stacks, each holding per-subcategory cards with neighborhood demand
    scores, then copies the matching stack's cards onto *stack*.
    """

    def collect_cats():
        # The label column of the demand spreadsheet ("entire sample").
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        # 'total for <category>' header row — remember the
                        # category, don't emit the row itself.
                        cat = v[7:]
                    elif v.startswith('--- '):
                        # '--- <subcategory>' row; skip the 'without detail'
                        # catch-all rows.
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    # NOTE(review): mangled indentation in the original —
                    # this `else` is assumed to pair with `if F in row`
                    # (rows lacking the label column pass through); confirm
                    # against the original source.
                    yield row

        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        # Missing demand values ('--' → None after unpivot) count as 0.
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS,
                    headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            # Wide neighborhood columns ('<name> (X)') → long rows with
            # 'neighborhood' and 'demand_pct'.
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood',
                    type='string'
                )],
                extra_value=dict(
                    name='demand_pct',
                    type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            # Per (category, subcategory): the maximum demand across
            # neighborhoods, used below to normalize scores.
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None,
                subcategory=None,
                max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string',
                         lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            # Score on a 0..6 scale relative to the subcategory's maximum.
            DF.add_field('score', 'number',
                         lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            # Collapse to one row per subcategory carrying all its scores...
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None,
                subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            # ...then one row per category carrying all its cards.
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string',
                         lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)
    # Pick the cached stack matching this stack's name and copy its cards.
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
def process_demographics(stack):
    """Populate *stack* with demographic score cards per neighborhood.

    Builds (once, then caches under 'stack:demographics') cards for the
    population groups in MAP2 (adults/kids/teenagers/elderly/immigrants),
    each scoring every neighborhood, then appends them to *stack*.
    """
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            # Tag every row with the name of the resource (CSV) it came from.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # (source report name, raw kind values) -> canonical kind.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)
                 ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')
                 ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)
                 ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')
                 ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59')
                 ): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                    yield row
            return f

        # Statistical-area id -> neighborhood title.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )
        # kind -> (card title, card content, display order).
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )
        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Wide per-kind columns -> long (kind, value) rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind',
                type='string'
            )], dict(
                name='value',
                type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[],
                value=[],
                kind=[],
                stat_id=[]
            )),
            DF.add_field('neighborhood', 'string',
                         lambda r: s2n.get(int(r['stat_id']))),
            # Drop stat areas with no known neighborhood.
            DF.filter_rows(lambda r: r['neighborhood']),
            # Sum values over stat areas per (neighborhood, kind).
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'],
                              dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, used to normalize geometry_score to 0..6.
            DF.join_with_self('maxes', ['kind'],
                              dict(kind=None,
                                   max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'],
                    dict(max_value=None)),
            DF.add_field('geometry_score', 'number',
                         lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(
                             r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # One row (card) per kind, carrying all neighborhood scores.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)
    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)
    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
neighborhood_fill_color_cases = ['case'] neighborhood_fill_pattern_cases = ['case'] r, _, _ = DF.Flow( DF.load(latest_file(), name='cities', override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.filter_rows(lambda r: r['is_city']), DF.load(latest_file(), name='out', override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.add_field('city_area_id', 'string', lambda r: r['area_id'].split('-')[0]), DF.join('cities', ['city_area_id'], 'out', ['city_area_id'], dict(num_city_reports=dict(name='num_reports_weighted'))), DF.add_field('desc', 'string', ''), DF.add_field('kind', 'string', ''), DF.add_field('property', 'string', ''), props(), DF.join_with_self( 'out', ['is_city', 'kind', 'desc', 'property'], dict(is_city=None, kind=None, desc=None, property=None, id=dict(aggregate='array'))), ).results() for item in r[0]: print('bucket for {} {} {}: {}'.format(
def data_pull_csv():
    """Pull the confirmed/recovered/deaths time-series CSVs, unpivot the
    per-date columns, merge the three series into one combined resource
    and dump it to disk.
    """
    # Every 'd/m/y' column becomes a (Date, Case) pair.
    date_columns = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {
            "Date": r"\1"
        }
    }]
    date_key = [{"name": "Date", "type": "string"}]
    case_value = {"name": "Case", "type": "number"}
    # The three series share this composite key.
    series_key = ["Province/State", "Country/Region", "Date"]

    Flow(
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(date_columns, date_key, case_value),
        find_replace([{
            "name": "Date",
            "patterns": [{
                "find": "/",
                "replace": "-"
            }]
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        # Fold confirmed counts into the deaths resource...
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=list(series_key),
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=list(series_key),
            fields=dict(Confirmed={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        # ...then recovered counts.
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=list(series_key),
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=list(series_key),
            fields=dict(Recovered={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        # The remaining 'Case' column holds the deaths count; rename it.
        add_computed_field(
            target={
                "name": "Deaths",
                "type": "number"
            },
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ).results()[0]
unpivot(unpivoting_fields, extra_keys, extra_value), find_replace([{ 'name': 'Date', 'patterns': [{ 'find': '/', 'replace': '-' }] }]), to_normal_date, set_type('Date', type='date', format='%d-%m-%y', resources=None), set_type('Case', type='number', resources=None), join(source_name='time_series_covid19_confirmed_global', source_key=['Province/State', 'Country/Region', 'Date'], source_delete=True, target_name='time_series_covid19_deaths_global', target_key=['Province/State', 'Country/Region', 'Date'], fields=dict(Confirmed={ 'name': 'Case', 'aggregate': 'first' })), join(source_name='time_series_covid19_recovered_global', source_key=['Province/State', 'Country/Region', 'Date'], source_delete=True, target_name='time_series_covid19_deaths_global', target_key=['Province/State', 'Country/Region', 'Date'], fields=dict(Recovered={ 'name': 'Case', 'aggregate': 'first' }), mode='full-outer'), # Add missing columns, e.g., after 'full-outer' join, the rows structure
import dataflows as DF
import glob

# List orgs that have no matching local PNG file: full-outer join the local
# filenames against the orgs table, then keep rows where no file matched.
x = DF.Flow(
    # One row per local PNG, named by the file's basename (extension stripped).
    ({'filename': x[:-4]} for x in glob.glob('*png')),
    DF.update_resource(-1, name='files'),
    DF.load('http://api.yodaat.org/data/orgs_in_es/data/orgs.csv', name='orgs'),
    DF.join(
        # BUG FIX: the source key was the literal string '(unknown)', which
        # can never match '{entity_id}'; the row template '{filename}' is the
        # field this resource actually carries.
        'files', '{filename}',
        'orgs', '{entity_id}',
        {
            'filename': {},
        },
        full=True,           # keep orgs with no matching file
        source_delete=True
    ),
    # Orgs with no file end up with filename None after the full join.
    DF.filter_rows(lambda row: row['filename'] is None),
    DF.select_fields(['org_name', 'entity_id']),
    DF.printer()
).process()