def test_filter_rows():
    """Chained filter_rows: keep rows with a == 1, then drop rows with b == 3."""
    from dataflows import filter_rows

    rows = [
        dict(a=1, b=3),
        dict(a=2, b=3),
        dict(a=1, b=4),
        dict(a=2, b=4),
    ]
    flow = Flow(
        rows,
        filter_rows(equals=[dict(a=1)]),
        filter_rows(not_equals=[dict(b=3)]),
    )
    results, _, _ = flow.results()

    # A single resource remains, holding only the one surviving row.
    assert results == [[dict(a=1, b=4)]]
def broken_links_flow():
    """Check every configured source for broken outgoing links.

    Loads each configured resource, extracts URLs from every row, checks the
    URLs in parallel, and keeps only rows where a check reported an error.
    """
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name', 'string', c['name'], resources=c['name']),
                DF.add_field('__title', 'string', get_title(c['title']),
                             resources=c['name']),
            )
            for c in configuration
        ],
        # NOTE(review): RE presumably matches URLs in the row's string
        # representation — confirm against the pattern's definition.
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        # Blank fields filled in per-URL by unwind()/check_broken() below.
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        # Keep only rows where the link check produced an error.
        DF.filter_rows(lambda r: r['error'] is not None),
    )
def flow(parameters):
    """Build a Flow that filters rows according to the step parameters.

    ``in`` lists equality conditions for rows to keep, ``out`` lists
    equality conditions for rows to drop; ``resources`` optionally limits
    which resources are filtered.
    """
    keep_conditions = parameters.get('in', [])
    drop_conditions = parameters.get('out', [])
    return Flow(
        filter_rows(
            equals=keep_conditions,
            not_equals=drop_conditions,
            resources=parameters.get('resources'),
        )
    )
def flow(*_):
    """Build the welfare-ministry social services datapackage.

    First pass: load the raw export, normalize fields and dump to
    ``tmp/activities-welfare``; second pass: stream that datapackage out.
    """
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        # Drop rows that have no service name.
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                       r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                    # NOTE(review): subsubunit also reads index [1] — looks
                    # like it should be [2]; confirm against the source data.
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                )
            ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def flow(*args):
    """Merge publication records, track revisions, and dump for ES indexing.

    When invoked by DPP (more than 3 positional args) the printer emits
    plain text instead of HTML.
    """
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        # Blank status fields, filled in by manage_revisions below.
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql(
            {
                DB_TABLE: {
                    'resource-name': resource_name,
                    'mode': 'update',
                    'update_keys': KEY_FIELDS
                }
            },
            DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        # NOTE(review): when FILTER_NEXT_UPDATE_DAYS is falsy, a None step is
        # passed into Flow — confirm dataflows tolerates None steps.
        filter_rows(equals=[{
            '__next_update_days': FILTER_NEXT_UPDATE_DAYS
        }]) if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
def test_filter_rows_callable():
    """filter_rows with a condition callable keeps only matching rows."""
    from dataflows import filter_rows

    rows = [
        dict(a=1, b=3),
        dict(a=2, b=3),
        dict(a=1, b=4),
        dict(a=2, b=4),
    ]
    flow = Flow(
        rows,
        filter_rows(condition=lambda row: row['a'] > 1 and row['b'] < 4),
    )
    results, _, _ = flow.results()

    # Only {'a': 2, 'b': 3} satisfies a > 1 and b < 4.
    assert results == [[dict(a=2, b=3)]]
def test_example_9():
    """Cross Emmy winners with Academy winners to find double winners."""
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows

    double_winners = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(
            dict(emmy_nominee=['nominee']),
            dict(name='emmies_filtered'),
            resources='emmies',
        ),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        # Keep only oscar rows whose Name appears among the Emmy winners;
        # full=False adds no new fields and removes unmatched rows.
        join('emmies_filtered', ['emmy_nominee'],
             'oscars', ['Name'],
             full=False),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'),
    )
    double_winners.process()
def flow(*_):
    """Build the social-service units resource and dump it to disk and SQL."""
    path_levels = ['office', 'unit', 'subunit', 'subsubunit']
    return DF.Flow(
        all_units(),
        # One keyword field per level of the unit path hierarchy
        # (i=i binds the loop index early, avoiding late-binding closures).
        *[
            DF.add_field(
                level, 'string',
                lambda r, i=i: r['path'][i] if len(r['path']) > i else None,
                **{'es:keyword': True})
            for i, level in enumerate(path_levels)
        ],
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string',
                     lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields(['path']),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit',
                     **{'es:keyword': True, 'es:exclude': True}),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ',
                     **{'es:keyword': True, 'es:exclude': True}),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),
        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})),
    )
def flow(*_):
    """Scrape BTL publications and normalize their date fields."""
    return DF.Flow(
        scrape(),
        DF.update_resource(-1, **{'dpp:streaming': True, 'name': 'btl'}),
        DF.set_type('claim_date', type='datetime',
                    format='%d/%m/%Y %H:%M', resources=-1),
        DF.set_type('start_date', type='date',
                    format='%d/%m/%Y', resources=-1),
        # Drop rows that carry no publication id.
        DF.filter_rows(lambda row: row['publication_id']),
        calculate_publication_id(7),
    )
def flow(*_):
    """Scrape negev_galil publications, keeping only call-for-proposal pages."""
    def is_call_for_proposals(row):
        title = row['page_title']
        return title and title.startswith('קול קורא')

    return DF.Flow(
        scraper(),
        DF.filter_rows(is_call_for_proposals, resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y',
                    resources=-1),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y',
                    resources=-1),
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(-1, name='negev_galil', **{PROP_STREAMING: True}),
    )
def flow(*_):
    """Normalize government tender publications into a single 'tenders' resource."""
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        # Drop rows without a publication id.
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        # Deduplicate by KEY, keeping the last value of every mapped field.
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id', type='string',
                    transform=lambda v: v or 'none'),
        # Unparseable dates are cleared rather than failing validation.
        DF.set_type('.+_date', type='date', format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects', type='string',
                    transform=lambda v: ';'.join(x.strip() for x in v.split(','))
                    if v else ''),
        # Merge the claim date and (optional) claim time into one datetime.
        DF.set_type('claim_date', type='datetime',
                    transform=lambda v, field_name, row: datetime.datetime.
                    combine(v, row['claim_time'] or datetime.time(0))
                    if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string',
            lambda r: f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
def flow(*_):
    """Build the social-services activities resource and dump it to disk/SQL."""
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        # Split delimited string fields into arrays.
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        # NOTE(review): floater's exact semantics come from its definition
        # elsewhere in the project — confirm before relying on them.
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score,
                     **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        # Dump everything (including deleted rows) to 'all_activities' ...
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        # ... then drop deleted rows for the public outputs.
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
def flow(*args):
    """Load Zotero items, normalize tags/authors, and dump as a datapackage."""
    return DF.Flow(
        get(),
        # Keep only items that have both a key and a title.
        DF.filter_rows(
            lambda row: bool(row['key']) and bool(row.get('title'))),
        simplify_tags,
        extract_tags('life_areas', ['Domain']),
        # 'Resouce' covers a known typo in the source data's tag names.
        extract_tags('source_kind', ['Source', 'Resource', 'Resouce']),
        # Join author-type creators into one comma-separated string;
        # creators may carry either a single 'name' or first/last names.
        DF.add_field(
            'authors', 'string',
            lambda r: None if not r.get('creators') else ', '.join(
                ('{name}'.format(**c) if 'name' in c
                 else '{firstName} {lastName}'.format(**c))
                for c in r.get('creators', [])
                if c.get('creatorType') == 'author')),
        DF.add_field('item_kind', 'string',
                     lambda r: r.get('reportType') or r.get('itemKind')),
        DF.concatenate(MAPPING, target={
            'name': 'zotero',
            'path': 'zotero.csv'
        }),
        DF.dump_to_path('data/zotero'),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer())
def flow(*_):
    """Scrape negev_galil publications and extract fields parsed per page."""
    return DF.Flow(
        scraper(),
        # Keep only call-for-proposal pages.
        DF.filter_rows(lambda row: row['page_title'] and
                       row['page_title'].startswith('קול קורא'),
                       resources=-1),
        page_parser(),
        # Fields extracted by page_parser() into row['parsed'].
        DF.add_field('decision', 'string',
                     default=lambda row: row['parsed']['decision'],
                     resources=-1),
        DF.add_field('start_date', 'date', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['start_date'],
                     resources=-1),
        DF.add_field('claim_date', 'datetime', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['claim_date'],
                     resources=-1),
        DF.add_field('documents', 'array',
                     default=lambda row: row['parsed']['documents'],
                     resources=-1),
        DF.delete_fields(['parsed'], resources=-1),
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(
            -1,
            name='negev_galil',
            **{
                PROP_STREAMING: True
            }
        ),
    )
def loader(name, cat):
    """Load institutions from mosadot.xlsx, keep category *cat*, dump to *name*.

    Returns the rows of the resulting resource.
    """
    return DF.Flow(
        DF.load('mosadot.xlsx'),
        # Map the Hebrew spreadsheet columns to canonical field names.
        DF.concatenate(
            dict(
                municipality=['מועצה אזורית'],
                town=['שם יישוב'],
                name=['שם המוסד'],
                kind=['סוג המוסד'],
                address=['כתובת'],
                status=['סטטוס'],
                target_audience=['קהל יעד'],
                area=['שטח'],
                lat=['Y'],
                lon=['X'],
            )),
        fixer,
        category(),
        # Keep only the requested category.
        DF.filter_rows(lambda r: r['category'] == cat),
        geo(),
        # DF.join_with_self('concat', ['kind'], dict(kind=None)),
        DF.update_resource(-1, name=name, path=name + '.csv'),
        DF.dump_to_path(name),
    ).results()[0][0]
import dataflows as DF
import glob

# Find organizations with no logo image on disk: list local *.png files
# (stripping the extension to get the id), join against the orgs table,
# and print the orgs for which no matching file exists.
x = DF.Flow(
    ({'filename': x[:-4]} for x in glob.glob('*png')),
    DF.update_resource(-1, name='files'),
    DF.load('http://api.yodaat.org/data/orgs_in_es/data/orgs.csv', name='orgs'),
    # NOTE(review): '(unknown)' looks like a placeholder for the source key
    # spec — confirm the intended join key for the 'files' resource.
    DF.join(
        'files', '(unknown)',
        'orgs', '{entity_id}',
        {
            'filename': {},
        },
        full=True,
        source_delete=True
    ),
    # Keep orgs that did not match any local file.
    DF.filter_rows(lambda row: row['filename'] is None),
    DF.select_fields(['org_name', 'entity_id']),
    DF.printer()
).process()
for k in LANGS if row.get(k) ) values = list(set(values)) return values translations = {} for source, gid in sources.items(): url = URL.format(gid) translations[source] = DF.Flow( DF.load(url), clean_row, DF.add_field('values', 'array', default=extract_values), DF.filter_rows(lambda row: row['hebrew']), DF.select_fields(list(LANGS) + ['values']) ).results()[0][0] tx = {} complained = set() for row in translations[source]: v = row.get('values') if not v: continue for vv in v: vv = clean(vv) if tx.get(vv) not in (None, row): if vv not in complained: complained.add(vv) tx[vv] = row if len(complained) > 0:
def base_flow():
    """Assemble the publications list from Google-Drive spreadsheets + Zotero.

    First pass: list xlsx files on Drive, download them and discover their
    sheets.  Second pass: load every sheet, merge with the Zotero dump, and
    normalize/translate the combined fields.
    """
    sources, *_ = Flow(
        list_gdrive(),
        # Keep only real xlsx spreadsheet files.
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        # Drop rows with a missing or placeholder migdar id.
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        # Unify the many source spellings of each column into one schema.
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title', ],
                'bib_title': [],
                'bib_related_parts': [],
                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],
            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        set_type('title', **{'es:title': True}),
        set_type('authors', **{'es:boost': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:boost': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'),
        verify_migdar_id(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title', 'with': PAGE_TITLE_PATTERN},
        ]),
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )
r['rank'] = i + 1 yield r return func def sort_limit_scores(): def func(row): row['scores'] = sorted(row.get('scores', []), key=lambda r: r['date'])[-30:] return func if __name__ == '__main__': r, _, _ = DF.Flow( DF.load(all_data(), name='cities', headers=1, override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.filter_rows(lambda r: r['is_city']), DF.add_field('score_date', 'object', lambda r: dict( date=r['date'].isoformat(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted'])) ), DF.concatenate(dict( id=[], city_name=[], score_date=[] ), target=dict(name='ranking')), DF.join_with_self('ranking', '{city_name}', dict( id=None, city_name=None, scores=dict(name='score_date', aggregate='array') )), sort_limit_scores(), DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200), DF.add_field('sortkey', 'integer', lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']), DF.sort_rows('{sortkey}', reverse=True), DF.delete_fields(['sortkey']), DF.add_field('rank', 'integer', 0),
weeks[-1].append(dict(weekday=weeks[-1][-1]['weekday'] + 1)) assert all(len(x) == 7 for x in weeks) row['scores'] = weeks return func if __name__ == '__main__': r, _, _ = DF.Flow( DF.load(all_data(), name='cities', headers=1, override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.filter_rows(lambda r: r['is_city']), DF.add_field( 'score_date', 'object', lambda r: dict(weekday=r['date'].isoweekday() % 7, date=r['date'].toordinal(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted']))), DF.concatenate(dict(id=[], city_name=[], score_date=[]), target=dict(name='popup_data')), DF.join_with_self( 'popup_data', '{city_name}', dict(id=None, city_name=None, scores=dict(name='score_date', aggregate='array'))), sort_limit_scores(), DF.filter_rows(lambda r: r['scores'] is not None),
return func if __name__ == '__main__': city_fill_color_cases = ['case'] city_fill_pattern_cases = ['case'] neighborhood_fill_color_cases = ['case'] neighborhood_fill_pattern_cases = ['case'] r, _, _ = DF.Flow( DF.load(latest_file(), name='cities', override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.filter_rows(lambda r: r['is_city']), DF.load(latest_file(), name='out', override_fields=dict(area_id=dict(type='string')), cast_strategy=DF.load.CAST_WITH_SCHEMA), DF.add_field('city_area_id', 'string', lambda r: r['area_id'].split('-')[0]), DF.join('cities', ['city_area_id'], 'out', ['city_area_id'], dict(num_city_reports=dict(name='num_reports_weighted'))), DF.add_field('desc', 'string', ''), DF.add_field('kind', 'string', ''), DF.add_field('property', 'string', ''), props(), DF.join_with_self( 'out', ['is_city', 'kind', 'desc', 'property'], dict(is_city=None,
load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}), {% endif %} {% if input == 'remote' %} load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}), {% endif %} {% if input == 'sql' %} load('{{input_url}}', table='{{input_db_table}}'), {% endif %} {% if input == 'other' %} {% endif %} # Process them (if necessary) {% if 'sort' in processing %} sort_rows('{field_name}'), # Key is a Python format string or a list of field names {% endif %} {% if 'filter' in processing %} filter_rows(), {% endif %} {% if 'find_replace' in processing %} find_replace([ dict(name='field_name', patterns=[ dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'), ]) ]), {% endif %} {% if 'delete_fields' in processing %} delete_fields(['field_name']), # Pass a list of field names to delete from the data {% endif %} {% if 'set_type' in processing %} set_type('field_name', type='number', constraints=dict(minimum=3)), # There are quite a few options you can use here # Take a look at https://frictionlessdata.io/specs/table-schema/
def process_demographics(stack):
    """Add per-neighborhood demographic score cards to *stack*.

    Results are cached under 'stack:demographics'; on a cache miss the
    cards are rebuilt from the demographics/*.csv exports.
    """
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            # Record which loaded file each row came from.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # Map (source report, raw bucket names) -> canonical card kind.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)
                 ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')
                 ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)
                 ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')
                 ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                 ): 'adults',
            }
            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                    yield row
            return f

        # stat-area id -> neighborhood title.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )
        # card kind -> (title, subtitle, display order).
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )
        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Turn each remaining column into (kind, value) rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind',
                type='string'
            )], dict(
                name='value',
                type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[],
                value=[],
                kind=[],
                stat_id=[]
            )),
            DF.add_field('neighborhood', 'string',
                         lambda r: s2n.get(int(r['stat_id']))),
            # Drop stat areas that don't map to a known neighborhood.
            DF.filter_rows(lambda r: r['neighborhood']),
            # Aggregate values per (neighborhood, kind).
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'],
                              dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, used to normalize geometry_score to 0-6.
            DF.join_with_self('maxes', ['kind'],
                              dict(kind=None,
                                   max_value=dict(name='score_value',
                                                  aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'],
                    dict(max_value=None)),
            DF.add_field('geometry_score', 'number',
                         lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(
                             r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # Collapse to one row per kind, carrying all neighborhood scores.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'],
    #          properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)
    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
def flow(parameters, *_):
    """Fetch gov.il tender publications and normalize them.

    ``parameters`` supplies the tender_type, an optional 'filter-out'
    phrase, and the target resource descriptor (consumed via pop()).
    """
    def take_first(field):
        # Some source fields arrive as single-element lists; unwrap them.
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        # Truncate a datetime field to its date component.
        def f(row):
            if row.get(field):
                row[field] = row[field].date()
        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        # Reject rows whose title/description contain the configured phrase.
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        # Map the API's field names onto the canonical schema.
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string',
                  default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )