def clean_data(filename: str, location: str) -> None:
    """Clean and validate data with `dataflows`, creating data packages in the process, one for each file."""
    global FILE_NAME
    FILE_NAME = f"{location}-{filename}"
    clean_directory, _, processing_directory = set_location_dirs(location)
    exported_file = f"{clean_directory}/{filename}"
    _ = Flow(
        load(
            f"{processing_directory}/{filename}.csv",
            name=FILE_NAME,
        ),
        change_path,
        add_field("NameFIPS", "string"),
        concat_name_columns,
        delete_fields(["Name", "FIPS"]),
        set_type("Data", type="any"),
        validate(),
        dump_to_path(exported_file),
    ).process()[1]
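# For reference, a minimal self-contained sketch of the same
# load -> transform -> validate -> dump shape, with inline data standing in for
# the project-specific helpers (set_location_dirs, change_path and
# concat_name_columns live elsewhere in this project; the stand-in below is
# hypothetical):
from dataflows import Flow, add_field, delete_fields, dump_to_path, validate


def concat_name_fips(row):
    # Hypothetical stand-in for concat_name_columns.
    row["NameFIPS"] = f"{row['Name']} ({row['FIPS']})"


stats = Flow(
    [{"Name": "Autauga", "FIPS": "01001", "Data": 42}],
    add_field("NameFIPS", "string"),
    concat_name_fips,
    delete_fields(["Name", "FIPS"]),
    validate(),
    dump_to_path("out/demo"),
).process()[1]
print(stats)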
def flow(*_):
    return Flow(
        filter_by_type,
        rename_fields,
        add_field('stakeholder_type', 'string'),
        add_fields(FIELDS + OPTIONAL_FIELDS, 'string'),
        add_fields(TABLE_FIELDS + OPTIONAL_TABLE_FIELDS, 'string'),
        add_fields(CORE_STAKE_HOLDER_FIELDS.values(), 'string'),
        validate,
        parse_document,
        delete_fields([
            'document', 'pdf', 'other', 'num_files',
            'parser_version', 'source', 's3_object_name'
        ]),
        fix_fields,
        set_type('CapitalPct', type='number'),
        set_type('VotePower', type='number'),
        set_type('CapitalPct_Dilul', type='number'),
        set_type('VotePower_Dilul', type='number'),
    )
def lang_flow(lang, prefix):
    tags = [dict(doc_id=list(k)) for k in sorted(set(
        (prefix, x['hebrew'], x[lang])
        for x in translations['tags'].values()
    ))]

    def add_url(prefix_):
        def func(rows):
            for row in rows:
                if 'url' not in row:
                    yield row
                elif row.get('doc_id'):
                    row['url'] = 'https://yodaat.org/{}item/{}'.format(prefix_, row['doc_id'])
                    yield row
                else:
                    print('MMMMMMMM MISSING DOC ID', row)
        return DF.Flow(
            DF.add_field('url', 'string', resources=-1),
            func,
        )

    return DF.Flow(
        *[
            DF.Flow(
                DF.load('https://api.yodaat.org/data/{}_in_es/data/{}.csv'.format(x, y),
                        name='{}-{}'.format(x, lang)),
                add_url(prefix)
            )
            for x, y in [
                ('publications', 'publications'),
                ('orgs', 'orgs'),
                ('datasets', 'out')
            ]
        ],
        tags,
        DF.add_field('url', 'string',
                     lambda row: 'https://yodaat.org/{}search?tag={}&itag={}&kind=all&filters={{}}&sortOrder=-year'.format(*row.get('doc_id')),
                     resources=-1),
        DF.update_resource(-1, name='tags-{}'.format(lang)),
    )
def flow(*_):
    return DF.Flow(
        DF.load('/var/datapackages/activities/social_services/historic_data/datapackage.json'),
        DF.concatenate(
            dict(
                kind=[], kind_he=[],
                activity_name=[], activity_description=[],
                publisher_name=[], history=[], max_year=[], min_year=[]
            ),
            dict(
                name='activities',
                path='activities.csv'
            )
        ),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description', **{'es:itemType': 'string', 'es:boost': True}),
        DF.set_type('kind', **{'es:keyword': True, 'es:exclude': True}),
        DF.set_type('kind_he', **{'es:keyword': True, 'es:exclude': True}),
        DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type('history', **{
            'es:itemType': 'object',
            'es:schema': dict(
                fields=[
                    dict(name='year', type='integer'),
                    dict(name='unit', type='string'),
                    dict(name='subunit', type='string'),
                    dict(name='subsubunit', type='string'),
                    dict(name='allocated_budget', type='integer'),
                    dict(name='num_beneficiaries', type='string', **{'es:index': False}),
                ]
            )
        }),
        DF.add_field('score', 'number',
                     lambda x: (x['history'][0]['allocated_budget'] or 1000) / 1000,
                     **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(
            activities={'resource-name': 'activities'}
        ))
    )
def flow(*_):
    return Flow(
        add_field('date', 'date'),
        add_field('source', 'string'),
        add_field('s3_object_name', 'string'),
        add_field('url', 'string'),
        add_field('pdf', 'array'),
        add_field('other', 'array'),
        add_field('num_files', 'number'),
        set_primary_key(['s3_object_name']),
        scrape_maya_notification_list(),
        set_primary_key(['url']),
        update_resource(
            -1,
            name='maya_notification_list',
            path="data/maya_notification_list.csv",
            **{PROP_STREAMING: True},
        ),
    )
def flow(*_):
    engine = create_engine(os.environ['DPP_DB_ENGINE'])
    result = engine.execute(query)
    data = (dict(r) for r in result)
    codes = dict((i['code'], i) for i in data)
    logging.info('GOT %d CODES', len(codes))
    return DF.Flow(
        DF.add_field(
            'resolved_budget_codes', 'array',
            default=process_row(codes),
            **{
                'es:itemType': 'object',
                'es:schema': dict(fields=[
                    dict(name='code', type='string', **{'es:keyword': True}),
                    dict(name='year', type='integer'),
                    dict(name='title', type='string'),
                    dict(name='doc_id', type='string', **{'es:index': False}),
                ])
            }),
    )
def flow(*_):
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
def fix_suppliers():
    geo = fetch_codelist('geo_region')

    def func(row):
        kinds = set()
        suppliers = row.get('suppliers') or []
        eids = set()
        eids_association = set()
        eids_company = set()
        eids_municipality = set()
        geos = set()
        for v in suppliers:
            for f in ['entity_id', 'entity_name']:
                if v.get(f):
                    v[f] = v[f].replace('<em>', '').replace('</em>', '')
            v['geo'] = [geo[i] for i in v.get('geo', [])]
            geos.update(v['geo'])
            for f in ('year_activity_start', 'year_activity_end'):
                if f in v and not v[f]:
                    del v[f]
            start_year = v.get('year_activity_start') or 2020
            end_year = v.get('year_activity_end') or CURRENT_YEAR
            v['activity_years'] = list(range(start_year, end_year + 1))
            eid = v['entity_id']
            eids.add(eid)
            ekind = v['entity_kind']
            if ekind == 'company':
                kinds.add('עסקי')
                eids_company.add(eid)
            elif ekind in ('association', 'ottoman-association', 'cooperative'):
                kinds.add('מגזר שלישי')
                eids_association.add(eid)
            elif ekind == 'municipality':
                kinds.add('רשויות מקומיות')
                eids_municipality.add(eid)
            else:
                kinds.add('אחר')
        row['supplier_count'] = len(eids)
        row['supplier_count_company'] = len(eids_company)
        row['supplier_count_association'] = len(eids_association)
        row['supplier_count_municipality'] = len(eids_municipality)
        row['geo_coverage'] = 'ארצי' if 'ארצי' in geos else 'אזורי'
        if len(kinds) == 0:
            row['supplier_kinds'] = None
        elif len(kinds) == 1:
            row['supplier_kinds'] = kinds.pop()
        else:
            row['supplier_kinds'] = 'משולב'
        if len(suppliers) == 0:
            row['supplier_count_category'] = None
        elif len(suppliers) == 1:
            row['supplier_count_category'] = '1'
        elif 2 <= len(suppliers) <= 5:
            row['supplier_count_category'] = '2-5'
        else:
            row['supplier_count_category'] = '6+'

    return DF.Flow(
        DF.add_field('supplier_count_category', 'string'),
        DF.add_field('supplier_kinds', 'string'),
        DF.add_field('supplier_count', 'integer'),
        DF.add_field('supplier_count_company', 'integer'),
        DF.add_field('supplier_count_association', 'integer'),
        DF.add_field('supplier_count_municipality', 'integer'),
        DF.add_field('geo_coverage', 'string'),
        func,
    )
def flow():
    # Note: the original mapping had 'DESC_CAPITULO' listed twice (shadowing the
    # level-2 label) and mapped ID_CAPITULO to a label column type; both fixed
    # so that CN['DESC_CONCEPTO'] below resolves.
    CT = dict([
        ('ID_CAPITULO', 'economic-classification:generic:level1:code'),
        ('DESC_CAPITULO', 'economic-classification:generic:level1:label'),
        ('ID_CONCEPTO', 'economic-classification:generic:level2:code'),
        ('DESC_CONCEPTO', 'economic-classification:generic:level2:label'),
        ('ID_PARTIDA_GENERICA', 'economic-classification:generic:level3:code'),
        ('DESC_PARTIDA_GENERICA', 'economic-classification:generic:level3:label'),
        ('ID_PARTIDA_ESPECIFICA', 'economic-classification:generic:level4:code'),
        ('DESC_PARTIDA_ESPECIFICA', 'economic-classification:generic:level4:label'),
    ])
    CN = dict(
        (k, v.replace(':', '-'))
        for k, v in CT.items()
    )
    # All columns written by process() below must exist in the schema.
    new_columns = [
        'ID_CAPITULO', 'DESC_CAPITULO', 'DESC_CONCEPTO',
        'ID_PARTIDA_GENERICA', 'DESC_PARTIDA_GENERICA',
        'ID_PARTIDA_ESPECIFICA', 'DESC_PARTIDA_ESPECIFICA'
    ]

    steps = []
    steps.extend(
        add_field(CN[title], 'string', title=title, columnType=CT[title])
        for title in new_columns
        if True  # TODO
    )

    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__), 'objeto_del_gasto.datapackage.zip')
    )
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])
        # Skip the latest years of the dataset (2019 onwards): their columns are already split.
        if year < 2019:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto:
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(row[CN['ID_CONCEPTO']])
            nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3
            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]
                row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(
                    row.get(CN['ID_PARTIDA_GENERICA']))
            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(row.get(CN['ID_PARTIDA_ESPECIFICA']))

    steps.append(process)
    return Flow(*steps)
def download_item_pages(rows):
    session = HTMLSession()
    os.makedirs('data/musportal-item-pages/files', exist_ok=True)
    for rownum, row in enumerate(rows):
        filename = 'data/musportal-item-pages/files/rownum_{}.html'.format(rownum)
        if os.path.exists(filename):
            print('file exists: {}'.format(filename))
            row['downloaded_status_code'] = None
            row['downloaded_html_length'] = None
            row['downloaded_file_name'] = filename
        else:
            status_code, html_content = download_item_page(session, row['item_url'])
            with open(filename, 'w') as f:
                f.write(html_content)
            print('saved file: {}'.format(filename))
            row['downloaded_status_code'] = status_code
            row['downloaded_html_length'] = len(html_content)
            row['downloaded_file_name'] = filename
        yield row


print(Flow(
    load('musportal/.checkpoints/all_page_items/datapackage.json'),
    add_field('downloaded_status_code', 'integer'),
    add_field('downloaded_html_length', 'integer'),
    add_field('downloaded_file_name', 'string'),
    download_item_pages,
    printer(),
    dump_to_path('data/musportal-item-pages'),
).process()[1])
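# The generator above illustrates the core dataflows row-processor contract:
# a step may be any callable that takes the row iterator and yields rows.
# A minimal sketch with inline (hypothetical) data:
from dataflows import Flow, add_field, printer


def tag_rows(rows):
    for rownum, row in enumerate(rows):
        row['rownum'] = rownum
        yield row


Flow(
    ({'item_url': url} for url in ['a', 'b']),
    add_field('rownum', 'integer'),  # declare the field so it appears in the schema
    tag_rows,
    printer(),
).process()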
def process_institutions(stack):
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }

            def func(row):
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f)
                for f in glob.glob('institutions/*xlsx')
            ],
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[],
                Y=[]
            )),
            translate_kind(),
            proj(),
            DF.add_field('feature', 'object', lambda r: geojson.Feature(
                properties=dict(title=r['title'], address=r['address']),
                geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
            )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object',
                         lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', ' '),
            DF.delete_fields(['features']),
            # DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)

    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    current_cards = dict(
        (c['title'], c)
        for c in stack['cards']
    )
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    stack['cards'] = [
        c for c in stack['cards']
        if c['title'] in current_cards
    ] + institutions_cards
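# A minimal, self-contained sketch (hypothetical inline data) of the
# join_with_self + aggregate='array' pattern used above to roll rows up
# into per-key lists:
from dataflows import Flow, join_with_self, update_resource

results, dp, _ = Flow(
    [
        {'kind': 'kids', 'neighborhood': 'A', 'value': 10},
        {'kind': 'kids', 'neighborhood': 'B', 'value': 7},
        {'kind': 'elderly', 'neighborhood': 'A', 'value': 3},
    ],
    update_resource(-1, name='stats'),
    join_with_self('stats', ['kind'], dict(
        kind=None,                                     # keep the join key as-is
        values=dict(name='value', aggregate='array'),  # collect values per kind
    )),
).results()
# Expect rows like {'kind': 'kids', 'values': [10, 7]}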
def flow(self):
    steps = []
    if not self.config.get(CONFIG_PUBLISH_ALLOWED):
        return None
    logger.info('Publisher Flow Preparing')
    if self.output_datapackage:
        logger.info('Publisher Flow: Dump To Path Denorm...')
        steps.extend([
            dump_to_path(self.output_datapackage)
        ])
    if self.output_db:
        db_table = 'dgp__{}_{}'.format(
            self.config.get(CONFIG_TAXONOMY_ID),
            self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
        )
        logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
        primary_key = self.config.get(CONFIG_PRIMARY_KEY)
        mapping = self.config.get(CONFIG_MODEL_MAPPING)
        for m in mapping:
            if 'columnType' in m and m['columnType']:
                m['slug'] = self.slugify(m['title'])
                m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                m['column'] = self.column(m['columnType'])
                m['primaryKey'] = m['columnType'] in primary_key
                m['measure'] = m['hierarchy'] == 'value'
                m['full_column'] = (
                    m['column'] if m['measure']
                    else '{}_{hierarchy}.{column}'.format(db_table, **m)
                )
                m['label'] = self.fetch_label(m['columnType'])
                m['dataType'] = self.fetch_datatype(m['columnType'])
        prefixes = set(
            m['hierarchy']
            for m in mapping
            if m.get('measure') is False
        )
        prefixed = dict(
            (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
            for p in prefixes
        )
        groups = [
            NormGroup(
                [m['column'] for m in prefixed_items],
                self.ref_column(prefix),
                self.id_column(),
                db_table='{}_{}'.format(db_table, prefix))
            for prefix, prefixed_items in prefixed.items()
        ]
        babbage_model = dict(
            dimensions=dict(
                (m['slug'], dict(
                    label=m['title'],
                    key_attribute=m['slug'],
                    attributes=dict([
                        (m['slug'], dict(
                            column=m['full_column'],
                            label=m['title'],
                            type=m['dataType'],
                        ))
                    ] + ([
                        (m['label']['slug'], dict(
                            column=m['label']['full_column'],
                            label=m['label']['title'],
                            type=m['label']['dataType'],
                        ))
                    ] if m.get('label') else [])),
                    join_column=[
                        self.ref_column(m['hierarchy']),
                        self.id_column()
                    ],
                    **(dict(
                        label_attribute=m['label']['slug']
                    ) if m.get('label') else {})
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is False and m.get('primaryKey') is True
            ),
            fact_table=db_table,
            measures=dict(
                (m['slug'], dict(
                    column=m['column'],
                    label=m['title'],
                    type='number'
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is True
            ),
            hierarchies=dict(
                (prefix, dict(
                    label=prefix,
                    levels=[
                        m['slug']
                        for m in prefixed_items
                        if m.get('primaryKey') is True
                    ]
                ))
                for prefix, prefixed_items in prefixed.items()
            ),
        )
        steps.append(
            update_package(babbage_model=babbage_model)
        )
        source = self.config.get(CONFIG_URL)
        logger.info('Publisher Flow: _source Handling...')
        steps.extend([
            add_field('_source', 'string', source),
            append_to_primary_key(['_source']),
            clear_by_source(self.lazy_engine(), db_table, source),
        ])
        logger.info('Publisher Flow: Normalize...')
        steps.extend([
            normalize_to_db(
                groups,
                db_table,
                RESOURCE_NAME,
                self.output_db,
                'append'
            ),
        ])
        if self.output_datapackage:
            logger.info('Publisher Flow: Dump To Path Norm...')
            steps.extend([
                dump_to_path(self.output_datapackage + '-norm')
            ])
    if self.output_es:
        logger.info('Publisher Flow: ES...')
        steps.extend([
            self.update_es()
        ])
    logger.info('Publisher Flow Prepared')
    return Flow(*steps)
URL_PATTERN = 'https://docs.google.com/spreadsheets/d/{id}/edit#gid={gid}'

# %%
loads = []
i = 0
for source in SOURCES:
    for sheet in source['sheets']:
        i += 1
        resource_name = 'res_{}'.format(i)
        url = source['filename']
        loads.append((resource_name, DF.Flow(
            DF.load(url, name=resource_name, **sheet.get('options', {})),
            DF.add_field('year', 'integer', source['year']),
            DF.add_field('publisher_name', 'string', sheet['office']),
        )))

FIELD_MAPPING = dict(
    year=[],
    publisher_name=[],
    unit=[
        'מינהל/ חטיבה', 'מינהל/ אגף', 'שם מינהל האגף', 'מנהל / אגף', 'מינהל'
    ],
    subunit=['אגף', 'אגף/ מחלקה', 'שם האגף / מחלקה', 'אגף / מחלקה'],
    subsubunit=['מחלקה'],
    activity_name=[
        'שם השירות', 'שם השירות החברתי', 'שם השירות חברתי', 'שירות חברתי'
    ],
    return func


if __name__ == '__main__':
    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field(
            'score_date', 'object',
            lambda r: dict(weekday=r['date'].isoweekday() % 7,
                           date=r['date'].toordinal(),
                           sr=float(r['symptoms_ratio_weighted'] or 0),
                           nr=int(r['num_reports_weighted']))),
        DF.concatenate(dict(id=[], city_name=[], score_date=[]),
                       target=dict(name='popup_data')),
        DF.join_with_self(
            'popup_data', '{city_name}',
            dict(id=None, city_name=None,
                 scores=dict(name='score_date', aggregate='array'))),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'] is not None),
        DF.add_field('nr', 'integer', lambda r: r['scores'][-1]['nr']),
        DF.add_field('sr', 'number', lambda r: r['scores'][-1]['sr']),
        split_to_weeks(),
        DF.add_field('translations', 'object',
                    type=str,
                    help="Source ID to filter the CKAN API")
args = parser.parse_args()

config.SOURCE_NAME = args.name  # human-friendly name of the source
config.CKAN_CATALOG_URL = args.catalog_url
config.SOURCE_ID = args.harvest_source_id

res = Flow(
    # Add the other resource to this process: the packages list from data.gov.
    get_current_ckan_resources_from_api(harvest_source_id=config.SOURCE_ID),
    update_resource('res_1', name='ckan_results'),
    # New field on this copy to hold the comparison results.
    add_field(name='comparison_results', type='object', resources='ckan_results'),
    # Compare both resources.
    # In data.json each dataset has an identifier field: "identifier": "USDA-ERS-00071".
    # In the CKAN API results the same identifier appears in the "extras" list:
    #   {"key": "identifier", "value": "USDA-ERS-00071"}
    compare_resources,
).results()

# Save the comparison results.
dmp = json.dumps(res[0][0], indent=2)
with open(config.get_flow2_datasets_result_path(), 'w') as f:
    f.write(dmp)
def flow(*_):
    return DF.Flow(
        all_units(),
        DF.add_field('office', 'string',
                     lambda r: r['path'][0] if len(r['path']) > 0 else None,
                     **{'es:keyword': True}),
        DF.add_field('unit', 'string',
                     lambda r: r['path'][1] if len(r['path']) > 1 else None,
                     **{'es:keyword': True}),
        DF.add_field('subunit', 'string',
                     lambda r: r['path'][2] if len(r['path']) > 2 else None,
                     **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string',
                     lambda r: r['path'][3] if len(r['path']) > 3 else None,
                     **{'es:keyword': True}),
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string',
                     lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),
        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
def objeto_del_gasto(config):
    logging.info('PREPARING objeto_del_gasto processing')
    CT = COLUMN_MAPPING
    CN = dict((k, v.replace(':', '-')) for k, v in CT.items())

    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__), 'objeto_del_gasto.datapackage.zip'))
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])
        # Skip the latest years of the dataset (2019 onwards): their columns are already split.
        if year < 2019:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto and objeto != '-':
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(
                    row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(
                    row[CN['ID_CONCEPTO']])
            nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3
            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]
                row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(
                    row.get(CN['ID_PARTIDA_GENERICA']))
            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(row.get(CN['ID_PARTIDA_ESPECIFICA']))

    def missing_field(mf):
        def func(dp):
            return all(f.name != mf for f in dp.resources[0].schema.fields)
        return func

    def sort_by_ct():
        def func(package):
            ct_indexes = dict(
                (ct['name'], i)
                for i, ct in enumerate(config.get(CONFIG_TAXONOMY_CT)))
            fields = sorted(
                ((ct_indexes.get(f.get('columnType'), 1000), f)
                 for f in package.pkg.descriptor['resources'][0]['schema']['fields']),
                key=lambda x: x[0])
            package.pkg.descriptor['resources'][0]['schema']['fields'] = [
                f[1] for f in fields
            ]
            yield package.pkg
            yield from package
        return func

    return Flow(
        *[
            conditional(
                missing_field(CN[f]),
                Flow(
                    add_field(CN[f], 'string', columnType=ct, title=f),
                    append_to_primary_key(CN[f]) if 'ID_' in f else None
                )
            )
            for f, ct in CT.items()
        ],
        sort_by_ct(),
        process)
def postflow(self):
    metadata = self.config._unflatten().get('extra', {}).get('metadata', {})
    return Flow(add_field('metadata', 'object', metadata))
def process_stack_demand(stack):

    def collect_cats():
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        cat = v[7:]
                    elif v.startswith('--- '):
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string',
                         lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number',
                         lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string',
                         lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
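# For reference, a minimal self-contained version of the unpivot step used
# above (inline data and column names are hypothetical): wide columns are
# melted into (key, value) rows.
from dataflows import Flow, unpivot

results, _, _ = Flow(
    [{'category': 'A', '2019': 10, '2020': 12}],
    unpivot(
        [dict(name='(20\\d\\d)', keys=dict(year='\\1'))],  # regex over field names
        [dict(name='year', type='string')],                # extracted key fields
        dict(name='value', type='integer'),                # the melted value field
    ),
).results()
# Expect rows like {'category': 'A', 'year': '2019', 'value': 10}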
def _convert_type(cls, schema_type, field, prefix):
    prop = super()._convert_type(schema_type, field, prefix)
    if field.get('es:keyword'):
        prop['type'] = 'keyword'
    elif schema_type in ('number', 'integer'):
        prop['index'] = True
    return prop


if __name__ == '__main__':
    DF.Flow(
        DF.load('new-york-city-current-job-postings.zip',
                filename='nyc-jobs.csv', name='jobs'),
        DF.add_field('doc_id', 'string',
                     default=lambda row: 'job/{Job ID}'.format(**row)),
        DF.add_field('score', 'integer', default=1),
        DF.set_type('Salary Frequency', **{'es:keyword': True}),
        DF.set_primary_key(['doc_id']),
        dump_to_es(indexes={'jobs-job': [{
            'resource-name': 'jobs',
        }]}, mapper_cls=SampleMappingGenerator),
        DF.dump_to_path('data'),
        DF.add_field('value', 'object',
                     default=lambda row: dict((k, v) for k, v in row.items()
                                              if k not in ('doc_id', 'score')),
                     **{'es:index': False}),
        DF.select_fields(['doc_id', 'value']),
        dump_to_es(indexes={'jobs-document': [{
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string', lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                )
            ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def test_add_field():
    from dataflows import Flow, add_field
    f = Flow(
        (dict(a=i) for i in range(3)),
        add_field('b', 'string', 'b'),
        add_field('c', 'number'),
        add_field('d', 'boolean', title='mybool'),
    )
    results, dp, _ = f.results()
    assert results == [[
        {'a': 0, 'b': 'b', 'c': None, 'd': None},
        {'a': 1, 'b': 'b', 'c': None, 'd': None},
        {'a': 2, 'b': 'b', 'c': None, 'd': None},
    ]]
    assert dp.descriptor == \
        {
            'profile': 'data-package',
            'resources': [
                {
                    'name': 'res_1',
                    'path': 'res_1.csv',
                    'profile': 'tabular-data-resource',
                    'schema': {
                        'fields': [
                            {'format': 'default', 'name': 'a', 'type': 'integer'},
                            {'format': 'default', 'name': 'b', 'type': 'string'},
                            {'format': 'default', 'name': 'c', 'type': 'number'},
                            {'format': 'default', 'name': 'd', 'title': 'mybool', 'type': 'boolean'}
                        ],
                        'missingValues': ['']
                    }
                }
            ]
        }
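# The default argument of add_field also accepts a callable receiving the row,
# which is how the flows in this file derive new fields from existing columns.
# Minimal sketch:
from dataflows import Flow, add_field

results, _, _ = Flow(
    (dict(a=i) for i in range(3)),
    add_field('double', 'integer', lambda row: row['a'] * 2),
).results()
# Expect: [[{'a': 0, 'double': 0}, {'a': 1, 'double': 2}, {'a': 2, 'double': 4}]]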
def flow(parameters, *_):

    def take_first(field):
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]
        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        def f(row):
            if row.get(field):
                row[field] = row[field].date()
        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True
        return func

    return Flow(
        fetcher(parameters),
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ), resources=-1),
        add_field('tender_type', 'string',
                  default=parameters['tender_type'], resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url', 'string',
                  default=lambda row: 'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )
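# take_first and datetime_to_date above use a reusable composition pattern:
# a helper returns a Flow of several steps, and that sub-Flow nests
# transparently inside the parent Flow. A minimal sketch (hypothetical field):
from dataflows import Flow, set_type


def uppercase(field):
    def f(row):
        if row.get(field):
            row[field] = row[field].upper()
    return Flow(
        f,
        set_type(field, type='string'),
    )


results, _, _ = Flow(
    [{'name': 'alice'}],
    uppercase('name'),
).results()
# Expect: [[{'name': 'ALICE'}]]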
            'video_url': '',
            'main_image_url': '',
            'preview_image_url': '',
            'image_urls': [],
            'item_url_he': '',
            'item_url_en': row['URL']
        }
    else:
        yield from rows


if __name__ == '__main__':
    run_dump_print(
        Flow(
            load('data/parse_cached_apis/datapackage.json'),
            add_field('item_url_he', 'string'),
            add_field('item_url_en', 'string'),
            add_ethiopia_familynames,
            dump_cached_api('czeck-cached-api',
                            'bhjs-content/places/czech/cached-api.php'),
            dump_cached_api('ethiopia-cached-api',
                            'bhjs-content/places/ethiopia/cached-api.php'),
        ),
        'data/dump_cached_apis',
        fields=['UnitType', 'UnitTypeDesc', 'header_en', 'header_he'],
        num_rows=1,
        resources=['ethiopia-cached-api'])
    print(
        'Saved updated cached-api.php files in bhjs-content/places/*/cached-api.php'
    )
def flow(*_):
    return DF.Flow(
        DF.load(URL, format='json', property='jData', name='education'),
        # DF.checkpoint('education'),
        DF.concatenate(dict(
            page_title=['Title'],
            start_date=['PobKKPublishingDate'],
            claim_date=['PobLastDate'],
            target_audience_x=['PobBudgetEntitties'],
            description=['PobTaktzir'],
            email=['PobPedagogyContactHtml'],
            publishing_unit_x=['PobYechida'],
            budget_code_x=['PobTakanaTaktzivitString'],
            att_title=['PobCreteriaLink_description'],
            att_url=['PobCreteriaLink_url'],
        ), resources=-1, target=dict(name='education')),
        DF.add_field('page_url', 'string', PAGE_URL, resources=-1),
        DF.add_field('publisher', 'string', 'משרד החינוך', resources=-1),
        DF.add_field('tender_type', 'string', 'call_for_bids', resources=-1),
        DF.add_field('tender_type_he', 'string', 'קול קורא', resources=-1),
        DF.add_field('publication_id', 'integer', 0, resources=-1),
        DF.add_field('tender_id', 'string', '0', resources=-1),
        DF.add_field('contact', 'string',
                     lambda row: extract_hebrew(row, 'email'), resources=-1),
        DF.add_field('target_audience', 'string',
                     lambda row: extract_hebrew(row, 'target_audience_x'), resources=-1),
        DF.add_field('contact_email', 'string',
                     lambda row: extract_email(row, 'email'), resources=-1),
        DF.add_field('publishing_unit', 'string',
                     lambda row: row['publishing_unit_x'][0]['PobYechida'], resources=-1),
        DF.add_field('budget_code', 'string',
                     lambda row: extract_budget_code(row, 'budget_code_x'), resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y %H:%M:%S'),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y %H:%M:%S'),
        DF.add_field('documents', 'array',
                     lambda row: [
                         dict(description=row['att_title'],
                              link=row['att_url'],
                              update_time=str(row['start_date']))
                     ], resources=-1),
        DF.delete_fields([
            'email', 'publishing_unit_x', 'budget_code_x',
            'att_title', 'att_url', 'target_audience_x'
        ], resources=-1),
        calculate_publication_id(6),
        DF.update_resource(-1, **{'dpp:streaming': True}))
def process_demographics(stack):
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        def add_source():
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                 ("סה''כ עולים",)): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('0-5', '6-12')): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('13-17',)): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('60-64', '65-69', '70-74', '75-120')): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                 ('18-21', '22-24', '25-29', '30-34', '35-39',
                  '40-44', '45-49', '50-54', '55-59')): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )
        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            DF.unpivot(
                [dict(
                    name="([-'א-ת0-9 ].+)",
                    keys=dict(kind=r'\1')
                )],
                [dict(name='kind', type='string')],
                dict(name='value', type='number')
            ),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string',
                         lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'],
                              dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']),  # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            DF.join_with_self('maxes', ['kind'],
                              dict(kind=None,
                                   max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'],
                    dict(max_value=None)),
            DF.add_field('geometry_score', 'number',
                         lambda r: 6 * r['score_value'] / r['max_value']),
            DF.add_field('score_display', 'string',
                         lambda r: '{:,} ({:.0f}%)'.format(
                             r['value'], 100 * r['score_value'] / r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'],
    #          properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry = dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
def flow(*_):
    return Flow(
        update_resource(
            -1,
            name='maya_tase_companies_current_management',
            path="data/maya_tase_companies_current_management.csv",
        ),
        add_field('CompanyLongName', 'string'),
        add_field('CorporateNo', 'string'),
        add_field('Site', 'string'),
        add_field('CapitalPercent', 'string'),
        add_field('EndBalance', 'string'),
        add_field('Id', 'string'),
        add_field('IsFinancialExpert', 'number'),
        add_field('IsInspectionComitee', 'number'),
        add_field('IsManager', 'boolean'),
        add_field('Name', 'string'),
        add_field('RoleType', 'string'),
        add_field('SecurityName', 'string'),
        add_field('VoteCapital', 'string'),
        process_companies)
    return func


def sort_limit_scores():
    def func(row):
        row['scores'] = sorted(row.get('scores', []), key=lambda r: r['date'])[-30:]
    return func


if __name__ == '__main__':
    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(),
            sr=float(r['symptoms_ratio_weighted'] or 0),
            nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[], city_name=[], score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None, city_name=None,
            scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer',
                     lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
        DF.add_field('translations', 'object',
                     lambda r: city_translations[r['city_name']]),
        DF.add_field('image', 'object',
                     lambda r: upload_static_image(r['id'], width=280*2, height=160*2)),
            row['doc_id'] += '.{}'.format(used[doc_id])
        yield row
        used[doc_id] += 1


cur_year = datetime.date.today().year

org_flow = DF.Flow(
    DF.load(ORGS_URL, name='orgs'),
    DF.concatenate(headers, resources='orgs', target=dict(name='orgs')),
    fix_urls(['org_website', 'org_facebook']),
    DF.add_field(
        'alt_names', 'array',
        default=lambda r: [
            r[x]
            for x in ['alt_name%d' % i for i in range(1, 6)] + ['org_name']
            if x in r and r[x]
        ]
    ),
    DF.add_field('compact_services', 'string',
                 lambda row: row.get('provided_services')),
    DF.delete_fields(['alt_name[1-5]']),
    *[
        split_and_translate(
            f, f, delimiter=',',
            keyword=f in ('org_kind', 'life_areas', 'languages', 'tags', 'compact_services')
        )
        for f in ('languages', 'life_areas', 'tags', 'regions', 'org_kind',
                  'specialties', 'provided_services', 'target_audiences', 'compact_services')
def add_gps_coordinates(stats, kv, parameters):
    logging.info('adding gps coordinates')

    def _add_gps_coordinates(rows):
        logging.info("resource name = " + rows.res.name)
        if rows.res.name == "db_data":
            source = "db"
        else:
            source = rows.res.name.split("__")[0]
        fields = parameters["source_fields"][source]
        workplace_fields = parameters.get("workplace_source_fields", {}).get(source)
        if workplace_fields and source != "db":
            raise Exception("sorry, workplace_fields is only supported for the db source")
        for row in rows:
            inputs = {}
            workplace_inputs = {}
            for k, v in row.items():
                input = fields.get(k.strip())
                if input and v and v.strip():
                    if input in inputs:
                        logging.warning("duplicate input %s, %s: %s" % (source, input, row))
                    elif source == "db":
                        inputs[input] = json.loads(v)
                    else:
                        inputs[input] = v
                if workplace_fields:
                    input = workplace_fields.get(k.strip())
                    if input and v and v.strip():
                        if input in workplace_inputs:
                            logging.warning("duplicate workplace_input %s, %s: %s" % (source, input, row))
                        elif source == "db":
                            workplace_inputs[input] = json.loads(v)
                        else:
                            workplace_inputs[input] = v
            lat, lng, accurate = get_coords(
                stats, kv, inputs,
                get_coords_callback=parameters.get("get-coords-callback"))
            if workplace_fields:
                workplace_lat, workplace_lng, workplace_accurate = get_coords(
                    stats, kv, workplace_inputs,
                    get_coords_callback=parameters.get("get-coords-callback"))
            yield {
                **row,
                "lat": str(lat),
                "lng": str(lng),
                **({"address_street_accurate": str(accurate)} if source == "db" else {}),
                **({
                    "workplace_lat": str(workplace_lat),
                    "workplace_lng": str(workplace_lng),
                    **({"workplace_street_accurate": str(workplace_accurate)} if source == "db" else {}),
                } if workplace_fields else {}),
            }
        logging.info(str(dict(stats)))

    flow_args = []
    if parameters.get('load_db_data'):
        flow_args += [
            load(os.path.join(parameters['load_db_data'], 'datapackage.json'))
        ]
    if parameters.get('load_gdrive_data'):
        flow_args += [
            load(os.path.join(parameters['load_gdrive_data'], 'datapackage.json'))
        ]
    flow_args += [
        add_field('lat', 'string', default="0"),
        add_field('lng', 'string', default="0"),
        add_field('address_street_accurate', 'string', default="0", resources="db_data"),
        add_field('workplace_lat', 'string', default="0", resources="db_data"),
        add_field('workplace_lng', 'string', default="0", resources="db_data"),
        add_field('workplace_street_accurate', 'string', default="0", resources="db_data"),
        _add_gps_coordinates,
    ]
    if parameters.get('dump_to_path'):
        flow_args += [
            dump_to_path(parameters['dump_to_path'])
        ]
    return Flow(*flow_args)
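# Sketch of the conditional step-assembly pattern used by add_gps_coordinates:
# build a plain list of steps, append optional ones based on parameters, then
# splat the list into Flow. (Inline data and parameter names are hypothetical.)
from dataflows import Flow, add_field, dump_to_path, printer


def build_flow(parameters):
    flow_args = [
        ({'id': i} for i in range(3)),
        add_field('lat', 'string', default='0'),
        add_field('lng', 'string', default='0'),
    ]
    if parameters.get('verbose'):
        flow_args.append(printer())
    if parameters.get('dump_to_path'):
        flow_args.append(dump_to_path(parameters['dump_to_path']))
    return Flow(*flow_args)


build_flow({'verbose': True}).process()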