def flow(*_):
    # Process the source file into a datapackage on disk, then return a
    # streaming flow that loads the processed result.
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                       r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [dict(
                year=2019,
                unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                # NOTE: subsubunit uses the same split index as subunit;
                # kept as in the original, though index 2 may have been intended.
                subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
            )]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare'),
    ).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name', 'string', c['name'],
                             resources=c['name']),
                DF.add_field('__title', 'string', get_title(c['title']),
                             resources=c['name']),
            )
            for c in configuration
        ],
        # Collect all URL-like strings appearing anywhere in the row
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(dict(
            name=['__name'],
            title=['__title'],
            link=[],
            urls=[],
        )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
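# unwind() above is defined elsewhere in this codebase. Judging from the
# surrounding flow (an empty 'url' field is added before it and the 'urls'
# array is deleted after it), it plausibly fans each row out into one row per
# collected URL. A minimal sketch under that assumption -- not the original
# implementation:
def unwind():
    def func(rows):
        for row in rows:
            for url in row['urls']:
                # emit a copy so each emitted row is independent
                yield dict(row, url=url)
    return func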
def main_stats(num_files, file_size, download_iterations, download_threads,
               output_dir, only_upload, only_download, **kwargs):
    # Locate the single download and upload report CSVs in the output dir
    download_report_filename, upload_report_filename = None, None
    for filename in glob(os.path.join(output_dir, '*.csv')):
        if '/download-report-' in filename:
            assert not download_report_filename
            download_report_filename = filename
        elif '/upload-report-' in filename:
            assert not upload_report_filename
            upload_report_filename = filename
    assert download_report_filename and upload_report_filename
    print('upload_report_filename', upload_report_filename)
    print('download_report_filename', download_report_filename)
    print("Generating upload stats...")
    upload_stats = defaultdict(int)
    df.Flow(df.load(upload_report_filename),
            stats_process_upload_rows(upload_stats)).process()
    print("Generating download stats...")
    download_stats = defaultdict(int)
    df.Flow(df.load(download_report_filename),
            stats_process_download_rows(download_stats, file_size)).process()
    print("Upload Stats")
    pprint(dict(upload_stats))
    print("Download Stats")
    pprint(dict(download_stats))
def get_updated_sources():
    import requests
    from pyquery import PyQuery as pq

    URL = 'https://mr.gov.il/ilgstorefront/he/news/details/230920201036'
    sources = []
    page = pq(requests.get(URL).text)
    anchors = page.find('a')
    for anchor in anchors:
        anchor = pq(anchor)
        href = anchor.attr('href')
        # anchors without an href attribute return None - skip them
        if href and '.zip' in href:
            sources.append(href + '#.xlsx')
    sources = [
        DF.load(source, format='excel-xml', encoding='utf8',
                bytes_sample_size=0)
        for source in sources
    ]
    if len(sources) != 2:
        # Fall back to the data.gov.il copies if the page didn't yield
        # exactly the two expected archives
        return DF.Flow(
            data_gov_il_resource.flow(tenders),
            data_gov_il_resource.flow(exemptions),
        )
    else:
        return DF.Flow(*sources)
def extract_tags(field='tags', prefixes=None):
    if prefixes is not None:
        def remove_prefix(row):
            # Drop the prefixed entries that were extracted into `field`
            row['tags'] = [
                x for x in row['tags']
                if all(not x.startswith('{}_'.format(prefix))
                       for prefix in prefixes)
            ]

        def collect(rows):
            options = set()
            for row in rows:
                options.update(row[field])
                yield row
            print('OPTIONS FOR {}: {}'.format(field, sorted(options)))

        return DF.Flow(
            DF.add_field(
                field, 'array',
                lambda row: [
                    t.split('_', 1)[1]
                    for t in row['tags']
                    if any(t.startswith('{}_'.format(prefix))
                           for prefix in prefixes)
                ]),
            remove_prefix,
            collect,
        )
    else:
        def verify_tags(row):
            for tag in row[field]:
                if '_' in tag:
                    print('Found prefix: {}'.format(tag))

        return DF.Flow(verify_tags)
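# A minimal usage sketch for extract_tags(); the sample row and the
# 'audience' field/prefix below are hypothetical, for illustration only.
# Each input row gains audience=['youth', 'elderly'] while 'tags' keeps
# only the un-prefixed entries (['other']):
def extract_tags_example():
    DF.Flow(
        [{'tags': ['audience_youth', 'audience_elderly', 'other']}],
        extract_tags('audience', prefixes=['audience']),
        DF.printer(),
    ).process()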
def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx', name='stat-areas',
                deduplicate_headers=True),
        # Gather all non-empty neighborhood* columns into a single array
        DF.add_field(
            'neighborhoods', 'array',
            lambda r: [v for k, v in r.items()
                       if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object',
                     lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        # Group the statistical areas (and their geometries) per neighborhood
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object',
                     lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object',
            lambda r: dict(x=3, title=r['neighborhood'],
                           stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods'),
    ).results()[0][0]
def es_dumper(resource_name, revision, path):
    now = time.time()
    return DF.Flow(
        update_pk('doc_id'),
        DF.add_field('revision', 'integer', default=revision),
        DF.add_field('score', 'number', default=1),
        DF.add_field('create_timestamp', 'number', now),
        # Index the resource into its own dedicated index
        my_dump_to_es(
            indexes={
                'migdar__' + resource_name: [{
                    'resource-name': resource_name,
                    'revision': revision,
                }]
            },
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True},
            elasticsearch_options=dict(timeout=60)),
        DF.dump_to_path('data/{}'.format(path)),
        collate(revision),
        # Also index the collated documents into the shared docs index
        my_dump_to_es(
            indexes={
                'migdar__docs': [{
                    'resource-name': resource_name,
                    'revision': revision,
                }]
            },
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True}),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
def flow(parameters, *_):
    return DF.Flow(
        get_all(),
        DF.set_type('report-date', type='date', format='%Y-%m-%dT%H:%M:%SZ'),
        DF.update_resource(-1, **parameters['target-resource']),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def category():
    # Map facility kind (Hebrew) to its category (Hebrew)
    CATEGORIES = {
        'אולם מופעים, היכל תרבות': 'תרבות',
        'אולם ספורט, התעמלות': 'ספורט',
        'איצטדיון': 'ספורט',
        'בית כנסת': 'דת',
        'בית ספר': 'חינוך',
        'בריכת שחייה ציבורית או עירונית': 'ספורט',
        'גן ילדים': 'חינוך',
        'טיפת חלב': 'בריאות',
        'לשכת רווחה של הרשות המקומית': 'רווחה',
        'מבנה עירייה': 'כלליים',
        'מגרש ספורט': 'ספורט',
        'מגרש ציבורי פנוי': 'כלליים',
        'מועדון נוער': 'קהילה',
        'מועדון קהילתי כולל מרכז צעירים': 'קהילה',
        'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'קהילה',
        'מעון יום': 'חינוך',
        'מקווה טוהרה': 'דת',
        'מקלט': 'כלליים',
        'מרפאה': 'בריאות',
        'ספרייה': 'תרבות',
        'פנימייה, כפר נוער': 'חינוך',
    }

    def cat(row):
        # Raises KeyError on a kind that is missing from the mapping
        row['category'] = CATEGORIES[row['kind']]

    return DF.Flow(DF.add_field('category', 'string'), cat)
def process_resource(instance_name, package, resource,
                     package_extras_processed_res):
    lat_field = resource.get("geo_lat_field")
    lon_field = resource.get("geo_lon_field")
    features = []
    for row in DF.Flow(
            DF.load(resource['url'],
                    infer_strategy=DF.load.INFER_STRINGS)).results()[0][0]:
        properties = get_properties(row)
        lon, lat = get_lat_lon_values(row, lon_field, lat_field)
        if lon and lat:
            features.append(
                Feature(geometry=Point((lon, lat)), properties=properties))
    fc = FeatureCollection(features)
    with utils.tempdir() as tmpdir:
        # Write the GeoJSON to a temp file, then upload it as a new resource
        with open(os.path.join(tmpdir, "data.geojson"), 'w') as f:
            geojson.dump(fc, f)
        with open(os.path.join(tmpdir, "data.geojson")) as f:
            ckan.resource_create(
                instance_name, {
                    'package_id': package['id'],
                    'description': resource['description'],
                    'format': 'GeoJSON',
                    'name': resource['name'].replace('.csv', '') + '.geojson',
                },
                files=[('upload', f)])
    common.update_package_extras(instance_name, package,
                                 package_extras_processed_res)
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(
                value=dict(
                    lat=float(r['lat']),
                    lon=float(r['lon']),
                    arnona_zones=r['arnona_zones'],
                    שם=r['address'],
                ),
                display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        # One row per street, with its addresses aggregated in order
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations'),
    ).results()[0][0]
def test_basic_flow_no_mapping_type():
    data = [dict(key='key%04d' % x, value=x) for x in range(1000)]
    conn_str = 'localhost:9200'
    DF.Flow(
        data,
        DF.update_resource(-1, name='data'),
        DF.set_primary_key(['key']),
        dump_to_es(
            engine=conn_str,
            indexes=dict(
                test_basic_flow_no_mapping_type=[dict(resource_name='data')]
            )),
    ).process()
    # Give Elasticsearch a moment to refresh before reading back
    time.sleep(1)
    out = list(Storage(Elasticsearch(hosts=[conn_str]))
               .read('test_basic_flow_no_mapping_type'))
    assert data == sorted(out, key=lambda r: r['key'])
def prepare():
    for resource_name, load in loads:
        DF.Flow(
            load,
            # DF.printer(tablefmt='html'),
            DF.concatenate(
                FIELD_MAPPING,
                dict(name=resource_name, path=resource_name + '.csv')),
            DF.set_type('activity_name', type='string',
                        constraints=dict(required=True),
                        on_error=DF.schema_validator.drop),
            DF.set_type('allocated_budget', type='number',
                        groupChar=',', bareNumber=False),
            # Parse as a number first (ignoring errors), normalize, then
            # keep the field as a string
            DF.set_type('num_beneficiaries', type='number',
                        groupChar=',', bareNumber=False,
                        on_error=DF.schema_validator.ignore),
            fix_beneficiaries,
            DF.set_type('num_beneficiaries', type='string'),
            multiply_budget,
            fill_org_hierarchy,
            # DF.printer(tablefmt='html'),
            DF.dump_to_path('tmp/' + resource_name),
        ).process()
def datarecords(kind):
    return map(
        lambda r: r['value'],
        DF.Flow(
            DF.load(f'https://data-input.obudget.org/api/datarecords/{kind}',
                    format='json', property='result')
        ).results()[0][0])
def flow(*_):
    return DF.Flow(
        scraper(),
        DF.update_resource(-1, name='joint', path='joint.csv',
                           **{'dpp:streaming': True}))
def func(rows):
    # Load the welfare catalog and map service name -> catalog number
    filename = 'קטלוג רווחה למערכת ההזנה 18.5.21.xlsx'
    cats = DF.Flow(
        DF.load(filename, name='welfare'),
        # DF.printer(),
        DF.rename_fields({
            'id': 'catalog_number',
            'שם השירות (ציבורי)': 'name',
        }, regex=False),
        DF.select_fields(['catalog_number', 'name']),
    ).results()[0][0]
    cats = dict((k.pop('name'), k) for k in cats)
    missing = []
    for row in rows:
        v = row['value']
        if v['office'] == 'משרד הרווחה':
            name = v['name']
            if name in cats:
                rec = cats.pop(name)
                cn = str(rec['catalog_number'])
                if v.get('catalog_number') != cn:
                    v['catalog_number'] = cn
                yield row
            else:
                missing.append((name, v['id']))
    for x in missing:
        print('{} (https://data-input.obudget.org/he/datarecords/social_service/{})'
              .format(*x))
def flow(*_, path='data/datasets_in_es'):
    return DF.Flow(
        DF.load('{}/datapackage.json'.format(path)),
        do_screenshot(),
        write_excel(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def operator(name, params):
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']
    print('starting db_fetcher operator')
    print('source_table={} target_instance_name={} '
          'target_package_id={} target_organization_id={}'.format(
              source_table, target_instance_name, target_package_id,
              target_organization_id))
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string, table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']),
            DF.dump_to_path(tempdir),
        ).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])
def flow(*_):
    global gcd
    gcd = google_chrome_driver(wait=False)
    return DF.Flow(
        scraper(gcd),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        finalize_teardown(gcd),
    )
def proj():
    def func(row):
        row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)

    return DF.Flow(
        DF.add_field('lon', 'number'),
        DF.add_field('lat', 'number'),
        func,
        DF.delete_fields(['X', 'Y']),
    )
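# `projector` is defined elsewhere; the inverse=True call above suggests a
# pyproj.Proj instance mapping projected X/Y to lon/lat. One plausible setup,
# assuming the Israeli TM Grid (EPSG:2039) -- the CRS is an assumption for
# illustration, not taken from the original:
#
#   import pyproj
#   projector = pyproj.Proj('EPSG:2039')
#   lon, lat = projector(178000, 663000, inverse=True)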
def add_source():
    def f(rows):
        # rows is a dataflows ResourceWrapper, so the resource name is
        # available via rows.res
        for row in rows:
            row['source'] = rows.res.name
            yield row

    return DF.Flow(
        DF.add_field('source', 'string'),
        f,
    )
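# A minimal usage sketch for add_source(); the CSV file names are
# hypothetical. Every row gains a 'source' column naming the resource it
# came from, which is useful before merging resources into one table:
def add_source_example():
    DF.Flow(
        DF.load('a.csv', name='a'),
        DF.load('b.csv', name='b'),
        add_source(),
        DF.printer(),
    ).process()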
def flow(*_):
    return DF.Flow(
        DF.load('/var/datapackages/activities/social_services/datapackage.json'),
        DF.add_field('entity_id', 'string'),
        DF.add_field('soproc_supplier', 'boolean'),
        unwind(),
        DF.select_fields(['entity_id', 'soproc_supplier']),
        DF.dump_to_path('/var/datapackages/activities/social_services_suppliers'),
    )
def unwind_neighborhoods():
    def f(rows):
        # Emit one row per entry in the row's 'neighborhoods' array
        for row in rows:
            for n in row['neighborhoods']:
                row['neighborhood'] = n
                yield row

    return DF.Flow(
        DF.add_field('neighborhood', 'string'),
        f,
        DF.delete_fields(['neighborhoods']),
    )
def flow(*_):
    return DF.Flow(
        [
            dict(office=office, kind=kind)
            for office in offices
            for kind in report_kinds
        ],
        do_query(),
        DF.printer(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def flow(*_):
    global engine
    engine = create_engine(os.environ['DPP_DB_ENGINE'])
    return DF.Flow(
        DF.add_field('charts', 'array', default=[], **{
            'es:itemType': 'object',
            'es:index': False,
        }),
        fetch_extra_data,
    )
def flow(parameters, *_):
    return DF.Flow(
        *[
            DF.load(x, format='csv', name='res%d' % i,
                    infer_strategy=DF.load.INFER_STRINGS,
                    cast_strategy=DF.load.CAST_DO_NOTHING)
            for i, x in enumerate(wrapper(parameters['year']))
        ],
        DF.update_resource(None, **{'dpp:streaming': True}),
    )
def geo():
    def proj(row):
        row['lon'], row['lat'] = projector(row['lon'], row['lat'],
                                           inverse=True)

    return DF.Flow(
        proj,
        DF.set_type('lon', type='number'),
        DF.set_type('lat', type='number'),
    )
def lang_flow(lang, prefix):
    # One synthetic row per (prefix, hebrew-tag, translated-tag) triplet
    tags = [
        dict(doc_id=list(k))
        for k in sorted(set(
            (prefix, x['hebrew'], x[lang])
            for x in translations['tags'].values()
        ))
    ]

    def add_url(prefix_):
        def func(rows):
            for row in rows:
                if 'url' not in row:
                    yield row
                elif row.get('doc_id'):
                    row['url'] = 'https://yodaat.org/{}item/{}'.format(
                        prefix_, row['doc_id'])
                    yield row
                else:
                    print('MMMMMMMM MISSING DOC ID', row)

        return DF.Flow(
            DF.add_field('url', 'string', resources=-1),
            func,
        )

    return DF.Flow(
        *[
            DF.Flow(
                DF.load('https://api.yodaat.org/data/{}_in_es/data/{}.csv'
                        .format(x, y),
                        name='{}-{}'.format(x, lang)),
                add_url(prefix),
            )
            for x, y in [
                ('publications', 'publications'),
                ('orgs', 'orgs'),
                ('datasets', 'out'),
            ]
        ],
        tags,
        DF.add_field(
            'url', 'string',
            lambda row: ('https://yodaat.org/{}search?tag={}&itag={}'
                         '&kind=all&filters={{}}&sortOrder=-year'
                         ).format(*row.get('doc_id')),
            resources=-1),
        DF.update_resource(-1, name='tags-{}'.format(lang)),
    )
def flow(*_):
    return DF.Flow(
        *[
            DF.load(x, encoding='windows-1255', format='csv',
                    name='res%d' % i, quoting=csv.QUOTE_NONE,
                    infer_strategy=DF.load.INFER_STRINGS,
                    cast_strategy=DF.load.CAST_DO_NOTHING)
            for i, x in enumerate(wrapper())
        ],
        DF.update_resource(None, **{'dpp:streaming': True}))
def flow(*_):
    return DF.Flow(
        scrape(),
        DF.update_resource(-1, **{
            'name': 'class_action',
            'dpp:streaming': True,
        }),
        DF.set_type('claim_date', type='datetime', format='%d/%m/%Y',
                    resources=-1),
        calculate_publication_id(8),
    )