def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'], 'judges_list', ['Tribunal_Code'],
             fields={
                 'Tribunal_Type_Code': {},
                 'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
                 'Tribunal_District_Code': {'name': 'District_Code'},
                 'Tribunal_Name': {'name': 'Name'}
             }),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1))
def test_load_dates_timezones():
    from dataflows import Flow, checkpoint
    from datetime import datetime, timezone
    import shutil

    dates = [
        datetime.now(),
        datetime.now(timezone.utc).astimezone()
    ]

    shutil.rmtree('.checkpoints/test_load_dates_timezones', ignore_errors=True)

    Flow(
        [{'date': d.date(), 'datetime': d} for d in dates],
        checkpoint('test_load_dates_timezones')
    ).process()

    results = Flow(
        checkpoint('test_load_dates_timezones')
    ).results()

    assert list(map(lambda x: x['date'], results[0][0])) == \
        list(map(lambda x: x.date(), dates))
    assert list(map(lambda x: x['datetime'], results[0][0])) == \
        list(map(lambda x: x, dates))
def test_load_from_checkpoint():
    from dataflows import Flow, checkpoint
    import shutil

    shutil.rmtree('.checkpoints/test_load_from_checkpoint', ignore_errors=True)

    assert Flow(
        [{'foo': 'bar'}],
        checkpoint('test_load_from_checkpoint')
    ).process()

    assert Flow(
        checkpoint('test_load_from_checkpoint')
    ).results()[0] == [[{'foo': 'bar'}]]
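# A minimal sketch of the same caching pattern with an explicit checkpoint_path
# (the keyword also used in the pywikibot snippet further below). The checkpoint
# name 'demo' and the 'my_checkpoints' directory are illustrative only, not taken
# from any of the projects in this listing; by default dataflows stores
# checkpoints under ./.checkpoints/<name>.
import shutil
from dataflows import Flow, checkpoint

# Invalidate any earlier run so the flow re-executes instead of loading stale data
shutil.rmtree('my_checkpoints/demo', ignore_errors=True)

Flow(
    [{'foo': 'bar'}],
    checkpoint('demo', checkpoint_path='my_checkpoints'),
).process()

# A later run loads the saved resource from the checkpoint instead of re-computing it
assert Flow(
    checkpoint('demo', checkpoint_path='my_checkpoints')
).results()[0] == [[{'foo': 'bar'}]]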
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
def post_flow(phase, poster, tasks, config: Config, cache=False):
    if cache:
        config = config._unflatten()
        config_json = [config.get('source'), config.get('structure')]
        config_json = json.dumps(config_json, sort_keys=True)
        print(config_json[:64], len(config_json))
        checkpoint_name = hashlib.md5(config_json.encode('utf8')).hexdigest()
        if config.get('source'):
            path = config.get('source').get('path')
            if path:
                checkpoint_name += '_' + os.path.basename(path)
        cache = [checkpoint(checkpoint_name)]
    else:
        cache = []
    steps = [
        row_validator(phase, poster, tasks)
    ] + cache + [
        row_sender(phase, poster, tasks)
    ]
    return Flow(*steps)
def run_data_count_flow():
    assert Flow(
        get_data_count_views(),
        checkpoint('test_checkpoint'),
    ).results()[0] == [[{'foo': 'bar'}]]
def get_neighborhood_features():
    return DF.Flow(
        DF.load('neighborhoods.xlsx', name='stat-areas', deduplicate_headers=True),
        DF.add_field(
            'neighborhoods', 'array',
            lambda r: [v for k, v in r.items()
                       if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object',
                     lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object',
                     lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field(
            'properties', 'object',
            lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]
def main_flow(prefix, operator):
    return Flow(
        cluster_info(operator),
        update_resource(['res_1'], name='cluster-info', path='cluster-info.csv'),
        checkpoint(f'{prefix}-cluster-info'),
        ckan_cloud_instances(operator),
        update_resource(['res_2'], name='ckan-cloud-instances',
                        path='ckan-cloud-instances.csv'),
    )
def dump_print_flow(flow, dump_path, num_rows=1, fields=None, checkpoint_name=None):
    return Flow(flow,
                checkpoint(checkpoint_name) if checkpoint_name else None,
                dump_to_path(dump_path),
                printer(num_rows=num_rows, fields=fields))
def prepare(self):
    self.ref_hash = md5(self.REF_DATAPACKAGE.encode('utf8')).hexdigest()
    self.key = self.__class__.__name__
    check = checkpoint(self.ref_hash)
    if not check.exists():
        Flow(load(self.REF_DATAPACKAGE),
             rename_last_resource(self.ref_hash),
             dump_to_path('.cache/{}'.format(self.ref_hash)),
             check).process()
    logger.debug('DONE PREPARING %s', self.key)
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {'product': 'aFRR_DownActivated'}
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {'product': 'aFRR_UpActivated'}
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}

    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 1,000 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant', with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR',
            'aFRR_UpPriceDKK', 'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source='https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
def prepare_addresses():
    with tempfile.NamedTemporaryFile(suffix='.csv', mode='wb') as source:
        shutil.copyfileobj(fetch_ckan('addresses', 'CSV'), source)
        source.flush()
        DF.Flow(
            DF.load(source.name),
            DF.concatenate(
                dict(street_name=['streetName'],
                     house_number=['HouseNuber'],
                     letter=[],
                     lat=[],
                     lon=[])),
            match_arnona(),
            DF.dump_to_path('_cache_addresses'),
            DF.checkpoint('_cache_addresses')).process()
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series',
                     title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
def Elspot_Prices_Data():
    # field_metadata = get_metadata('c86859d2-942e-4029-aec1-32d56f1a2e5d')
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 100 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20elspotprices%20order%20by%20"HourUTC"%20desc%20limit%20100',
            format="json",
            property="result.records",
            name="fact_elspot_prices"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('load_data'),
        # Add product:
        add_computed_field([
            dict(target=dict(name='product', type='string'),
                 operation='constant', with_='Elspot'),
            dict(target=dict(name='amount', type='number'),
                 operation='constant', with_=1),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant', with_=-1),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant', with_=-1)
        ]),
        add_price,
        delete_fields(fields=['SpotPriceDKK', 'SpotPriceEUR']),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_elspot_prices',
            title='Elspot Prices Data',
            source='https://www.energidataservice.dk/dataset/elspotprices/resource_extract/c86859d2-942e-4029-aec1-32d56f1a2e5d'
        ),
        printer(),
        dump_to_path('elspot_prices_data'),
        # dump_to_sql(tables={'elspot': {'resource-name': 'Elspot_Prices_Data', 'mode': 'append'}}, engine='postgresql://*****:*****@localhost/cubes')
    )
    flow.process()
continue if row.get("Country/Region" ) == "Canada" and not row.get("Province/State"): row["Province/State"] = "Recovery aggregated" row["Lat"] = row.get("Lat", "56.1304") row["Long"] = row.get("Long", "-106.3468") yield {**expected, **row} Flow( load(f"{BASE_URL}{CONFIRMED}"), load(f"{BASE_URL}{RECOVERED}"), load(f"{BASE_URL}{DEATH}"), load(f"{BASE_URL}{CONFIRMED_US}"), load(f"{BASE_URL}{DEATH_US}"), checkpoint("load_data"), unpivot(unpivoting_fields, extra_keys, extra_value), find_replace([{ "name": "Date", "patterns": [{ "find": "/", "replace": "-" }] }]), to_normal_date, set_type("Date", type="date", format="%d-%m-%y", resources=None), set_type("Case", type="number", resources=None), join( source_name="time_series_covid19_confirmed_global", source_key=["Province/State", "Country/Region", "Date"], source_delete=True,
def kubectl_get_volumes_flow(source_resource_name='kubectl_get_all',
                             resource_name='kubectl_get_volumes',
                             get_all_checkpoint_name=None):
    volume_object_fields = [
        'hostPath', 'secret', 'configMap', 'emptyDir', 'gcePersistentDisk', 'nfs'
    ]

    def get_volumes(rows):
        for row in rows:
            volumes = row.get('volumes')
            for volume in (volumes if volumes else []):
                yield {
                    'name': volume.pop('name'),
                    'source_name': row['name'],
                    'source_kind': row['kind'],
                    'source_namespace': row['namespace'],
                    **{
                        field: volume.pop(field, None)
                        for field in volume_object_fields
                    },
                }
                assert len(volume) == 0, volume

    def add_volumes(package):
        package.pkg.remove_resource(source_resource_name)
        package.pkg.add_resource({
            'name': resource_name,
            'path': f'{resource_name}.csv',
            'schema': {
                'fields': [
                    {'name': 'name', 'type': 'string'},
                    {'name': 'source_kind', 'type': 'string'},
                    {'name': 'source_name', 'type': 'string'},
                    {'name': 'source_namespace', 'type': 'string'},
                    *[{'name': field, 'type': 'object'}
                      for field in volume_object_fields],
                ]
            }
        })
        yield package.pkg
        for rows in package:
            if rows.res.name == source_resource_name:
                yield get_volumes(rows)

    def filter_volumes(rows):
        if rows.res.name == resource_name:
            for row in rows:
                if row['source_namespace'] == 'kube-system':
                    continue
                if any((row.get(f) or row.get(f) == {})
                       for f in ['secret', 'configMap', 'emptyDir']):
                    continue
                assert row.get('nfs', None) or row.get('gcePersistentDisk', None), row
                yield row
        else:
            yield from rows

    return Flow(
        kubectl_get_all_flow(),
        checkpoint(get_all_checkpoint_name) if get_all_checkpoint_name else None,
        add_volumes,
        filter_volumes)
unpivoting_fields = [{
    'name': r'([0-9]+\/[0-9]+\/[0-9]+)',
    'keys': {'Date': r'\1'}
}]

extra_keys = [{'name': 'Date', 'type': 'string'}]
extra_value = {'name': 'Case', 'type': 'number'}

Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{'find': '/', 'replace': '-'}]
    }]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_19-covid-Confirmed',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
"title": "Population Growth - World Projections (High Fertility)" } ], readme=readme() ), load(source_url,format='xlsx',sheet='ESTIMATES',headers=17), load(source_url,format='xlsx',sheet='LOW VARIANT',headers=17), load(source_url,format='xlsx',sheet='MEDIUM VARIANT',headers=17), load(source_url,format='xlsx',sheet='HIGH VARIANT',headers=17), load(source_url,format='xlsx',sheet='CONSTANT-FERTILITY',headers=17), load(source_url,format='xlsx',sheet='CONSTANT-MORTALITY',headers=17), load(source_url,format='xlsx',sheet='INSTANT-REPLACEMENT',headers=17), load(source_url,format='xlsx',sheet='MOMENTUM',headers=17), load(source_url,format='xlsx',sheet='ZERO-MIGRATION',headers=17), load(source_url,format='xlsx',sheet='NO CHANGE',headers=17), checkpoint('loaded'), delete_fields(fields=['Index', 'Variant', 'Notes']), rename_resources, unpivot( unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}], extra_keys=[{'name': 'year', 'type': 'year'}], extra_value={'name': 'population', 'type': 'number'}, resources='population-estimates' ), unpivot( unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}], extra_keys=[{'name': 'year', 'type': 'year'}], extra_value={'name': 'population', 'type': 'number'}, resources=resource_names[1:] ), add_computed_field([
        if dry_run:
            print('dry run - {} -- {}'.format(file_page_title, page_text))
        else:
            print('uploading {}'.format(file_page_title))
            site = pywikibot.Site()
            site.login()
            page = pywikibot.FilePage(site, file_page_title)
            assert page.site.family == 'commons', 'invalid page site: {}'.format(page.site)
            with throttle():
                if not page.exists():
                    page.text = page_text
                    with tempfile.NamedTemporaryFile() as f:
                        f.write(requests.get(row['image']).content)
                        if page.upload(f.name, comment="uploaded by wmilbot", ignore_warnings=True):
                            print("----- {} uploaded successfully".format(row['image']))
                        else:
                            raise Exception("Upload failed")
                else:
                    page.get()
                    page.text = page_text
                    page.save(summary='update by wmilbot')
                    print('----- {} updated successfully'.format(row['image']))

Flow(
    checkpoint('scraped-site-filtered-years-album-images',
               checkpoint_path='btm/.checkpoints'),
    add_field('year', 'year'),
    get_years,
    upload
).process()
                row.get('Province/State') == 'Recovered' and not \
                row.get('Recovered'):
            continue
        if row.get('Country/Region') == 'Canada' and not row.get('Province/State'):
            row['Province/State'] = 'Recovery aggregated'
            row['Lat'] = row.get('Lat', '56.1304')
            row['Long'] = row.get('Long', '-106.3468')
        yield {**expected, **row}

Flow(
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{
        'name': 'Date',
        'patterns': [{'find': '/', 'replace': '-'}]
    }]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_covid19_confirmed_global',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_covid19_deaths_global',
def dump_print_flow(flow, dump_path, checkpoint_name=None, **kwargs):
    return Flow(flow,
                checkpoint(checkpoint_name) if checkpoint_name else None,
                dump_to_path(dump_path),
                printer(**kwargs))
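# A possible invocation of the helper above, assuming load comes from dataflows;
# 'data/source.csv', 'out', and 'source_loaded' are placeholder names for
# illustration only, and the extra keyword arguments are forwarded to printer().
from dataflows import Flow, load

dump_print_flow(
    Flow(load('data/source.csv')),    # any upstream flow or step
    'out',                            # dump_to_path target directory
    checkpoint_name='source_loaded',  # omit to skip the checkpoint step
    num_rows=2,                       # forwarded to printer(**kwargs)
).process()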
def parse_dockerfiles():
    gitlab_repos = {}

    def _parse_gitlab_repos(rows):
        if rows.res.name == 'ckan-cloud-instances':
            for row in rows:
                gitlab_repo = row['gitlab_repo']
                if gitlab_repo in gitlab_repos:
                    gitlab_repos[gitlab_repo]['instances'].append(row)
                else:
                    gitlab_repos[gitlab_repo] = {'instances': [row]}
                yield row
        else:
            yield from rows

    def _get_dockerfile_from(dockerfile):
        if dockerfile:
            return [
                line.replace('FROM ', '')
                for line in dockerfile.split('\n')
                if line.startswith('FROM')
            ][0]
        else:
            return None

    def _parse_ckan_extensions(rows):
        if rows.res.name == 'dockerfiles':
            for row in rows:
                row['ckan_exts'] = []
                if row['dockerfile']:
                    for line in row['dockerfile'].split('\n'):
                        if 'https://github.com/' in line and '.git@' in line and '#egg=' in line:
                            ext = line.split('https://github.com/')[1].split(
                                '#egg=')[0].replace('.git@', '@')
                            row['ckan_exts'].append(ext)
                            if 'ckanext-s3filestore' in ext:
                                row['ckanext-s3filestore'] = ext
                yield row
        else:
            yield from rows

    def _get_dockerfile_row(gitlab_repo_name, gitlab_repo):
        try:
            dockerfile = CkanGitlab()._get_file(gitlab_repo_name, 'Dockerfile')
        except Exception:
            dockerfile = None
        return {
            'gitlab_repo': gitlab_repo_name,
            'instances': [i['name'] for i in gitlab_repo['instances']],
            'from': _get_dockerfile_from(dockerfile),
            'dockerfile': dockerfile
        }

    def _parse_dockerfiles(package):
        package.pkg.add_resource({
            'name': 'dockerfiles',
            'path': 'dockerfiles.csv',
            'schema': {
                'fields': [{'name': 'gitlab_repo', 'type': 'string'},
                           {'name': 'instances', 'type': 'array'},
                           {'name': 'from', 'type': 'string'},
                           {'name': 'dockerfile', 'type': 'string'}]
            }
        })
        yield package.pkg
        yield from package
        yield (_get_dockerfile_row(gitlab_repo_name, gitlab_repo)
               for gitlab_repo_name, gitlab_repo in gitlab_repos.items())

    return Flow(
        _parse_gitlab_repos,
        _parse_dockerfiles,
        checkpoint('ckan_images_dockerfiles'),
        add_field('ckan_exts', 'array'),
        add_field('ckanext-s3filestore', 'string'),
        _parse_ckan_extensions,
    )