def func(rows):
    """Row processor: attach welfare-ministry catalog numbers to service rows.

    Loads the welfare catalog spreadsheet once, indexes it by service name,
    then yields only rows whose office is the welfare ministry AND whose name
    was found in the catalog (other rows are dropped — NOTE(review): confirm
    this filtering is intentional). Unmatched welfare rows are reported to
    stdout at the end.
    """
    filename = 'קטלוג רווחה למערכת ההזנה 18.5.21.xlsx'
    catalog_rows = DF.Flow(
        DF.load(filename, name='welfare'),
        # DF.printer(),
        DF.rename_fields({
            'id': 'catalog_number',
            'שם השירות (ציבורי)': 'name'
        }, regex=False),
        DF.select_fields(['catalog_number', 'name']),
    ).results()[0][0]
    # Index by service name; pop() removes the name key from each record.
    catalog = {rec.pop('name'): rec for rec in catalog_rows}

    unmatched = []
    for row in rows:
        value = row['value']
        if value['office'] != 'משרד הרווחה':
            continue
        service_name = value['name']
        if service_name not in catalog:
            unmatched.append((service_name, value['id']))
            continue
        matched = catalog.pop(service_name)
        catalog_number = str(matched['catalog_number'])
        # Only overwrite when the stored value differs.
        if value.get('catalog_number') != catalog_number:
            value['catalog_number'] = catalog_number
        yield row

    # Report services that appear in the stream but not in the catalog.
    for item in unmatched:
        print(
            '{} (https://data-input.obudget.org/he/datarecords/social_service/{})'
            .format(*item))
def flow(*_):
    """Build the welfare-ministry activities datapackage, then re-stream it.

    First pass materializes 'tmp/activities-welfare'; the returned Flow
    reloads that datapackage with streaming enabled for the pipeline runner.
    """

    def activity_name(r):
        return r['שם השירות (ציבורי)']

    def activity_description(r):
        # Short description + purpose, joined as a single array entry.
        return [
            r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']
        ]

    def history(r):
        # Organizational unit is slash-separated; raises IndexError when no
        # '/' is present, exactly as the original lambda did.
        # NOTE(review): subsubunit duplicates subunit (both take part [1]) —
        # looks suspicious, preserved as-is.
        unit_field = r['יחידה ארגונית נותנת השירות']
        return [dict(
            year=2019,
            unit=unit_field.split('/')[0].strip(),
            subunit=unit_field.split('/')[1].strip(),
            subsubunit=unit_field.split('/')[1].strip(),
        )]

    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string', activity_name),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field('activity_description', 'array', activity_description),
        DF.add_field('history', 'array', history),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare'),
    ).process()

    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def test_select_field():
    """select_fields keeps only the requested column in both data and schema."""
    from dataflows import select_fields
    results, dp, _ = Flow(data, select_fields(['y'])).results()
    # Every emitted row must contain exactly the selected key.
    for rec in results[0]:
        assert list(rec.keys()) == ['y']
    # The resource schema must be pruned to the selected field as well.
    expected_fields = [dict(name='y', type='string', format='default')]
    assert dp.descriptor['resources'][0]['schema']['fields'] == expected_fields
def flow(*_):
    """Project (entity_id, soproc_supplier) pairs out of the social-services
    activities datapackage and dump them as a new datapackage."""
    steps = (
        DF.load('/var/datapackages/activities/social_services/datapackage.json'),
        # Declare the two output fields before unwind() populates them.
        DF.add_field('entity_id', 'string'),
        DF.add_field('soproc_supplier', 'boolean'),
        unwind(),
        DF.select_fields(['entity_id', 'soproc_supplier']),
        DF.dump_to_path('/var/datapackages/activities/social_services_suppliers'),
    )
    return DF.Flow(*steps)
def build_school_cache(self):
    """Download the schools spreadsheet and return {SEMEL_MOSA: (X, Y)}.

    Values are loaded as strings first and then cast: the school symbol stays
    a string while the coordinates become numbers.
    """
    schools_url = (
        'https://datacity-source-files.fra1.digitaloceanspaces.com/'
        '69-AlQasum/Education/Schools.xlsx'
    )
    rows, _, _ = Flow(
        load(
            schools_url,
            headers=1,
            infer_strategy=load.INFER_STRINGS,
            cast_strategy=load.CAST_TO_STRINGS,
        ),
        select_fields(['SEMEL_MOSA', 'X', 'Y']),
        set_type('SEMEL_MOSA', type='string'),
        set_type('X', type='number'),
        set_type('Y', type='number'),
    ).results()
    return {row['SEMEL_MOSA']: (row['X'], row['Y']) for row in rows[0]}
def flow(*_):
    """Project tender-linkage fields out of the social-services activities
    datapackage and dump them as a new datapackage."""
    output_fields = [
        'tender_id',
        'publication_id',
        'tender_type',
        'tender_key',
        'soproc_tender',
    ]
    return DF.Flow(
        DF.load('/var/datapackages/activities/social_services/datapackage.json'),
        # Declare output fields before unwind() populates them; the last one
        # is a boolean flag, the rest are identifiers.
        DF.add_field('tender_id', 'string'),
        DF.add_field('publication_id', 'string'),
        DF.add_field('tender_type', 'string'),
        DF.add_field('tender_key', 'string'),
        DF.add_field('soproc_tender', 'boolean'),
        unwind(),
        DF.select_fields(output_fields),
        DF.dump_to_path('/var/datapackages/activities/social_services_tenders'),
    )
def flow(*_):
    """Write the sitemap index file, then return a Flow that generates the
    per-kind/per-language sitemaps and streams the collected URLs."""
    kinds = ('publications', 'orgs', 'datasets', 'tags')
    langs = ('hebrew', 'english', 'arabic')
    with open('data/sitemap.xml', 'w') as index:
        index.write("""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n""")
        # One <sitemap> entry per (kind, language) combination, in the same
        # order the sub-sitemaps are produced below.
        for kind in kinds:
            for lang in langs:
                entry = """<sitemap><loc>https://api.yodaat.org/data/sitemap.{}-{}.xml</loc></sitemap>\n""".format(kind, lang)
                index.write(entry)
        index.write("""</sitemapindex>""")
    return DF.Flow(
        lang_flow('hebrew', ''),
        lang_flow('english', 'en/'),
        lang_flow('arabic', 'ar/'),
        registerSiteMaps,
        DF.select_fields(['url']),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
fields=dict(Recovered={ 'name': 'Case', 'aggregate': 'first' })), add_computed_field(target={ 'name': 'Deaths', 'type': 'number' }, operation='format', with_='{Case}'), add_computed_field(target={ 'name': 'Country', 'type': 'string' }, operation='format', with_='{Country/Region}'), add_computed_field(target={ 'name': 'Province', 'type': 'string' }, operation='format', with_='{Province/State}'), delete_fields(['Case', 'Country/Region', 'Province/State']), update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='time-series-19-covid-combined.csv'), select_fields([ 'Province', 'Country', 'Lat', 'Long', 'Date', 'Confirmed', 'Recovered', 'Deaths' ]), dump_to_path()).results()[0]
prop['type'] = 'keyword' elif schema_type in ('number', 'integer'): prop['index'] = True return prop if __name__ == '__main__': DF.Flow( DF.load('new-york-city-current-job-postings.zip', filename='nyc-jobs.csv', name='jobs'), DF.add_field('doc_id', 'string', default=lambda row: 'job/{Job ID}'.format(**row)), DF.add_field('score', 'integer', default=1), DF.set_type('Salary Frequency', **{'es:keyword': True}), DF.set_primary_key(['doc_id']), dump_to_es(indexes={'jobs-job': [{ 'resource-name': 'jobs', }]}, mapper_cls=SampleMappingGenerator), DF.dump_to_path('data'), DF.add_field('value', 'object', default=lambda row: dict((k, v) for k, v in row.items() if k not in ('doc_id', 'score')), **{'es:index': False}), DF.select_fields(['doc_id', 'value']), dump_to_es(indexes={'jobs-document': [{ 'resource-name': 'jobs', }]}), DF.printer(fields=['doc_id'])).process()
def Olap_Datapackage():
    """Build an OLAP-style datapackage from the three price datapackages.

    Produces one 'fact' resource (prices/amounts keyed by a synthetic id) and
    three dimension resources ('time', 'area', 'product') derived from it by
    duplicate + select + self-join deduplication, then dumps everything to
    'olap_datapackage'. Relies on sibling helpers `add_id` and
    `add_foreign_keys` defined elsewhere in this module.
    """
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        # Merge the three sources into a single 'fact' resource with
        # normalized column names.
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        }, target={
            'name': 'fact',
            'path': 'data/fact.csv'
        }),
        # Placeholder id column; `add_id` (defined elsewhere) fills in the
        # real running number, then it is typed and made the primary key.
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ], resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        # Self-join on Timestamp deduplicates the dimension rows.
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d'
                 )),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m'
                 )),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B'
                 )),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y'
                 )),
        ], resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact',
                  target_name='product',
                  target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
if row.get(k) ) values = list(set(values)) return values translations = {} for source, gid in sources.items(): url = URL.format(gid) translations[source] = DF.Flow( DF.load(url), clean_row, DF.add_field('values', 'array', default=extract_values), DF.filter_rows(lambda row: row['hebrew']), DF.select_fields(list(LANGS) + ['values']) ).results()[0][0] tx = {} complained = set() for row in translations[source]: v = row.get('values') if not v: continue for vv in v: vv = clean(vv) if tx.get(vv) not in (None, row): if vv not in complained: complained.add(vv) tx[vv] = row if len(complained) > 0: print('{}:'.format(source))
import dataflows as DF
import glob

# Cross-reference the local PNG files (one per organisation) against the
# orgs index, and print the orgs that have no matching image file.
# NOTE(review): the join key '(unknown)' for the 'files' resource looks like a
# redacted placeholder — confirm the intended key (likely '{filename}').
x = DF.Flow(
    # One row per local PNG, with the extension stripped from the name.
    ({'filename': path[:-4]} for path in glob.glob('*png')),
    DF.update_resource(-1, name='files'),
    DF.load('http://api.yodaat.org/data/orgs_in_es/data/orgs.csv', name='orgs'),
    DF.join(
        'files', '(unknown)',
        'orgs', '{entity_id}',
        {'filename': {}},
        full=True,
        source_delete=True,
    ),
    # Keep only orgs that did NOT match a file.
    DF.filter_rows(lambda row: row['filename'] is None),
    DF.select_fields(['org_name', 'entity_id']),
    DF.printer(),
).process()