def spew_flow(flow, ctx: ProcessorContext):
    # Wrap the given Flow so it consumes the processor context's
    # datapackage/resources, then write the results back into the context.
    flow = Flow(
        update_package(**ctx.datapackage),
        load((ctx.datapackage, ctx.resource_iterator)),
        flow,
    )
    datastream = flow.datastream()
    ctx.datapackage = datastream.dp.descriptor
    ctx.resource_iterator = datastream.res_iter
    ctx.stats = MergeableStats(datastream.stats, ctx.stats)
def flow(parameters, datapackage, resources, stats):
    stats['foo_values'] = 0

    def add_foo_field(package: PackageWrapper):
        package.pkg.descriptor['resources'][0]['schema']['fields'] += [{
            'name': parameters['attr'],
            'type': 'string'
        }]
        yield package.pkg
        yield from package

    def add_foo_value(row):
        row[parameters['attr']] = 'foo'
        stats['foo_values'] += 1

    return Flow(update_package(name='_'),
                hello_dataflows,
                add_foo_field,
                add_foo_value)
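A standalone sketch of exercising this processor with dataflows alone, outside a running pipeline. It assumes the excerpt's imports (Flow, update_package, PackageWrapper from dataflows) are in scope in the same module, replaces hello_dataflows (defined elsewhere in the original) with a no-op stand-in, and uses illustrative in-memory rows:

from dataflows import Flow

def hello_dataflows(row):
    pass  # no-op stand-in for the step defined elsewhere

stats = {}
data, dp, _ = Flow(
    [{'id': 1}],  # illustrative in-memory rows instead of pipeline resources
    flow({'attr': 'foo'}, None, None, stats),
).results()
assert data[0][0]['foo'] == 'foo'  # the new field was filled on each row
assert stats['foo_values'] == 1    # and counted in the shared stats dict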
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', help='Path to CSV [CSV]')
    parser.add_argument('-o', help='Path to Output Directory [DIR]')
    parser.add_argument(
        '-m', help='Merge this metadata (Author, License, ...) [JSON]')
    args = parser.parse_args()

    # Load additional metadata if any
    addedMetadata = parseExtraMetadata(args.m)
    # print(addedMetadata)

    # Load with Dataflows and save back as DataPackage
    Flow(load(args.i),
         update_package(title=addedMetadata["title"]),
         update_package(name=addedMetadata["name"]),
         update_package(license=addedMetadata["license"]),
         update_package(licenses=addedMetadata["licenses"]),
         update_package(contributors=addedMetadata["contributors"]),
         update_package(maintainers=addedMetadata["maintainers"]),
         update_package(sources=addedMetadata["sources"]),
         dump_to_path(args.o)).process()
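parseExtraMetadata is project-specific and its body is not part of this excerpt; a plausible minimal stand-in (an assumption, not the project's actual code) would read the -m argument as a JSON file:

import json

def parseExtraMetadata(path):
    # Hypothetical helper: the original project supplies its own version.
    with open(path) as f:
        return json.load(f)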
def flow(self):
    steps = []
    if not self.config.get(CONFIG_PUBLISH_ALLOWED):
        return None
    logger.info('Publisher Flow Preparing')
    if self.output_datapackage:
        logger.info('Publisher Flow: Dump To Path Denorm...')
        steps.extend([
            dump_to_path(self.output_datapackage)
        ])
    if self.output_db:
        db_table = 'dgp__{}_{}'.format(
            self.config.get(CONFIG_TAXONOMY_ID),
            self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
        )
        logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
        primary_key = self.config.get(CONFIG_PRIMARY_KEY)
        mapping = self.config.get(CONFIG_MODEL_MAPPING)
        for m in mapping:
            if 'columnType' in m and m['columnType']:
                m['slug'] = self.slugify(m['title'])
                m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                m['column'] = self.column(m['columnType'])
                m['primaryKey'] = m['columnType'] in primary_key
                m['measure'] = m['hierarchy'] == 'value'
                m['full_column'] = (
                    m['column'] if m['measure']
                    else '{}_{hierarchy}.{column}'.format(db_table, **m)
                )
                m['label'] = self.fetch_label(m['columnType'])
                m['dataType'] = self.fetch_datatype(m['columnType'])
        prefixes = set(
            m['hierarchy']
            for m in mapping
            if m.get('measure') is False
        )
        prefixed = dict(
            (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
            for p in prefixes
        )
        groups = [
            NormGroup(
                [m['column'] for m in prefixed_items],
                self.ref_column(prefix),
                self.id_column(),
                db_table='{}_{}'.format(db_table, prefix))
            for prefix, prefixed_items in prefixed.items()
        ]
        babbage_model = dict(
            dimensions=dict(
                (m['slug'], dict(
                    label=m['title'],
                    key_attribute=m['slug'],
                    attributes=dict([
                        (m['slug'], dict(
                            column=m['full_column'],
                            label=m['title'],
                            type=m['dataType'],
                        ))
                    ] + ([
                        (m['label']['slug'], dict(
                            column=m['label']['full_column'],
                            label=m['label']['title'],
                            type=m['label']['dataType'],
                        ))
                    ] if m.get('label') else [])),
                    join_column=[
                        self.ref_column(m['hierarchy']),
                        self.id_column()
                    ],
                    **(dict(
                        label_attribute=m['label']['slug']
                    ) if m.get('label') else {})
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is False and m.get('primaryKey') is True
            ),
            fact_table=db_table,
            measures=dict(
                (m['slug'], dict(
                    column=m['column'],
                    label=m['title'],
                    type='number'
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is True
            ),
            hierarchies=dict(
                (prefix, dict(
                    label=prefix,
                    levels=[
                        m['slug']
                        for m in prefixed_items
                        if m.get('primaryKey') is True
                    ]
                ))
                for prefix, prefixed_items in prefixed.items()
            ),
        )
        steps.append(
            update_package(babbage_model=babbage_model)
        )
        source = self.config.get(CONFIG_URL)
        logger.info('Publisher Flow: _source Handling...')
        steps.extend([
            add_field('_source', 'string', source),
            append_to_primary_key(['_source']),
            clear_by_source(self.lazy_engine(), db_table, source),
        ])
        logger.info('Publisher Flow: Normalize...')
        steps.extend([
            normalize_to_db(
                groups,
                db_table,
                RESOURCE_NAME,
                self.output_db,
                'append'
            ),
        ])
        if self.output_datapackage:
            logger.info('Publisher Flow: Dump To Path Norm...')
            steps.extend([
                dump_to_path(self.output_datapackage + '-norm')
            ])
    if self.output_es:
        logger.info('Publisher Flow: ES...')
        steps.extend([
            self.update_es()
        ])
    logger.info('Publisher Flow Prepared')
    return Flow(*steps)
update_package(
    name='covid-19',
    title='Novel Coronavirus 2019',
    views=[{
        "title": "Total world to date",
        "resources": ["worldwide-aggregated"],
        "specType": "simple",
        "spec": {
            "group": "Date",
            "series": ["Confirmed", "Deaths"],
            "type": "line"
        }
    }, {
        "title": "Number of confirmed cases in key countries",
        "resources": ["key-countries-pivoted"],
        "specType": "simple",
        "spec": {
            "group": "Date",
            "series": [
                "China", "US", "United_Kingdom", "Italy",
                "France", "Germany", "Spain", "Iran"
            ],
            "type": "line"
        }
    }, {
        "title": "Mortality rate in percentage",
        "resources": [{
            "name": "worldwide-aggregated",
            "transform": [{
                "type": "formula",
                "expressions": [
                    "data['Deaths'] / data['Confirmed'] * 100 + '%'"
                ],
                "asFields": ["Mortality rate"]
            }]
        }],
        "specType": "simple",
        "spec": {
            "group": "Date",
            "series": ["Mortality rate"],
            "type": "bar"
        }
    }, {
        "title": "Increase rate from previous day in confirmed cases worldwide",
        "resources": ["worldwide-aggregated"],
        "specType": "simple",
        "spec": {
            "group": "Date",
            "series": ["Increase rate"],
            "type": "bar"
        }
    }]),
"title": "Cumulative total confirmed cases to date", "type": "integer" }, { "format": "default", "groupChar": "", "name": "Recovered", "title": "Cumulative total recovered cases to date", "type": "integer" }, { "format": "default", "groupChar": "", "name": "Deaths", "title": "Cumulative total deaths to date", "type": "integer" }]), checkpoint('processed_country_data'), # Prepare data package (name, title) and add views update_package(name='covid-19', title='Novel Coronavirus 2019', views=[{ "title": "Total world to date", "resources": ["worldwide-aggregated"], "specType": "simple", "spec": { "group": "Date", "series": ["Confirmed", "Recovered", "Deaths"], "type": "line" } }]), dump_to_path()).results()[0]
def normalize(self, package, full_name, db_table):
    schema = package.descriptor['resources'][0]['schema']
    fields = schema['fields']
    primary_key = schema['primaryKey']
    mapping = []
    for f in fields:
        m = copy.deepcopy(f)
        if m.get('columnType'):
            m['slug'] = self.slugify(m['title'])
            m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
            m['column'] = self.column(m['columnType'])
            m['primaryKey'] = m['name'] in primary_key
            m['measure'] = m['hierarchy'] == 'value'
            m['full_column'] = (
                m['column'] if m['measure']
                else '{}_{hierarchy}.{column}'.format(db_table, **m))
            m['label'] = self.fetch_label(m['columnType'], mapping)
            m['dataType'] = self.fetch_datatype(m['columnType'])
        mapping.append(m)
    prefixes = set(m['hierarchy'] for m in mapping
                   if m.get('measure') is False)
    prefixed = dict(
        (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
        for p in prefixes)
    groups = [
        NormGroup([m['column'] for m in prefixed_items],
                  self.ref_column(prefix),
                  self.id_column(),
                  db_table='{}_{}'.format(db_table, prefix))
        for prefix, prefixed_items in prefixed.items()
    ]
    babbage_model = dict(
        dimensions=dict(
            (m['slug'], dict(
                label=m['title'],
                key_attribute=m['slug'],
                attributes=dict([
                    (m['slug'], dict(
                        column=m['full_column'],
                        label=m['title'],
                        type=m['dataType'],
                    ))
                ] + ([
                    (m['label']['slug'], dict(
                        column=m['label']['full_column'],
                        label=m['label']['title'],
                        type=m['label']['dataType'],
                    ))
                ] if m.get('label') else [])),
                join_column=[
                    self.ref_column(m['hierarchy']),
                    self.id_column()
                ],
                **(dict(label_attribute=m['label']['slug'])
                   if m.get('label') else {})
            ))
            for m in mapping
            if m.get('measure') is False and m.get('primaryKey') is True),
        fact_table=db_table,
        measures=dict(
            (m['slug'],
             dict(column=m['column'], label=m['title'], type='number'))
            for m in mapping if m.get('measure') is True),
        hierarchies=dict(
            (prefix, dict(
                label=prefix,
                levels=[
                    m['slug'] for m in prefixed_items
                    if m.get('primaryKey') is True
                ]))
            for prefix, prefixed_items in prefixed.items()),
    )
    return Flow(
        update_package(babbage_model=babbage_model),
        normalize_to_db(groups, db_table, RESOURCE_NAME,
                        db_connection_string, 'append'),
        finalizer(lambda: babbage_models.create_or_edit(
            full_name, babbage_model)))
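The finalizer step at the end runs its callback once the flow has finished streaming, which is what defers the babbage model registration until the data is in the database. A minimal self-contained sketch of that ordering, assuming dataflows' finalizer processor (the same step used above) and illustrative rows:

from dataflows import Flow, finalizer

calls = []

def touch(row):
    calls.append('row')  # runs once per streamed row

Flow(
    [{'a': 1}, {'a': 2}],                     # illustrative rows
    touch,
    finalizer(lambda: calls.append('done')),  # runs once, after streaming ends
).process()
assert calls == ['row', 'row', 'done']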
def postflow(self):
    metadata = self.config._unflatten().get('extra', {}).get('metadata')
    logger.info('UPDATING WITH METADATA %r', metadata)
    return Flow(
        update_package(**metadata)
    )
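update_package merges its keyword arguments into the package descriptor, so this postflow simply stamps the configured extra metadata onto the output package. The same idea in isolation, with illustrative values (dp is the datapackage object returned by results()):

from dataflows import Flow, update_package

metadata = {'title': 'Example Dataset', 'version': '1.0.0'}  # illustrative
_, dp, _ = Flow([{'a': 1}], update_package(**metadata)).results()
assert dp.descriptor['title'] == 'Example Dataset'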
    pivot_key_countries,
    delete_fields(['Country', 'Confirmed', 'Recovered', 'Deaths'],
                  resources='key-countries-pivoted'),
    # Prepare data package (name, title) and add views
    update_package(
        name='covid-19',
        title='Novel Coronavirus 2019',
        views=[
            {
                "title": "Total world to date",
                "resources": ["worldwide-aggregated"],
                "specType": "simple",
                "spec": {
                    "group": "Date",
                    "series": ["Confirmed", "Recovered", "Deaths"],
                    "type": "line"
                }
            },
            {
                "title": "Number of confirmed cases in key countries",
                "resources": ["key-countries-pivoted"],
                "specType": "simple",
                "spec": {
                    "group": "Date",
                    "series": ["China", "US", "United_Kingdom", "Italy",
                               "France", "Germany", "Spain", "Iran"],
                    "type": "line"
                }
            }
        ]
    ),
    dump_to_path()
).results()[0]
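Once dump_to_path() has written the package, it can be re-loaded for a quick sanity check; a sketch assuming the default output path (the current directory) and the resource name used above:

from dataflows import Flow, load, printer

Flow(
    load('datapackage.json', resources='key-countries-pivoted'),
    printer(num_rows=3),  # peek at the first few pivoted rows
).process()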
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Recovered={
             'name': 'Case',
             'aggregate': 'first'
         })),
    add_computed_field(target={
        'name': 'Deaths',
        'type': 'number'
    }, operation='format', with_='{Case}'),
    delete_fields(['Case']),
    update_resource('time_series_19-covid-Deaths',
                    name='time-series-19-covid-combined',
                    path='data/time-series-19-covid-combined.csv'),
    update_package(name='covid-19', title='Novel Coronavirus 2019'),
    dump_to_path(),
    checkpoint('processed_data'),
    # Duplicate the stream to create aggregated data
    duplicate(source='time-series-19-covid-combined',
              target_name='worldwide-aggregated',
              target_path='worldwide-aggregated.csv'),
    join_with_self(resource_name='worldwide-aggregated',
                   join_key=['Date'],
                   fields=dict(Date={'name': 'Date'},
                               Confirmed={
                                   'name': 'Confirmed',
                                   'aggregate': 'sum'
                               },
                               Recovered={
                                   'name': 'Recovered',
def flow(parameters):
    return Flow(update_package(**parameters))
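In datapackage-pipelines, a minimal flow-returning processor like this is wired up with ingest and spew_flow, following the pattern from the library's documentation:

from datapackage_pipelines.wrapper import ingest
from datapackage_pipelines.utilities.flow_utils import spew_flow

if __name__ == '__main__':
    with ingest() as ctx:
        spew_flow(flow(ctx.parameters), ctx)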