def generate_pipeline(cls, source):
    project_id = slugify(source['project'])
    schedule = SCHEDULE_DAILY
    discovered_steps = cls._get_pipeline_steps()
    for k, config in source['config'].items():
        # `k` corresponds with `label` in the pipeline_steps module.
        if k in discovered_steps:
            pipeline_id = slugify('{}-{}'.format(project_id, k))
            common_steps = [('add_metadata', {
                'project': project_id,
                'name': pipeline_id
            })]
            k_steps = discovered_steps[k](common_steps, pipeline_id,
                                          project_id, config)
            _steps = steps(*k_steps)
        else:
            log.warning('No {} pipeline generator available for {}'.format(
                k, project_id))
            continue
        pipeline_details = {'pipeline': _steps}
        if schedule is not None:
            pipeline_details['schedule'] = {'crontab': schedule}
        yield pipeline_id, pipeline_details
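# Illustrative shape of what the generator above yields (values are
# hypothetical; `steps(...)` is the datapackage-pipelines step-list helper):
#
#   ('my-project-code-packaging',
#    {'pipeline': steps(('add_metadata', {...}), ...),
#     'schedule': {'crontab': SCHEDULE_DAILY}})
#
# One pair is produced per key in source['config'] that has a matching
# generator in pipeline_steps; unknown keys are logged and skipped.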
def _get_safe_entity(entity):
    '''Get a url-safe version of the entity, based on its starting character.'''
    if entity.startswith('@'):
        return 'at-{}'.format(slugify(entity))
    elif entity.startswith('#'):
        return 'hash-{}'.format(slugify(entity))
    elif entity.startswith('url:'):
        return slugify(entity)
    else:
        raise ValueError(ENTITY_VALUE_ERROR_MSG.format(entity))
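# Usage sketch for _get_safe_entity (illustrative inputs; exact slugs depend
# on the slugify defaults in use, assumed here to be awesome-slugify's):
#
#   _get_safe_entity('@okfnlabs')     # -> 'at-okfnlabs'
#   _get_safe_entity('#opendata')     # -> 'hash-opendata'
#   _get_safe_entity('url:okfn.org')  # -> slug of the raw 'url:...' string
#   _get_safe_entity('okfnlabs')      # -> raises ValueError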
def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])
    if resource_matcher.match(spec['name']):
        fingerprint_field = parameters['fingerprint-field']
        name_field = parameters['name-field']
        row[fingerprint_field] = slugify(row[name_field], to_lower=True)
    return row
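# Illustrative parameters for process_row above (hypothetical values,
# matching the country-codes join used in the CRD/IV pipeline further down):
#
#   parameters = {
#       'resource-name': 'country-codes',
#       'name-field': 'name',
#       'fingerprint-field': 'fingerprint',
#   }
#
# Rows of any non-matching resource pass through unchanged.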
def process_resource(res):
    all_fingerprints = set()
    for row in res:
        name = None
        for src_field in source_fields:
            src_value = row[src_field]
            if src_value:
                if name is None:
                    name = src_value
                fingerprint = slugify(src_value, to_lower=True)
                if fingerprint in all_fingerprints:
                    continue
                all_fingerprints.add(fingerprint)
                yield {name_field: name, fingerprint_field: fingerprint}
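# A runnable sketch of the deduplication above (assumes awesome-slugify,
# whose `slugify` accepts call-time overrides such as `to_lower`; the field
# names and sample rows are made up for illustration):
from slugify import slugify

source_fields = ['name', 'official_name_en']
name_field, fingerprint_field = 'name', 'fingerprint'

def process_resource(res):
    all_fingerprints = set()
    for row in res:
        name = None
        for src_field in source_fields:
            src_value = row[src_field]
            if src_value:
                if name is None:
                    name = src_value
                fingerprint = slugify(src_value, to_lower=True)
                if fingerprint in all_fingerprints:
                    continue
                all_fingerprints.add(fingerprint)
                yield {name_field: name, fingerprint_field: fingerprint}

rows = [{'name': 'Germany', 'official_name_en': 'Germany'},
        {'name': 'France', 'official_name_en': 'French Republic'}]
print(list(process_resource(rows)))
# 'Germany' yields a single fingerprint: both fields slugify to 'germany'.
# 'France' yields two ('france' and 'french-republic'), both named 'France'.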
def dumper_flow(source, base):
    _, _, resource_name = extract_names(source)
    dataset_id, db_table, _ = extract_storage_ids(source)
    kinds = sorted(
        set(f['columnType'].split(':')[0] for f in source['fields']) -
        {'value'})
    resources = [slugify(kind, separator='_') for kind in kinds]
    deps = ['dimension_flow_{}'.format(res) for res in resources]
    for i, (resource, dep, kind) in enumerate(zip(resources, deps, kinds)):
        res_db_table = '{}_{}'.format(db_table, i)
        steps = [
            ('load_resource', {
                'url': 'dependency://' + base + '/' + dep,
                'resource': resource
            }),
            ('set_types', ),
            ('fiscal.helpers.fix_null_pks', ),
            ('dump.to_sql', {
                'tables': {
                    res_db_table: {
                        'resource-name': resource
                    }
                }
            })
        ]
        yield steps, [dep], resource
    steps = [
        ('load_resource', {
            'url': 'dependency://' + base + '/normalized_flow',
            'resource': resource_name
        }),
        ('fiscal.helpers.fix_null_pks', ),
        ('dump.to_sql', {
            'tables': {
                db_table: {
                    'resource-name': resource_name
                }
            }
        })
    ]
    yield steps, ['normalized_flow'], ''
    yield [
        ('fiscal.update_model_in_registry', {
            'dataset-id': dataset_id,
            'loaded': True
        }),
    ], ['dumper_flow'], 'update_status'
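# The fiscal flows in this file all follow the same contract, as the yields
# above suggest: each flow yields (steps, dependencies, pipeline-id suffix)
# triples, where `dependencies` names the upstream pipelines referenced via
# the 'dependency://' URLs inside the steps.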
def dimension_flow(source, base):
    title, dataset_name, resource_name = extract_names(source)
    kinds = sorted(
        set(f['columnType'].split(':')[0] for f in source['fields']) -
        {'value'})
    resources = [slugify(kind, separator='_') for kind in kinds]
    pipeline_ids = ['dimension_{}'.format(res) for res in resources]
    for resource, pipeline_id, kind in zip(resources, pipeline_ids, kinds):
        headers = [
            f['header'] for f in source['fields']
            if f['columnType'].startswith(kind + ':')
            or f['columnType'] == kind
        ]
        steps = [
            ('load_resource', {
                'url': 'dependency://' + base + '/denormalized_flow',
                'resource': resource_name
            }),
            ('concatenate', {
                'target': {
                    'name': resource
                },
                'fields': dict((h, []) for h in headers)
            }),
            ('fiscal.helpers.save_primarykey', ),
            ('join', {
                'source': {
                    'name': resource,
                    'key': headers,
                    'delete': True
                },
                'target': {
                    'name': resource,
                    'key': None
                },
                'fields': dict((h, None) for h in headers)
            }),
            ('fiscal.helpers.load_primarykey', ),
            ('fiscal.helpers.enumerate', ),
            ('dump.to_path', {
                'out-path': 'normalized/' + resource
            })
        ]
        yield steps, ['denormalized_flow'], resource
def generate_pipeline(cls, source, wp):
    pipeline_id = dataset_name = slugify(source['name'])
    host = source['udata-instance']
    action = source['data-kind']
    if action == 'datasets-list':
        schedule = SCHEDULE_MONTHLY
        pipeline_steps = steps(*[
            ('udata.catalog', {
                'udata-instance': host
            }),
            ('add_metadata', {
                'name': dataset_name
            }),
            ('dump.to_zip', {
                'out-file': 'udata-list.zip'
            })
        ])
        pipeline_details = {
            'pipeline': pipeline_steps,
            'schedule': {
                'crontab': schedule
            }
        }
        yield pipeline_id, pipeline_details
    if action == 'dataset':
        pipeline_steps = steps(*[
            ('udata.fetch_metadata', {
                'host': source['udata-instance'],
                'kind': 'dataset',
                'id': source['dataset']
            }),
            ('add_metadata', {
                'name': source['name']
            }),
            ('dump.to_path', {
                'handle-non-tabular': 'true',
                'pretty-descriptor': 'true'
            })
        ])
        pipeline_details = {'pipeline': pipeline_steps}
        yield pipeline_id, pipeline_details
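# Illustrative source entries for the udata generator above (hypothetical
# values; 'data-kind' selects between the two branches):
#
#   {'name': 'udata catalog', 'udata-instance': 'https://www.data.gouv.fr',
#    'data-kind': 'datasets-list'}
#
#   {'name': 'some dataset', 'udata-instance': 'https://www.data.gouv.fr',
#    'data-kind': 'dataset', 'dataset': '<dataset-id>'}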
def generate_pipeline(cls, source):
    for pipeline_id_prefix, defs in source.items():
        repository = defs['repository']
        base_path = defs.get('base-path', 'pipelines/')
        # issues
        issue_policy = defs.get('issues', {})
        # pull requests
        pr_policy = defs.get('pull-requests')
        # code
        code_policy = defs.get('code')
        if code_policy is not None:
            yield from cls.fetch_code(code_policy, repository, base_path)
        issues_url = '/repos/{}/issues'.format(repository)
        issues = URL_GETTER.get(issues_url)
        if issues is not None:
            for issue in issues:
                for pipeline_id_format, pipeline_steps in \
                        cls.handle_combined_issue(repository, base_path,
                                                  issue, issue_policy,
                                                  pr_policy):
                    title_slug = slugify(issue['title'])
                    fmt = {
                        'issue-id': issue['number'],
                        'title-slug': title_slug
                    }
                    pipeline_id = pipeline_id_format.format(**fmt)
                    pipeline_id = urljoin(pipeline_id_prefix, pipeline_id)
                    pipeline_details = {
                        'title': issue['title'],
                        'pipeline': pipeline_steps
                    }
                    if issue.get('body') is not None:
                        pipeline_details['description'] = issue['body']
                    yield pipeline_id, pipeline_details
def form_collector(source_id, source_type, latest_date):
    start_date = FAR_PAST_START_DATE
    if latest_date:
        start_date = latest_date.date()
    response = _request_data_from_google_spreadsheet(start_date)
    resource_content = []
    headers = response['table']['cols']
    headers = [slugify(h['label'].lower()) for h in headers]
    for r in response['table']['rows']:
        row = r['c']
        row_dict = {}
        for i, v in enumerate(row):
            if v is not None:
                row_dict[headers[i]] = v.get('f') or v.get('v')
            else:
                row_dict[headers[i]] = None
        output_date = dateutil.parser.parse(row_dict.get('date')).date() \
            if row_dict.get('date') is not None else None
        res_row = {
            'source_id': source_id,
            'source_type': source_type,
            'source': 'gsheets',
            'source_timestamp':
                dateutil.parser.parse(row_dict.get('timestamp')),
            'source_email': row_dict.get('email-address'),
            'output_title': row_dict.get('title'),
            'output_type': row_dict.get('type-of-output'),
            'output_organization': row_dict.get('for-what-organisation'),
            'output_person': row_dict.get('who-did-this'),
            'output_link': row_dict.get('link-if-published'),
            'output_additional_information':
                row_dict.get('additional-information'),
            'output_date': output_date
        }
        resource_content.append(res_row)
    return resource_content
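# The response consumed above is the gviz-style JSON that Google Sheets
# returns; roughly (illustrative values):
#
#   response = {'table': {
#       'cols': [{'label': 'Timestamp'}, {'label': 'Email address'}],
#       'rows': [{'c': [{'v': 'Date(2017,0,1)', 'f': '1/1/2017 10:00:00'},
#                       {'v': 'someone@okfn.org'}]}],
#   }}
#
# Lower-casing and slugifying the column labels is what turns
# 'Email address' into the 'email-address' key used in the row lookups.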
        latest_date, latest_iter = get_latest_date(next(res_iter))
        yield latest_iter
    else:
        latest_date = None
        yield from res_iter
    yield form_collector(source_id, source_type, latest_date)


parameters, datapackage, res_iter = ingest()
sheet_id = parameters['sheet_id']
gid = parameters['gid']
source_type = parameters['source_type']
source_id = '{0}/{1}'.format(sheet_id, gid)
resource = {
    'name': slugify(sheet_id).lower(),
    'path': 'data/{}.csv'.format(slugify(sheet_id))
}
headers = [
    'source', 'source_type', 'source_timestamp', 'source_email',
    'output_title', 'output_type', 'output_organization', 'output_person',
    'output_link', 'output_additional_information', 'output_date'
]
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}
    metadata = r.json()
    return metadata


dataset_metadata = metadata(url)
datapackage['udata'] = dataset_metadata
# logging.info(datapackage)
for resource in dataset_metadata['resources']:
    # logging.info(resource)
    name = slugify(resource['title'].lower())
    path = resource['url'].split('/')[-1]
    format = resource['format']
    url = resource['url']
    logging.info(path)
    if path != '':
        datapackage['resources'].append({
            'name': name,
            PROP_STREAMED_FROM: url,
            'format': format,
            'path': path
        })
        growth = _request_growth_history_from_mailchimp(
            list_id,
            '{}-{:02d}'.format(activity_date.year, activity_date.month)
        )
        res_row['subscribers'] = growth['existing']
        resource_content.append(res_row)
    return resource_content


parameters, datapackage, res_iter = ingest()
list_id = parameters['list_id']
resource = {
    'name': slugify(list_id),
    'path': 'data/{}.csv'.format(slugify(list_id))
}
headers = ['source', 'date', 'list_id', 'subs', 'unsubs', 'subscribers',
           'campaigns_sent']
resource['schema'] = {'fields': [{'name': h, 'type': 'string'}
                                 for h in headers]}
datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, list_id):
    def get_latest_row(first):
        latest_row = None
def generate_pipeline(cls, source):
    for item in source:
        entity_slug = slugify(item['entity'], to_lower=True, separator='_')
        ids = [entity_slug, item['year']]
        if 'subsidiary' in item:
            ids.append(item['subsidiary'])
        pipeline_id = '_'.join(str(i) for i in ids)
        pipeline = [
            {
                'run': 'add_metadata',
                'parameters': {
                    'name': pipeline_id,
                    'title':
                        'CRD/IV data for {entity} in the year {year}'.format(
                            **item)
                },
            },
        ]
        for input in item['inputs']:
            if input['kind'] == 'pdf':
                parameters = input['parameters']
                parameters['transpose'] = input.get('transpose', False)
                parameters['url'] = input['url']
                parameters['headers'] = item['model']['headers']
                pipeline.append({
                    'run': 'od4tj.tabula_resource',
                    'parameters': parameters
                })
        pipeline.append({
            'run': 'concatenate',
            'parameters': {
                'sources': 'tabula-.+',
                'target': {
                    'name': 'crdiv_data'
                },
                'fields': dict((h['mapping'], [])
                               for h in (item['model']['headers'] +
                                         [{'mapping': 'url'}]))
            }
        })
        pipeline.extend([
            {
                'run': 'od4tj.clean_locations',
                'parameters': {
                    'resource_name': 'crdiv_data',
                    'raw_field': 'country',
                    'clean_field_code': 'country_code',
                    'clean_field_name': 'country_name',
                }
            },
            {
                'run': 'od4tj.add_constants',
                'parameters': {
                    'year': item['year'],
                    'entity': item['entity'],
                    'subsidiary': item.get('subsidiary'),
                    'currency': item['model']['currency'].upper()
                }
            },
            {
                'run': 'od4tj.validate_countries',
                'parameters': {
                    'resource_name': 'crdiv_data',
                    'raw_field': 'country',
                    'clean_field': 'country_code',
                }
            },
            {
                'run': 'od4tj.fix_numbers',
                'parameters': {
                    'factor': item['model']['factor'],
                    'group_char': item['model'].get('group_char', ','),
                    'decimal_char': item['model'].get('decimal_char', '.'),
                }
            },
            {
                'run': 'set_types',
            },
            {
                'run': 'od4tj.validate_totals',
                'parameters': {
                    'totals': item.get('processing', {}).get('totals', {}),
                    'factor': item['model']['factor'],
                }
            },
        ])
        pipeline.append({
            'run': 'aws.dump.to_s3',
            'parameters': {
                'bucket': 'od4tj-filestore.okfn.org',
                'path': 'crd_iv_datapackages/{}_{}'.format(entity_slug,
                                                           item['year'])
            }
        })
        pipeline.append({
            'run': 'dump.to_path',
            'parameters': {
                'out-path': '/tmp/',
            }
        })
        yield pipeline_id, {'pipeline': pipeline}
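# Illustrative `item` consumed by the generator above (hypothetical values;
# only the keys actually read by the code are shown):
#
#   {'entity': 'Some Bank', 'year': 2015, 'subsidiary': 'uk',
#    'inputs': [{'kind': 'pdf', 'url': 'http://example.com/report.pdf',
#                'parameters': {...}}],
#    'model': {'currency': 'eur', 'factor': 1000,
#              'headers': [{'mapping': 'country'}, {'mapping': 'profit'}]},
#    'processing': {'totals': {...}}}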
def normalized_flow(source, base):
    _, _, resource_name = extract_names(source)
    dataset_id, db_table, _ = extract_storage_ids(source)
    kinds = sorted(set(
        f['columnType'].split(':')[0] for f in source['fields']
    ) - {'value'})
    resources = [
        slugify(kind, separator='_') for kind in kinds
    ]
    db_tables = dict(
        (res, '{}_{}'.format(db_table, i))
        for i, res in enumerate(resources)
    )
    db_tables[''] = db_table
    deps = [
        'dimension_flow_{}'.format(res) for res in resources
    ]
    steps = [
        ('load_metadata', {
            'url': 'dependency://' + base + '/denormalized_flow',
        }),
    ]
    steps.extend([
        ('load_resource', {
            'url': 'dependency://' + base + '/' + dep,
            'resource': resource
        })
        for resource, dep in zip(resources, deps)
    ])
    steps.extend([
        ('load_resource', {
            'url': 'dependency://' + base + '/denormalized_flow',
            'resource': resource_name
        }),
        ('fiscal.create_babbage_model', {
            'db-tables': db_tables
        }),
    ])
    for resource, kind in zip(resources, kinds):
        headers = [
            f['header'] for f in source['fields']
            if f['columnType'].startswith(kind + ':')
            or f['columnType'] == kind
        ]
        steps.extend([
            ('join', {
                'source': {
                    'name': resource,
                    'key': headers,
                    'delete': True
                },
                'target': {
                    'name': resource_name,
                    'key': headers
                },
                'fields': {
                    resource + '_id': {
                        'name': ID_COLUMN_NAME
                    }
                }
            }),
            ('delete_fields', {
                'resources': resource_name,
                'fields': headers
            }),
        ])
    steps.extend([
        ('add_metadata', {
            'savedPk': [resource + '_id' for resource in resources]
        }),
        ('fiscal.helpers.load_primarykey', {}),
        ('fiscal.update_model_in_registry', {
            'dataset-id': dataset_id,
            'loaded': False
        }),
        ('dump.to_path', {
            'out-path': 'normalized/final'
        })
    ])
    yield steps, deps + ['denormalized_flow'], ''
def generate_pipeline(cls, source):
    for item in source:
        entity_slug = slugify(item['entity'], to_lower=True, separator='_')
        pipeline_id = '{}/{}'.format(entity_slug, item['year'])
        pipeline = [{
            'run': 'add_metadata',
            'parameters': {
                'name': '{}_{}'.format(entity_slug, item['year']),
                'title':
                    'CRD/IV data for {entity} in the year {year}'.format(
                        **item)
            },
        }, {
            'run': 'add_resource',
            'parameters': {
                'name': 'country-codes',
                'url': 'https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv'
            },
        }, {
            'run': 'stream_remote_resources',
        }, {
            'run': 'od4tj.prepare_country_fingerprints',
            'parameters': {
                'resource-name': 'country-codes',
                'source-fields': ['name', 'official_name_en',
                                  'official_name_fr'],
                'name-field': 'name',
                'fingerprint-field': 'fingerprint'
            }
        }]
        for input in item['inputs']:
            if input['kind'] == 'pdf':
                for dimension in input['parameters']['dimensions']:
                    parameters = {}
                    parameters['dimensions'] = dimension
                    parameters['url'] = input['url']
                    parameters['headers'] = item['model']['headers']
                    pipeline.append({
                        'run': 'od4tj.tabula_resource',
                        'parameters': parameters
                    })
        pipeline.append({
            'run': 'concatenate',
            'parameters': {
                'sources': 'tabula-.+',
                'target': {
                    'name': 'crdiv_data'
                },
                'fields': dict((h['name'], [])
                               for h in item['model']['headers'])
            }
        })
        pipeline.extend([
            {
                'run': 'od4tj.fingerprint_countries',
                'parameters': {
                    'resource-name': 'crdiv_data',
                    'name-field': 'country',
                    'fingerprint-field': 'country-name-fingerprint'
                }
            },
            {
                'run': 'join',
                'parameters': {
                    'source': {
                        'name': 'country-codes',
                        'key': ['fingerprint'],
                        'delete': True
                    },
                    'target': {
                        'name': 'crdiv_data',
                        'key': ['country-name-fingerprint'],
                    },
                    'fields': {
                        'country_name': {
                            'name': 'name'
                        }
                    },
                    'full': True,
                }
            },
            {
                'run': 'od4tj.add_constants',
                'parameters': {
                    'year': item['year'],
                    'entity': item['entity']
                }
            },
            {
                'run': 'od4tj.validate_countries'
            },
            {
                'run': 'od4tj.fix_numbers',
            },
            {
                'run': 'set_types',
            },
        ])
        pipeline.append({
            'run': 'aws.dump.to_s3',
            'parameters': {
                'bucket': 'od4tj-filestore.okfn.org',
                'path': 'crd_iv_datapackages/{}_{}'.format(entity_slug,
                                                           item['year'])
            }
        })
        yield pipeline_id, {'pipeline': pipeline}
        # add active_users to today's value
        if date == today:
            res_row['active_users'] = active_users_response
        # preserve active_users value in latest_row
        if date == latest_date and latest_row['active_users']:
            res_row['active_users'] = latest_row['active_users']
        resource_content.append(res_row)
    return resource_content


parameters, datapackage, res_iter = ingest()
domain = parameters['domain']
resource = {
    'name': slugify(domain),
    'path': 'data/{}.csv'.format(slugify(domain))
}
headers = [
    'domain', 'source', 'date', 'new_users', 'new_topics', 'new_posts',
    'visits', 'active_users'
]
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}
datapackage['resources'].append(resource)
def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'codepackaging',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'package', 'source']
    }))

    if 'npm' in config:
        for package in config['npm']['packages']:
            steps.append(('measure.add_npm_resource', {
                'package': slugify(package)
            }))

    if 'pypi' in config:
        for package in config['pypi']['packages']:
            steps.append(('measure.add_pypi_resource', {
                'package': slugify(package)
            }))

    if 'rubygems' in config:
        for gem in config['rubygems']['gems']:
            steps.append(('measure.add_rubygems_resource', {'gem_id': gem}))

    if 'packagist' in config:
        for package in config['packagist']['packages']:
            steps.append(('measure.add_packagist_resource', {
                'package': package
            }))

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    steps.append(('concatenate', {
        'target': {
            'name': 'code-packaging',
            'path': 'data/code-packaging.csv'
        },
        'fields': {
            'date': [],
            'downloads': [],
            'total_downloads': [],
            'source': [],
            'package': []
        }
    }))

    steps.append(('set_types', {
        'types': {
            'downloads': {'type': 'integer'},
            'total_downloads': {'type': 'integer'},
            'source': {'type': 'string'},
            'date': {'type': 'date'},
            'package': {'type': 'string'}
        }
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    steps.append(('measure.add_timestamp',))
    steps.append(('measure.add_uuid',))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {
            'codepackaging': {
                'resource-name': 'code-packaging',
                'mode': 'update',
                'update_keys': ['project_id', 'date', 'package', 'source']
            }
        }
    }))

    return steps
def generate_pipeline(cls, source):
    title = source['title']
    dataset_name = source.get('dataset-name', title)
    dataset_name = slugify(dataset_name).lower()
    pipeline_id = dataset_name
    resource_name = source.get('resource-name', dataset_name)

    for data_source in source['sources']:
        if data_source['url'].endswith('.csv'):
            data_source['mediatype'] = 'text/csv'
        if 'name' not in data_source:
            data_source['name'] = slugify(
                os.path.basename(data_source['url'])
            )

    model_params = {
        'options': dict(
            (f['header'], f['options'])
            for f in source['fields']
            if 'options' in f
        ),
        'os-types': dict(
            (f['header'], f['osType'])
            for f in source['fields']
        ),
        'titles': dict(
            (f['header'], f['title'])
            for f in source['fields']
            if 'title' in f
        ),
    }

    extra_measures = []
    measure_handling = []
    if 'measures' in source:
        measures = source['measures']
        normalise_measures = ('fiscal.normalise_measures', {
            'measures': measures['mapping']
        })
        if 'title' in measures:
            normalise_measures[1]['title'] = measures['title']
        measure_handling.append(normalise_measures)
        model_params['os-types']['value'] = 'value'
        model_params['options']['value'] = {
            'currency': measures['currency']
        }
        extra_measures = [
            (measure, [])
            for measure in source['measures']['mapping'].keys()
        ]

        if 'currency-conversion' in measures:
            currency_conversion = measures['currency-conversion']
            date_measure = currency_conversion.get('date_measure')
            if date_measure is None:
                date_measure = [
                    f['header']
                    for f in source['fields']
                    if f.get('osType', '').startswith('date:')
                ][0]
            currencies = measures.get('currencies', ['USD'])
            normalise_currencies = ('fiscal.normalise_currencies', {
                'measures': ['value'],
                'date-field': date_measure,
                'to-currencies': currencies,
                'from-currency': measures['currency']
            })
            if 'title' in currency_conversion:
                normalise_currencies[1]['title'] = measures['title']
            measure_handling.append(normalise_currencies)
            for currency in currencies:
                measure_name = 'value_{}'.format(currency)
                model_params['os-types'][measure_name] = 'value'
                model_params['options'][measure_name] = {
                    'currency': currency
                }

    deduplicate_lines = source.get('deduplicate') is True
    deduplicate_steps = []
    if deduplicate_lines:
        deduplicate_steps.append((
            'set_types',
            {
                'types': dict(
                    (f['header'],
                     dict(type='number', **f.get('options', {})))
                    for f in source['fields']
                    if f['osType'] == 'value'
                )
            }
        ))
        deduplicate_steps.append((
            'join',
            {
                'source': {
                    'name': resource_name,
                    'key': [
                        f['header']
                        for f in source['fields']
                        if f['osType'] != 'value'
                    ],
                    'delete': True
                },
                'target': {
                    'name': resource_name,
                    'key': None
                },
                'fields': dict(
                    (f['header'], {
                        'name': f['header'],
                        'aggregate': 'any' if f['osType'] != 'value' else 'sum'
                    })
                    for f in source['fields']
                )
            }
        ))

    partial_output_file = '{}.fdp.partial.zip'.format(pipeline_id)
    output_file = '{}.fdp.zip'.format(pipeline_id)

    pipeline_steps = [
        (
            'add_metadata',
            {
                'title': title,
                'name': dataset_name,
            }
        )
    ] + [
        ('add_resource', source) for source in source['sources']
    ] + [
        ('stream_remote_resources', {}, True),
        ('concatenate', {
            'target': {
                'name': resource_name
            },
            'fields': dict(
                [
                    (f['header'], f.get('aliases', []))
                    for f in source['fields']
                ] + extra_measures
            )
        }),
    ] + deduplicate_steps + [
        (step['processor'], step.get('parameters', {}))
        for step in source.get('postprocessing', [])
    ] + measure_handling + [
        ('fiscal.model', model_params),
        ('dump.to_zip', {
            'out-file': partial_output_file,
        }),
        ('fiscal.split_resource_per_fiscal_year_and_dump_to_zip', {
            'in-file': partial_output_file,
            'out-file': output_file,
        }),
        ('fiscal.upload', {
            'in-file': output_file,
            'publish': True
        }),
    ]

    pipeline_details = {
        'pipeline': steps(*pipeline_steps),
    }
    yield pipeline_id, pipeline_details
def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    for repo in config['github']['repositories']:
        steps.append(('measure.add_github_resource', {
            'name': slugify(repo),
            'repo': repo,
            'map_fields': {
                'repository': 'name',
                'watchers': 'subscribers_count',
                'stars': 'stargazers_count'
            }
        }))

    steps.append(('concatenate', {
        'sources': [slugify(repo)
                    for repo in config['github']['repositories']],
        'target': {
            'name': 'code-hosting',
            'path': 'data/code-hosting.json'
        },
        'fields': {
            'repository': [],
            'watchers': [],
            'stars': [],
            'source': [],
            'date': []
        }
    }))

    steps.append(('set_types', {
        'types': {
            'repository': {
                'type': 'string',
            },
            'watchers': {
                'type': 'integer'
            },
            'stars': {
                'type': 'integer'
            },
            'date': {
                'type': 'date',
            },
        }
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    steps.append(('measure.add_timestamp',))
    steps.append(('measure.add_uuid',))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': {
            'codehosting': {
                'resource-name': 'code-hosting',
                'mode': 'update',
                'update_keys': ['repository', 'source', 'project_id', 'date']
            }
        }
    }))

    return steps
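# Illustrative `config` for the code-hosting steps above (hypothetical):
#
#   config = {'github': {'repositories': ['okfn/example-repo']}}
#
# One measure.add_github_resource step is added per repository, and the
# concatenate step then merges the per-repo resources into 'code-hosting'.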
            since=start_date_frame.strftime(FACEBOOK_API_DATE_RANGE_FORMAT),
            until=end_date_frame.strftime(FACEBOOK_API_DATE_RANGE_FORMAT)
        )
        for metric in daily_metrics:
            aggregated_metrics[metric['facebook_metric']] += \
                _add_collected_metric_to_aggregation(frame_response, metric)
        start_date_frame = end_date_frame
    return aggregated_metrics


parameters, datapackage, res_iter = ingest()
project_id = parameters['project_id']
entity = parameters['entity']
safe_entity = slugify(entity).lower()
resource = {
    'name': safe_entity,
    'path': 'data/{}.csv'.format(safe_entity)
}
entity_type = 'page'

resource_content = []
row = {
    'entity': entity,
    'entity_type': entity_type,
    'source': 'facebook'
}

lifetime_metrics = _get_lifetime_metrics_from_source(entity)
        row = {
            'package': package,
            'source': 'npm',
            'date': dateutil.parser.parse(response['start']).date(),
            'downloads': response['downloads']
        }
        resource_content.append(row)
    return resource_content


parameters, datapackage, res_iter = ingest()
package = parameters['package']
resource = {
    'name': slugify(package),
    'path': 'data/{}.csv'.format(slugify(package))
}
headers = ['package', 'source', 'date', 'downloads']
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}
datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, package):
def modify_datapackage(dp, parameters, *_):
    db_tables = parameters['db-tables']
    model = dp['model']
    field_types = dict((x['slug'], x['type'])
                       for x in dp['resources'][-1]['schema']['fields'])

    bbg_hierarchies = {}
    bbg_dimensions = {}
    bbg_measures = {}

    # Iterate on dimensions
    for hierarchy_name, h_props in model['dimensions'].items():
        # Append to hierarchies
        hierarchy_name = slugify(hierarchy_name, separator='_')
        hierarchy = dict(label=hierarchy_name,
                         levels=h_props['primaryKey'])
        bbg_hierarchies[hierarchy_name] = hierarchy

        # Get all hierarchy columns
        attributes = h_props['attributes']
        attributes = list(attributes.items())

        # Separate to codes and labels
        codes = dict(filter(lambda x: 'labelfor' not in x[1], attributes))
        labels = dict(
            map(lambda y: (y[1]['labelfor'], y[1]),
                filter(lambda x: 'labelfor' in x[1], attributes)))

        # For each code, create a babbage dimension
        for fieldname, attribute in codes.items():
            dimension_name = fieldname
            bbg_attributes = {
                fieldname: dict(
                    column='.'.join([db_tables[hierarchy_name], fieldname]),
                    label=attribute.get('title', attribute['source']),
                    type=field_types[fieldname]
                )
            }
            bbg_dimension = dict(
                attributes=bbg_attributes,
                key_attribute=fieldname,
                label=attribute.get('title'),
                join_column=[hierarchy_name + '_id', ID_COLUMN_NAME]
            )
            label = labels.get(fieldname)
            if label is not None:
                fieldname = label['source']
                attribute = label
                bbg_attributes.update({
                    fieldname: dict(
                        column='.'.join([db_tables[hierarchy_name],
                                         fieldname]),
                        label=attribute.get('title', attribute['source']),
                        type=field_types[fieldname]
                    )
                })
                bbg_dimension.update(dict(label_attribute=fieldname))
            bbg_dimensions[dimension_name] = bbg_dimension

    # Iterate on measures
    for measurename, measure in model['measures'].items():
        bbg_measures[measurename] = dict(
            column=measurename,
            label=measure.get('title', measurename),
            type=field_types[measurename]
        )

    dp['babbageModel'] = dict(fact_table=db_tables[''],
                              dimensions=bbg_dimensions,
                              hierarchies=bbg_hierarchies,
                              measures=bbg_measures)
    return dp
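# Sketch of the dimension mapping above, with a made-up fiscal model:
#
#   model['dimensions'] == {
#       'functional-classification': {
#           'primaryKey': ['func_code'],
#           'attributes': {
#               'func_code': {'source': 'FUNC_CODE'},
#               'func_label': {'source': 'FUNC_LABEL',
#                              'labelfor': 'func_code'},
#           }
#       }
#   }
#
# becomes babbage hierarchy 'functional_classification' plus a dimension
# 'func_code' whose label attribute is taken from the 'labelfor' entry.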
ckan_error = get_ckan_error(response)
if ckan_error:
    if 'Not found: Resource was not found.' in ckan_error.get('message', []):
        log.exception('CKAN resource {} was not found.'.format(resource_id))
    else:
        log.exception('CKAN returned an error: ' + json.dumps(ckan_error))
    raise Exception

resource = response['result']
if 'name' in resource:
    if 'title' not in resource:
        resource['title'] = resource['name']
    resource['name'] = slugify(resource['name']).lower()
if 'format' in resource:
    resource['format'] = resource['format'].lower()
if 'url' in resource:
    resource['path'] = PATH_PLACEHOLDER
    resource[PROP_STREAMED_FROM] = resource['url']
    del resource['url']
del resource['hash']
resource.update(parameters)
datapackage['resources'].append(resource)
def denormalized_flow(source, base):
    title, dataset_name, resource_name = extract_names(source)
    dataset_id, _, _ = extract_storage_ids(source)
    original_datapackage_url = source.get('datapackage-url')

    for data_source in source['sources']:
        if data_source['url'].endswith('.csv'):
            data_source['mediatype'] = 'text/csv'
        if 'name' not in data_source:
            data_source['name'] = slugify(
                os.path.basename(data_source['url']), separator='_').lower()

    model_params = {
        'options': dict((f['header'], f['options'])
                        for f in source['fields']
                        if 'options' in f),
        'os-types': dict((f['header'], f['columnType'])
                         for f in source['fields']),
        'titles': dict((f['header'], f['title'])
                       for f in source['fields']
                       if 'title' in f),
    }

    extra_measures = []
    measure_handling = []
    if 'measures' in source:
        measures = source['measures']
        normalise_measures = ('fiscal.normalise_measures', {
            'measures': measures['mapping']
        })
        if 'title' in measures:
            normalise_measures[1]['title'] = measures['title']
        measure_handling.append(normalise_measures)
        model_params['os-types']['value'] = 'value'
        model_params['options']['value'] = {'currency': measures['currency']}
        extra_measures = [(measure, [])
                          for measure in source['measures']['mapping'].keys()]

        if 'currency-conversion' in measures:
            currency_conversion = measures['currency-conversion']
            date_measure = currency_conversion.get('date_measure')
            if date_measure is None:
                date_measure = [
                    f['header'] for f in source['fields']
                    if f.get('columnType', '').startswith('date:')
                ][0]
            currencies = measures.get('currencies', ['USD'])
            normalise_currencies = ('fiscal.normalise_currencies', {
                'measures': ['value'],
                'date-field': date_measure,
                'to-currencies': currencies,
                'from-currency': measures['currency']
            })
            if 'title' in currency_conversion:
                normalise_currencies[1]['title'] = measures['title']
            measure_handling.append(normalise_currencies)
            for currency in currencies:
                measure_name = 'value_{}'.format(currency)
                model_params['os-types'][measure_name] = 'value'
                model_params['options'][measure_name] = {'currency': currency}

    deduplicate_lines = source.get('deduplicate') is True
    deduplicate_steps = []
    if deduplicate_lines:
        deduplicate_steps.append(('set_types', {
            'types': dict((f['header'],
                           dict(type='number', **f.get('options', {})))
                          for f in source['fields']
                          if f['columnType'] == 'value')
        }))
        deduplicate_steps.append((
            'join',
            {
                'source': {
                    'name': resource_name,
                    'key': [
                        f['header'] for f in source['fields']
                        if f['columnType'] != 'value'
                    ],
                    'delete': True
                },
                'target': {
                    'name': resource_name,
                    'key': None
                },
                'fields': dict((
                    f['header'],
                    {
                        'name': f['header'],
                        'aggregate': 'any' if f['columnType'] != 'value' else 'sum'  # noqa
                    }) for f in source['fields'])
            }))

    load_metadata_steps = []
    if original_datapackage_url:
        load_metadata_steps.append(('load_metadata', {
            'url': original_datapackage_url
        }))

    pipeline_steps = load_metadata_steps + [
        ('add_metadata', {
            'title': title,
            'name': dataset_name,
            'revision': source.get('revision', 0),
        }),
        ('fiscal.update_model_in_registry', {
            'dataset-id': dataset_id,
            'loaded': False
        }),
    ] + [('add_resource', source) for source in source['sources']] + [
        ('stream_remote_resources', {}),
        ('concatenate', {
            'target': {
                'name': resource_name
            },
            'fields': dict([(f['header'], f.get('aliases', []))
                            for f in source['fields']] + extra_measures)
        }),
    ] + deduplicate_steps + [
        (step['processor'], step.get('parameters', {}))
        for step in source.get('postprocessing', [])
    ] + measure_handling + [
        ('fiscal.model', model_params),
        ('fiscal.collect-fiscal-years', ),
        ('set_types', ),
        ('dump.to_path', {
            'out-path': 'denormalized',
        }),
    ]
    yield pipeline_steps, [], ''