def get_knesset_dataservice_pipeline(cls, pipeline_id, pipeline):
    if os.environ.get("DATASERVICE_LOAD_FROM_URL"):
        pipeline_steps = [
            ('load_resource', {
                "url": "http://storage.googleapis.com/knesset-data-pipelines/data/{}/{}/datapackage.json"
                       .format(pipeline['schemas-bucket'], pipeline_id),
                "resource": pipeline_id
            }),
        ]
    else:
        pipeline_steps = [
            ('..datapackage_pipelines_knesset.dataservice.processors.add_dataservice_collection_resource',
             pipeline["dataservice-parameters"]),
            ('..datapackage_pipelines_knesset.common.processors.throttle', {
                'rows-per-page': 50
            }),
        ]
    pipeline_steps += [('dump.to_path', {
        'out-path': '../data/{}/{}'.format(pipeline['schemas-bucket'],
                                           pipeline_id)
    })]
    yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
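# Every generator in this collection builds its step list through a shared
# steps() helper (in these projects it comes from
# datapackage_pipelines.generators). A minimal sketch of the behaviour the
# code here relies on: the indexer generators below probe the result with
# s.get("run") and splice in raw {"run": ..., "parameters": ...} dicts. The
# real helper may differ in detail; this is an illustration, not its source.
def steps(*specs):
    """Normalize step specs into {'run', 'parameters', 'cache'} dicts."""
    normalized = []
    for spec in specs:
        if isinstance(spec, str):
            # bare processor name, e.g. 'convert_to_key_value'
            normalized.append({'run': spec, 'parameters': {}, 'cache': False})
        else:
            # (name,), (name, params) or (name, params, cache)
            name, params, cache = (tuple(spec) + (None, False))[:3]
            normalized.append({
                'run': name,
                'parameters': params or {},
                'cache': bool(cache),
            })
    return normalized

# e.g. steps(('load_resource', {'url': 'x'}, True), 'my.processor')
# -> [{'run': 'load_resource', 'parameters': {'url': 'x'}, 'cache': True},
#     {'run': 'my.processor', 'parameters': {}, 'cache': False}]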
def generate_pipeline(cls, source):
    project_id = slugify(source['project'])
    schedule = SCHEDULE_DAILY
    discovered_steps = cls._get_pipeline_steps()
    for k, config in source['config'].items():
        # `k` corresponds with `label` in the pipeline_steps module.
        if k in discovered_steps:
            pipeline_id = slugify('{}-{}'.format(project_id, k))
            common_steps = [('add_metadata', {
                'project': project_id,
                'name': pipeline_id
            })]
            k_steps = discovered_steps[k](common_steps, pipeline_id,
                                          project_id, config)
            _steps = steps(*k_steps)
        else:
            log.warning('No {} pipeline generator available for {}'.format(
                k, project_id))
            continue
        pipeline_details = {'pipeline': _steps}
        if schedule is not None:
            pipeline_details['schedule'] = {'crontab': schedule}
        yield pipeline_id, pipeline_details
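# A hypothetical registry entry, only to illustrate the calling convention
# above: each discovered generator is invoked as
# fn(common_steps, pipeline_id, project_id, config) and must return a full
# list of step specs ready for steps(*...). The name, the 'url' config key,
# and the processors used here are illustrative, not from any real project.
def example_csv_steps(common_steps, pipeline_id, project_id, config):
    return common_steps + [
        ('add_resource', {'name': pipeline_id, 'url': config['url']}),
        ('stream_remote_resources', {}),
        ('dump.to_path', {'out-path': 'data/{}/{}'.format(project_id,
                                                          pipeline_id)}),
    ]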
def generate_pipeline(cls, source, base):
    all_pipeline_ids = []
    # the OS flows are processed exactly like the base flows, so run them in
    # one pass unless the source suppresses them
    flows = list(FLOWS)
    if not source.get('suppress-os', False):
        flows += OS_FLOWS
    for flow in flows:
        for pipeline_steps, deps, suffix in flow(source, base):
            pipeline_id = base + '/' + flow.__name__
            if suffix:
                pipeline_id += '_' + suffix
            pipeline_details = {
                'pipeline': steps(*pipeline_steps),
                'dependencies': [dict(pipeline=base + '/' + dep)
                                 for dep in deps]
            }
            all_pipeline_ids.append(pipeline_id)
            yield pipeline_id, pipeline_details
    # clean up dependencies unless keep-artifacts is True
    if not source.get('keep-artifacts', False):
        dirs_to_clean = ["denormalized", "normalized", "final"]
        pipeline_id = base + '/' + 'cleanup-dependencies'
        pipeline_details = {
            'pipeline': steps(('fiscal.cleanup-dependencies', {
                'dirs_to_clean': dirs_to_clean
            })),
            'dependencies': [{'pipeline': dep} for dep in all_pipeline_ids]
        }
        yield pipeline_id, pipeline_details
def get_db_dump_pipeline(cls, pipeline_id, pipeline):
    pipeline_steps = [
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/votes/votes/datapackage.json",
            "resource": "votes"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/committees/kns_committee/datapackage.json",
            "resource": "kns_committee"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/people/members/joined-mks/datapackage.json",
            "resource": "mk_individual"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/committee-meeting-attendees-mks-stats/datapackage.json",
            "resource": "mk_attendance"
        }),
        # remove positions and altnames because the oknesset DB doesn't
        # support jsonb
        # TODO: normalize altnames and positions to mk_individual or other
        # tables
        ("set_types", {
            "resources": "mk_individual",
            "types": {
                "positions": None,
                "altnames": None
            }
        }),
        # the connection string is resolved from the DPP_DB_ENGINE
        # environment variable
        ("dump.to_sql", {
            "engine": "env://DPP_DB_ENGINE",
            "tables": {
                "next_votes": {
                    "resource-name": "votes", "mode": "rewrite"},
                "next_kns_committee": {
                    "resource-name": "kns_committee", "mode": "rewrite"},
                "next_mk_individual": {
                    "resource-name": "mk_individual", "mode": "rewrite"},
                "next_mk_attendance": {
                    "resource-name": "mk_attendance", "mode": "rewrite"},
            }
        })
    ]
    yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
def filter_pipeline(cls, pipeline_id, pipeline):
    if pipeline.get("pipeline-type") == "knesset dataservice":
        yield from cls.get_knesset_dataservice_pipeline(pipeline_id, pipeline)
    elif pipeline.get("pipeline-type") == "all package":
        yield from cls.get_all_package_pipeline(pipeline_id, pipeline)
    else:
        pipeline["pipeline"] = steps(*[
            (step["run"], step.get("parameters", {}))
            for step in pipeline["pipeline"]
        ])
        yield pipeline_id, pipeline
def generate_pipeline(cls, source, wp):
    pipeline_id = dataset_name = slugify(source['name'])
    host = source['udata-instance']
    action = source['data-kind']
    if action == 'datasets-list':
        schedule = SCHEDULE_MONTHLY
        pipeline_steps = steps(*[
            ('udata.catalog', {'udata-instance': host}),
            ('add_metadata', {'name': dataset_name}),
            ('dump.to_zip', {'out-file': 'udata-list.zip'})
        ])
        pipeline_details = {
            'pipeline': pipeline_steps,
            'schedule': {'crontab': schedule}
        }
        yield pipeline_id, pipeline_details
    if action == 'dataset':
        pipeline_steps = steps(*[
            ('udata.fetch_metadata', {
                'host': source['udata-instance'],
                'kind': 'dataset',
                'id': source['dataset']
            }),
            ('add_metadata', {'name': source['name']}),
            ('dump.to_path', {
                'handle-non-tabular': 'true',
                'pretty-descriptor': 'true'
            })
        ])
        pipeline_details = {'pipeline': pipeline_steps}
        yield pipeline_id, pipeline_details
def get_all_package_pipeline(cls, pipeline_id, pipeline, base):
    assert pipeline['base-url'].startswith(
        'https://storage.googleapis.com/knesset-data-pipelines/')
    base_path = pipeline['base-url'].replace(
        'https://storage.googleapis.com/knesset-data-pipelines/', '')
    pipeline_steps = []
    dependencies = []
    for resource in pipeline["resources"]:
        pipeline_steps += [("load_resource", {
            "url": '../' + base_path + resource["name"] + "/datapackage.json",
            "resource": resource.get("resource", resource["name"])
        })]
        dependencies.append({
            'datapackage': base_path + resource["name"] + "/datapackage.json"
        })
        if resource.get("resource"):
            pipeline_steps += [("..rename_resource", {
                "src": resource["resource"],
                "dst": resource["name"]
            })]
        if resource.get('set_types'):
            pipeline_steps += [("set_types", {
                "resources": resource["name"],
                "types": resource['set_types']
            })]
    # pipeline_steps += [('dump.to_path',
    #                     {'out-path': pipeline["out-path"]})]
    pipeline_steps += [('dump.to_zip', {
        'out-file': pipeline["out-path"] + "/datapackage.zip",
        'pretty-descriptor': True
    })]
    storage_path = '{}all'.format(pipeline['base-url'].replace(
        'https://storage.googleapis.com/knesset-data-pipelines/', ''))
    storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
        storage_path)
    pipeline_steps += [(
        'knesset.dump_to_path',
        {
            'storage-url': storage_url,
            'out-path': '../{}'.format(storage_path)
        },
    )]
    yield os.path.join(base, pipeline_id), {
        'pipeline': steps(*pipeline_steps),
        'schedule': {'crontab': '10 1 * * *'},
        'dependencies': dependencies
    }
def generate_pipeline(cls, source):
    pipeline = []
    for action in source.get('actions', []):
        uuid, verb, options = action['uuid'], action['verb'], action['options']

        def step(processor, params):
            params['uuid'] = uuid
            params['revision'] = options['revision']
            return (processor, params, True)

        if verb == 'source':
            pipeline.append(step('datapipes.load_source',
                                 {'url': options['url'], 'res_name': uuid}))
        elif verb == 'skip':
            if options['kind'] == 'rows':
                pipeline.append(step('datapipes.skip_rows',
                                     {'amount': options['amount']}))
            elif options['kind'] == 'columns':
                pipeline.append(step('datapipes.skip_columns',
                                     {'amount': options['amount']}))
        elif verb == 'mutate':
            pipeline.append(step('datapipes.mutate',
                                 {'field': options['field'],
                                  'options': options['options']}))
        elif verb == 'filter':
            pipeline.append(step('datapipes.filter', {
                'field': options['field'],
                'op': options['op'],
                'arg': options['arg'],
            }))
            pipeline.append(step('datapipes.noop', {}))
        elif verb == 'headers':
            pipeline.append(step('datapipes.take_headers', {}))
        elif verb == 'noop':
            pipeline.append(step('datapipes.noop', {}))
    pipeline.append(('datapipes.noop', {'uuid': 'last'}, False))
    yield 'dp', {
        'pipeline': steps(
            ('datapipes.init',),
            *pipeline,
        )
    }
def get_knesset_dataservice_pipeline(cls, pipeline_id, pipeline):
    storage_path = "data/{}/{}".format(pipeline['schemas-bucket'], pipeline_id)
    storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
        storage_path)
    resource_name = pipeline_id
    if os.environ.get("DATASERVICE_LOAD_FROM_URL"):
        pipeline_steps = [
            ('load_resource', {
                "url": "{}/datapackage.json".format(storage_url),
                "resource": resource_name
            }),
        ]
    else:
        pipeline_steps = [
            ('..datapackage_pipelines_knesset.dataservice.processors.add_dataservice_collection_resource',
             pipeline["dataservice-parameters"]),
            ('..datapackage_pipelines_knesset.common.processors.throttle', {
                'rows-per-page': 50
            }),
        ]
    pipeline_steps += [(
        'knesset.dump_to_path',
        {
            'storage-url': storage_url,
            'out-path': '../{}'.format(storage_path)
        },
    )]
    dump_to_sql = 'knesset.dump_to_sql'
    table_name = '{}_{}'.format(pipeline['schemas-bucket'],
                                pipeline_id.replace('-', '_'))
    pipeline_steps += [(
        dump_to_sql,
        {
            'engine': 'env://DPP_DB_ENGINE',
            'tables': {
                table_name: {
                    'resource-name': pipeline_id,
                    'mode': 'rewrite',
                }
            }
        },
    )]
    yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
def get_all_package_pipeline(cls, pipeline_id, pipeline):
    pipeline_steps = []
    for resource in pipeline["resources"]:
        pipeline_steps += [("load_resource", {
            "url": pipeline["base-url"] + resource["name"] + "/datapackage.json",
            "resource": resource.get("resource", resource["name"])
        })]
        if resource.get("resource"):
            pipeline_steps += [("..rename_resource", {
                "src": resource["resource"],
                "dst": resource["name"]
            })]
        if resource.get('set_types'):
            pipeline_steps += [("set_types", {
                "resources": resource["name"],
                "types": resource['set_types']
            })]
    pipeline_steps += [('dump.to_path', {
        'out-path': pipeline["out-path"]
    })]
    pipeline_steps += [('dump.to_zip', {
        'out-file': pipeline["out-path"] + "/datapackage.zip"
    })]
    assert pipeline['base-url'].startswith(
        'https://storage.googleapis.com/knesset-data-pipelines/')
    storage_path = '{}all'.format(pipeline['base-url'].replace(
        'https://storage.googleapis.com/knesset-data-pipelines/', ''))
    storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
        storage_path)
    pipeline_steps += [(
        'knesset.dump_to_path',
        {
            'storage-url': storage_url,
            'out-path': '../{}'.format(storage_path)
        },
    )]
    yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
def get_all_package_pipeline(cls, pipeline_id, pipeline):
    pipeline_steps = []
    for resource in pipeline["resources"]:
        pipeline_steps += [("load_resource", {
            "url": pipeline["base-url"] + resource["name"] + "/datapackage.json",
            "resource": resource.get("resource", resource["name"])
        })]
        if resource.get("resource"):
            pipeline_steps += [("..rename_resource", {
                "src": resource["resource"],
                "dst": resource["name"]
            })]
    pipeline_steps += [('dump.to_path', {
        'out-path': pipeline["out-path"]
    })]
    pipeline_steps += [('dump.to_zip', {
        'out-file': pipeline["out-path"] + "/datapackage.zip"
    })]
    yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
def get_knesset_dataservice_pipeline(cls, pipeline_id, pipeline, base):
    storage_path = "data/{}/{}".format(pipeline['schemas-bucket'], pipeline_id)
    storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
        storage_path)
    if os.environ.get('KNESSET_PIPELINES_DATA_PATH'):
        storage_abspath = os.path.join(
            os.environ['KNESSET_PIPELINES_DATA_PATH'],
            pipeline['schemas-bucket'], pipeline_id)
    else:
        storage_abspath = None
    resource_name = pipeline_id
    pipeline_steps = []
    if os.environ.get('KNESSET_LOAD_FROM_URL'):
        if 'dependencies' in pipeline:
            del pipeline['dependencies']
        pipeline_steps += [
            ('load_resource', {
                "url": "{}/datapackage.json".format(storage_url),
                "resource": '.*',
                'log-progress-rows': 10000
            }, True),
        ]
    else:
        for pre_step in pipeline.get('pre-steps', []):
            pipeline_steps.append((pre_step['run'],
                                   pre_step.get('parameters', {}),
                                   pre_step.get('cache', False)))
        if os.environ.get("DATASERVICE_LOAD_FROM_URL"):
            pipeline_steps += [
                ('load_resource', {
                    "url": "{}/datapackage.json".format(storage_url),
                    "resource": resource_name,
                    'log-progress-rows': 10000,
                    'limit-rows': pipeline['dataservice-parameters'].get('limit-rows')
                }, True),
            ]
        else:
            incremental = (
                'incremental-field' in pipeline['dataservice-parameters'] and
                os.environ.get('KNESSET_DATASERVICE_INCREMENTAL'))
            if incremental:
                if not storage_abspath:
                    logging.error(
                        'please set the KNESSET_PIPELINES_DATA_PATH env var '
                        'to an absolute path for the data directory to use '
                        'incremental updates')
                    exit(1)
                # load the previous run's data so only new rows are fetched
                pipeline_steps += [('load_resource', {
                    "url": "{}/datapackage.json".format(storage_abspath),
                    'required': False,
                    "resources": {
                        resource_name: {
                            'name': 'last_' + resource_name,
                            'path': 'last_' + resource_name + '.csv'
                        }
                    }
                })]
            pipeline_steps += [
                ('..datapackage_pipelines_knesset.dataservice.processors.add_dataservice_collection_resource',
                 pipeline["dataservice-parameters"]),
                ('..datapackage_pipelines_knesset.common.processors.throttle', {
                    'rows-per-page': 50,
                    'resource': resource_name
                }),
            ]
            if incremental:
                pipeline_steps += [('sort', {
                    'resources': resource_name,
                    'sort-by': '{' + pipeline['dataservice-parameters']['incremental-field'] + '}'
                })]
        for additional_step in pipeline.get('additional-steps', []):
            pipeline_steps.append((additional_step['run'],
                                   additional_step.get('parameters', {}),
                                   additional_step.get('cache', False)))
    pipeline_steps += [(
        'knesset.dump_to_path',
        {
            'storage-url': storage_url,
            'out-path': '../{}'.format(storage_path)
        },
    )]
    dump_to_sql = 'knesset.dump_to_sql'
    table_name = '{}_{}'.format(pipeline['schemas-bucket'],
                                pipeline_id.replace('-', '_'))
    tables = {table_name: pipeline_id}
    tables.update(pipeline.get('additional-sql-tables', {}))
    tables = {
        name: {'resource-name': res, 'mode': 'rewrite'}
        for name, res in tables.items()
    }
    pipeline_steps += [(
        dump_to_sql,
        {'engine': 'env://DPP_DB_ENGINE', 'tables': tables},
    )]
    output_pipeline = {
        'pipeline': steps(*pipeline_steps),
        'dependencies': pipeline.get('dependencies', [])
    }
    # pipelines without dependencies run on a nightly schedule instead
    if not output_pipeline['dependencies']:
        output_pipeline['schedule'] = {'crontab': '10 1 * * *'}
    yield os.path.join(base, pipeline_id), output_pipeline
def generate_pipeline(cls, source):
    for doc_type, parameters in source.items():
        if parameters['kind'] == 'indexer':
            snake_doc_type = doc_type.replace('-', '_')
            dependent_pipeline_id = parameters['dependent_pipeline']
            source_datapackage = parameters['source_datapackage']
            if os.environ.get("ES_LOAD_FROM_URL") == "1":
                # this allows populating elasticsearch data without running
                # the dependent pipelines
                source_datapackage = source_datapackage.replace(
                    "/var/datapackages",
                    "http://next.obudget.org/datapackages")
            key_fields = parameters.get('key-fields', [])
            page_title_pattern = parameters.get('page-title-pattern')
            key_pattern = '/'.join([doc_type] +
                                   ['{%s}' % f for f in key_fields])
            key_pattern = parameters.get('key-pattern', key_pattern)
            pipeline_id = 'index_{}'.format(snake_doc_type)
            db_table = '_elasticsearch_mirror__{}'.format(snake_doc_type)
            revision = parameters.get('revision', 0)
            keep_history = parameters.get('keep-history', [])
            history_steps = []
            for kh in keep_history:
                history_steps.extend(
                    cls.history_steps(doc_type, key_fields, kh['fields'],
                                      kh.get('key'))
                )
            date_range_parameters = parameters.get('date-range', {})
            pipeline_steps = steps(*[
                ('add_metadata', {
                    'name': pipeline_id,
                }),
                ('load_resource', {
                    'url': source_datapackage,
                    'resource': doc_type,
                })
            ]) + parameters.get('extra-steps', []) + steps(*[
                ('set-revision', {'revision': revision}),
                ('manage-revisions', {
                    'resource-name': doc_type,
                    'db-table': db_table,
                    'key-fields': key_fields
                }),
                ('dump.to_sql', {
                    'tables': {
                        db_table: {
                            'resource-name': doc_type,
                            'mode': 'update'
                        }
                    }
                }),
                ('filter', {
                    'resources': doc_type,
                    'in': [
                        {'__next_update_days': 1},
                        # {'__next_update_days': 2},
                    ]
                }),
            ]) + history_steps + steps(*[
                ('add_doc_id', {'doc-id-pattern': key_pattern}),
                ('add_page_title', {'page-title-pattern': page_title_pattern}),
                ('add_date_range', date_range_parameters),
                ('dump_to_es', {
                    'indexes': {
                        'budgetkey': [
                            {'resource-name': doc_type, 'doc-type': doc_type}
                        ]
                    }
                }),
                ('dpdumper', {
                    'out-path': '/var/datapackages/budgetkey/{}'.format(doc_type)
                })
            ]) + parameters.get('document-steps', []) + steps(*[
                'convert_to_key_value',
                ('dump_to_es', {
                    'indexes': {
                        'budgetkey': [
                            {'resource-name': 'document',
                             'doc-type': 'document'}
                        ]
                    }
                }),
            ])
            if os.environ.get("ES_LIMIT_ROWS"):
                dump_to_sql_indices = [
                    i for i, s in enumerate(pipeline_steps)
                    if s.get("run") == "dump.to_sql"
                ]
                assert len(dump_to_sql_indices) > 0
                pipeline_steps.insert(
                    dump_to_sql_indices[0],
                    {"run": "limit_rows",
                     "parameters": {
                         "stop-after-rows": int(os.environ.get("ES_LIMIT_ROWS"))
                     }}
                )
            pipeline = {
                'dependencies': [
                    {'pipeline': dependent_pipeline_id}
                ],
                'pipeline': pipeline_steps
            }
            if os.environ.get("ES_LOAD_FROM_URL") == "1":
                del pipeline["dependencies"]
            yield pipeline_id, pipeline
def get_db_dump_pipeline(cls, pipeline_id, pipeline, base):
    pipeline_steps = [
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/members/presence/datapackage.json",
            "resource": "presence"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/knesset/kns_knessetdates/datapackage.json",
            "resource": "kns_knessetdates"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/votes/view_vote_mk_individual/datapackage.json",
            "resource": "view_vote_mk_individual"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/votes/view_vote_rslts_hdr_approved/datapackage.json",
            "resource": "view_vote_rslts_hdr_approved"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/votes/vote_result_type/datapackage.json",
            "resource": "vote_result_type"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/votes/vote_rslts_kmmbr_shadow/datapackage.json",
            "resource": "vote_rslts_kmmbr_shadow"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/committees/kns_committee/datapackage.json",
            "resource": "kns_committee"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/people/members/joined-mks/datapackage.json",
            "resource": "mk_individual"
        }),
        ("load_resource", {
            "url": "https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/committee-meeting-attendees-mks-stats/datapackage.json",
            "resource": "mk_attendance"
        }),
        # remove positions and altnames because the oknesset DB doesn't
        # support jsonb
        # TODO: normalize altnames and positions to mk_individual or other
        # tables
        ("set_types", {
            "resources": "mk_individual",
            "types": {
                "positions": None,
                "altnames": None
            }
        }),
        ("dump.to_sql", {
            "engine": "env://DPP_DB_ENGINE",
            "tables": {
                "next_members_presence": {
                    "resource-name": "presence", "mode": "rewrite"},
                "next_kns_knessetdates": {
                    "resource-name": "kns_knessetdates", "mode": "rewrite"},
                "next_view_vote_mk_individual": {
                    "resource-name": "view_vote_mk_individual",
                    "mode": "rewrite"},
                "next_view_vote_rslts_hdr_approved": {
                    "resource-name": "view_vote_rslts_hdr_approved",
                    "mode": "rewrite"},
                "next_vote_result_type": {
                    "resource-name": "vote_result_type", "mode": "rewrite"},
                "next_vote_rslts_kmmbr_shadow": {
                    "resource-name": "vote_rslts_kmmbr_shadow",
                    "mode": "rewrite"},
                "next_kns_committee": {
                    "resource-name": "kns_committee", "mode": "rewrite"},
                "next_mk_individual": {
                    "resource-name": "mk_individual", "mode": "rewrite"},
                "next_mk_attendance": {
                    "resource-name": "mk_attendance", "mode": "rewrite"},
            }
        })
    ]
    yield os.path.join(base, pipeline_id), {
        'pipeline': steps(*pipeline_steps),
        'schedule': {'crontab': '10 1 * * *'}
    }
def handle_issue(cls, issue, issue_policy):
    pipeline_id_format = issue_policy.get(
        'pipeline-id-format', 'issue/{issue-id:03}_{title-slug}')
    pipeline_steps = steps(['github.waiting-for-implementation'])
    yield pipeline_id_format, pipeline_steps
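# A quick illustration (hypothetical values, not from any real issue) of how
# the default format above renders once the caller fills in the fields;
# '{issue-id:03}' zero-pads the issue number to three digits:
print('issue/{issue-id:03}_{title-slug}'.format(
    **{'issue-id': 7, 'title-slug': 'add-csv-export'}))
# -> issue/007_add-csv-export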
def history_steps(cls, resource_name, primary_key, fields, history_key=None):
    # the history fields must not overlap the primary key
    assert len(set(primary_key).intersection(set(fields))) == 0
    if history_key is None:
        history_key = '_'.join(sorted(fields))
    db_table = 'history_{}_{}'.format(resource_name,
                                      history_key).replace('-', '_')
    target_resource_name = db_table
    return steps(*[
        ('duplicate', {
            'source': resource_name,
            'target-name': target_resource_name,
            'target-path': PATH_PLACEHOLDER
        }),
        ('concatenate', {
            'target': {
                'name': target_resource_name,
                'path': PATH_PLACEHOLDER
            },
            'sources': target_resource_name,
            'fields': dict((f, []) for f in primary_key + fields)
        }),
        ('add_timestamp', {
            'resource': target_resource_name
        }),
        # deduplicate: keep the last value per primary key + timestamp
        ('join', {
            'source': {
                'name': target_resource_name,
                'key': primary_key + ['__updated_timestamp'],
                'delete': True
            },
            'target': {
                'name': target_resource_name,
                'key': None
            },
            'fields': dict(
                (f, {'aggregate': 'last'} if f in fields else None)
                for f in primary_key + ['__updated_timestamp'] + fields
            )
        }),
        ('filter_updated_items', {
            'db_table': db_table,
            'resource': target_resource_name,
            'key_fields': primary_key,
            'value_fields': fields
        }),
        ('set_primary_key', {
            target_resource_name: primary_key + ['__updated_timestamp']
        }),
        ('dump.to_sql', {
            'tables': {
                db_table: {
                    'resource-name': target_resource_name,
                    'mode': 'update'
                }
            }
        }),
        ('drop_resource', {
            'resource': target_resource_name
        })
    ])
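# For example (hypothetical arguments), history_steps('budget', ['code'],
# ['amount']) derives history_key='amount' and appends steps that snapshot
# revisions of the `amount` field, keyed by code + __updated_timestamp, into
# the 'history_budget_amount' table.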
def generate_pipeline(cls, source):
    title = source['title']
    dataset_name = source.get('dataset-name', title)
    dataset_name = slugify(dataset_name).lower()
    pipeline_id = dataset_name
    resource_name = source.get('resource-name', dataset_name)

    for data_source in source['sources']:
        if data_source['url'].endswith('.csv'):
            data_source['mediatype'] = 'text/csv'
        if 'name' not in data_source:
            data_source['name'] = slugify(
                os.path.basename(data_source['url'])
            )

    model_params = {
        'options': dict(
            (f['header'], f['options'])
            for f in source['fields']
            if 'options' in f
        ),
        'os-types': dict(
            (f['header'], f['osType'])
            for f in source['fields']
        ),
        'titles': dict(
            (f['header'], f['title'])
            for f in source['fields']
            if 'title' in f
        ),
    }

    extra_measures = []
    measure_handling = []
    if 'measures' in source:
        measures = source['measures']
        normalise_measures = ('fiscal.normalise_measures', {
            'measures': measures['mapping']
        })
        if 'title' in measures:
            normalise_measures[1]['title'] = measures['title']
        measure_handling.append(normalise_measures)
        model_params['os-types']['value'] = 'value'
        model_params['options']['value'] = {
            'currency': measures['currency']
        }
        extra_measures = [
            (measure, [])
            for measure in source['measures']['mapping'].keys()
        ]

        if 'currency-conversion' in measures:
            currency_conversion = measures['currency-conversion']
            date_measure = currency_conversion.get('date_measure')
            if date_measure is None:
                date_measure = [
                    f['header']
                    for f in source['fields']
                    if f.get('osType', '').startswith('date:')
                ][0]
            currencies = measures.get('currencies', ['USD'])
            normalise_currencies = ('fiscal.normalise_currencies', {
                'measures': ['value'],
                'date-field': date_measure,
                'to-currencies': currencies,
                'from-currency': measures['currency']
            })
            # the title is checked on currency_conversion, so read it from
            # there as well
            if 'title' in currency_conversion:
                normalise_currencies[1]['title'] = currency_conversion['title']
            measure_handling.append(normalise_currencies)
            for currency in currencies:
                measure_name = 'value_{}'.format(currency)
                model_params['os-types'][measure_name] = 'value'
                model_params['options'][measure_name] = {
                    'currency': currency
                }

    deduplicate_lines = source.get('deduplicate') is True
    deduplicate_steps = []
    if deduplicate_lines:
        deduplicate_steps.append((
            'set_types',
            {
                'types': dict(
                    (f['header'],
                     dict(type='number', **f.get('options', {})))
                    for f in source['fields']
                    if f['osType'] == 'value'
                )
            }
        ))
        deduplicate_steps.append((
            'join',
            {
                'source': {
                    'name': resource_name,
                    'key': [
                        f['header']
                        for f in source['fields']
                        if f['osType'] != 'value'
                    ],
                    'delete': True
                },
                'target': {
                    'name': resource_name,
                    'key': None
                },
                'fields': dict(
                    (f['header'], {
                        'name': f['header'],
                        'aggregate': 'any' if f['osType'] != 'value' else 'sum'
                    })
                    for f in source['fields']
                )
            }
        ))

    partial_output_file = '{}.fdp.partial.zip'.format(pipeline_id)
    output_file = '{}.fdp.zip'.format(pipeline_id)
    pipeline_steps = [
        ('add_metadata', {
            'title': title,
            'name': dataset_name,
        })
    ] + [
        ('add_resource', src) for src in source['sources']
    ] + [
        ('stream_remote_resources', {}, True),
        ('concatenate', {
            'target': {'name': resource_name},
            'fields': dict(
                [(f['header'], f.get('aliases', []))
                 for f in source['fields']] + extra_measures
            )
        }),
    ] + deduplicate_steps + [
        (step['processor'], step.get('parameters', {}))
        for step in source.get('postprocessing', [])
    ] + measure_handling + [
        ('fiscal.model', model_params),
        ('dump.to_zip', {
            'out-file': partial_output_file,
        }),
        ('fiscal.split_resource_per_fiscal_year_and_dump_to_zip', {
            'in-file': partial_output_file,
            'out-file': output_file,
        }),
        ('fiscal.upload', {
            'in-file': output_file,
            'publish': True
        }),
    ]
    pipeline_details = {
        'pipeline': steps(*pipeline_steps),
    }
    yield pipeline_id, pipeline_details
def generate_pipeline(cls, source):
    pipeline_id = dataset_name = "estadisticasjudiciales"
    resources = []
    # find CSV files and build one add_resource step per file
    files = get_files("/mnt/datackan/provincias/", "csv")
    for f in files:
        obj = {
            "name": f["table"],
            "url": f["filename"],
            "format": "csv",
            "headers": 1
        }
        # only the first resource is cached
        if len(resources) < 1:
            resources.append(["add_resource", obj, True])
        else:
            resources.append(["add_resource", obj])
    logging.info("resources")
    logging.info(resources)
    # unpack the discovered add_resource steps alongside the fixed steps
    pipeline_steps = steps(*([
        ("add_metadata", {
            "processed_by": "datapackage_pipelines_estadisticasjudiciales"
        }),
    ] + resources + [
        ("stream_remote_resources", {
            "cache": True
        }),
        # dump to mysql
        # run tests
        ("dump.to_path", {
            "out-path": "testpath"
        }),
        # ("dump.to_mysql", {
        #     "out-path": "testpath"
        # }),
    ]))
    pipeline_details = {
        "pipeline": pipeline_steps,
        "schedule": {
            "crontab": SCHEDULE_MONTHLY
        }
    }
    logging.info("pipeline_steps")
    logging.info(pipeline_steps)
    yield pipeline_id, pipeline_details
def generate_pipeline(cls, source, base):
    all_pipelines = []
    sitemap_params = []
    bumper = source.get('bumper', 0)
    today = datetime.date.today()
    weeks_bump = (today - REF_DATE).days // 7
    bumper += weeks_bump
    for doc_type, parameters in source.items():
        if not isinstance(parameters, dict):
            continue
        if 'kind' not in parameters:
            continue
        if parameters['kind'] == 'indexer':
            snake_doc_type = doc_type.replace('-', '_')
            dependent_pipeline_id = parameters['dependent_pipeline']
            source_datapackage = parameters['source_datapackage']
            if os.environ.get("ES_LOAD_FROM_URL") == "1":
                # this allows populating elasticsearch data without running
                # the dependent pipelines
                source_datapackage = source_datapackage.replace(
                    "/var/datapackages",
                    "http://next.obudget.org/datapackages")
            key_fields = parameters.get('key-fields', [])
            page_title_pattern = parameters.get('page-title-pattern')
            key_pattern = '/'.join([doc_type] +
                                   ['{%s}' % f for f in key_fields])
            key_pattern = parameters.get('key-pattern', key_pattern)
            pipeline_id = os.path.join(base, 'index_{}'.format(snake_doc_type))
            db_table = '_elasticsearch_mirror__{}'.format(snake_doc_type)
            revision = parameters.get('revision', 0) + bumper
            if doc_type != 'people':
                all_pipelines.append(pipeline_id)
                sitemap_params.append({
                    'kind': doc_type,
                    'db-table': db_table,
                    'doc-id': key_pattern,
                    'page-title': page_title_pattern
                })
            keep_history = parameters.get('keep-history', [])
            history_steps = []
            for kh in keep_history:
                history_steps.extend(
                    cls.history_steps(doc_type, key_fields, kh['fields'],
                                      kh.get('key'))
                )
            date_range_parameters = parameters.get('date-range', {})
            pipeline_steps = steps(*[
                ('update_package', {
                    'name': pipeline_id,
                }),
                ('load_big', {
                    'from': source_datapackage,
                    'resource': doc_type,
                })
            ]) + parameters.get('extra-steps', []) + steps(*[
                ('set-revision', {'revision': revision}),
                ('manage-revisions', {
                    'resource-name': doc_type,
                    'db-table': db_table,
                    'key-fields': key_fields
                }),
                ('dump.to_sql', {
                    'tables': {
                        db_table: {
                            'resource-name': doc_type,
                            'mode': 'update'
                        }
                    }
                }),
                ('set-revisions', {}),
                ('filter', {
                    'resources': doc_type,
                    'in': [
                        {'__next_update_days': 1},
                        # {'__next_update_days': 2},
                    ]
                }),
            ]) + history_steps + steps(*[
                ('add_doc_id', {'doc-id-pattern': key_pattern}),
                ('add_page_title', {'page-title-pattern': page_title_pattern}),
                ('add_date_range', date_range_parameters),
                ('dump_to_es', {
                    'indexes': {
                        'budgetkey': [
                            {'resource-name': doc_type,
                             'doc-type': doc_type,
                             'revision': revision}
                        ]
                    }
                }),
                ('dpdumper', {
                    'out-path': '/var/datapackages/budgetkey/{}'.format(doc_type)
                })
            ]) + parameters.get('document-steps', []) + steps(*[
                'convert_to_key_value',
                ('dump_to_es', {
                    'indexes': {
                        'budgetkey': [
                            {'resource-name': 'document',
                             'doc-type': 'document'}
                        ]
                    }
                }),
            ])
            if os.environ.get("ES_LIMIT_ROWS"):
                dump_to_sql_indices = [
                    i for i, s in enumerate(pipeline_steps)
                    if s.get("run") == "dump.to_sql"
                ]
                assert len(dump_to_sql_indices) > 0
                pipeline_steps.insert(
                    dump_to_sql_indices[0],
                    {"run": "limit_rows",
                     "parameters": {
                         "stop-after-rows": int(os.environ.get("ES_LIMIT_ROWS"))
                     }}
                )
            pipeline = {
                'dependencies': [
                    {'pipeline': dependent_pipeline_id}
                ],
                'pipeline': pipeline_steps
            }
            if os.environ.get("ES_LOAD_FROM_URL") == "1":
                del pipeline["dependencies"]
            yield pipeline_id, pipeline
    sitemaps_pipeline = {
        'dependencies': [
            {'pipeline': pipeline_id} for pipeline_id in all_pipelines
        ],
        'pipeline': steps(*[
            ('build_sitemaps', params) for params in sitemap_params
        ] + [
            ('build_sitemaps_index', {})
        ])
    }
    yield os.path.join(base, 'sitemaps'), sitemaps_pipeline
def _plan(revision, spec, **config):
    """Plan a flow according to spec"""
    meta = spec['meta']
    flow_id = '{ownerid}/{dataset}/{revision}'.format(**meta,
                                                      revision=revision)
    dataset_id = '{ownerid}/{dataset}'.format(**meta)
    ownerid = meta['ownerid']
    dataset = meta['dataset']
    owner = meta.get('owner')
    findability = meta.get('findability', 'published')
    acl = 'public-read'
    if findability == 'private':
        acl = 'private'
    update_time = meta.get('update_time')
    create_time = meta.get('create_time')

    inputs = spec.get('inputs', [])
    assert len(inputs) == 1, 'Only supporting one input atm'
    input = inputs[0]
    assert input['kind'] == 'datapackage', \
        'Only supporting datapackage inputs atm'

    inner_pipeline_ids = []
    outputs = spec.get('outputs', [])
    zip_there = any(output['kind'] == 'zip' for output in outputs)
    if not zip_there:
        zip_output = {
            'kind': 'zip',
            'parameters': {
                'out-file': '%s.zip' % (meta['dataset'])
            }
        }
        outputs.append(zip_output)

    datahub_step = ('assembler.update_metadata', {
        'ownerid': ownerid,
        'owner': owner,
        'findability': findability,
        'flowid': flow_id,
        'modified': update_time,
        'created': create_time,
        'id': dataset_id
    })

    def planner_pipelines():
        planner_gen = planner(input, flow_id, spec.get('processing', []),
                              outputs, **config)
        inner_pipeline_id = None
        while True:
            try:
                inner_pipeline_id, pipeline_steps, dependencies, title, \
                    content_type = planner_gen.send(inner_pipeline_id)
            except StopIteration:
                # PEP 479: on Python 3.7+ a StopIteration escaping send()
                # would surface as a RuntimeError inside this generator, so
                # stop explicitly once the planner is exhausted
                break
            inner_pipeline_ids.append(inner_pipeline_id)
            pid_without_revision = inner_pipeline_id.replace(
                '/{}/'.format(revision), '/')
            pipeline_steps.insert(0, datahub_step)
            pipeline_steps.extend(
                dump_steps(pid_without_revision, content_type=content_type))
            dependencies = [dict(pipeline='./' + d) for d in dependencies]
            pipeline = {
                'pipeline': steps(*pipeline_steps),
                'dependencies': dependencies,
                'title': title
            }
            yield inner_pipeline_id, pipeline
            inner_pipeline_id = 'dependency://./' + inner_pipeline_id

    yield from planner_pipelines()

    dependencies = [dict(pipeline='./' + pid) for pid in inner_pipeline_ids]
    datapackage_descriptor = input['parameters']['descriptor']
    final_steps = [
        ('add_metadata',
         dict((k, v) for k, v in datapackage_descriptor.items()
              if k != 'resources')),
        datahub_step,
        ('assembler.load_modified_resources', {
            'urls': dependencies
        }),
    ]
    final_steps.extend(
        dump_steps(flow_id, content_type='application/json', final=True))
    if not os.environ.get('PLANNER_LOCAL'):
        final_steps.append(('aws.change_acl', {
            'bucket': os.environ['PKGSTORE_BUCKET'],
            'path': '{}/{}'.format(ownerid, dataset),
            'acl': acl
        }))
    pipeline = {
        'update_time': update_time,
        'dependencies': dependencies,
        'pipeline': steps(*final_steps),
        'title': 'Creating Package'
    }
    yield flow_id, pipeline