def get_knesset_dataservice_pipeline(cls, pipeline_id, pipeline):
     if os.environ.get("DATASERVICE_LOAD_FROM_URL"):
         pipeline_steps = [
             ('load_resource', {
                 "url":
                 "http://storage.googleapis.com/knesset-data-pipelines/data/{}/{}/datapackage.json"
                 .format(pipeline['schemas-bucket'], pipeline_id),
                 "resource":
                 pipeline_id
             }),
         ]
     else:
         pipeline_steps = [
             ('..datapackage_pipelines_knesset.dataservice.processors.add_dataservice_collection_resource',
              pipeline["dataservice-parameters"]),
             ('..datapackage_pipelines_knesset.common.processors.throttle',
              {
                  'rows-per-page': 50
              }),
         ]
     pipeline_steps += [('dump.to_path', {
         'out-path':
         '../data/{}/{}'.format(pipeline['schemas-bucket'], pipeline_id)
     })]
     yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
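
All of these examples rely on a steps() helper (imported in the original projects from datapackage_pipelines.generators) to turn (processor, parameters, cache) tuples into step dictionaries. A minimal sketch of how such a helper behaves, stated as an assumption for readers who only see the snippets here:

def steps(*args):
    # Sketch only: assumed to mirror the helper the examples import.
    def arg_to_step(arg):
        if isinstance(arg, str):
            return {'run': arg}
        return dict(zip(['run', 'parameters', 'cache'], arg))
    return [arg_to_step(arg) for arg in args]

print(steps(('dump.to_path', {'out-path': '../data/bucket/pipeline'})))
# [{'run': 'dump.to_path', 'parameters': {'out-path': '../data/bucket/pipeline'}}]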
Example #2
    def generate_pipeline(cls, source):
        project_id = slugify(source['project'])
        schedule = SCHEDULE_DAILY

        discovered_steps = cls._get_pipeline_steps()

        for k, config in source['config'].items():
            # `k` corresponds to a `label` in the pipeline_steps module.
            if k in discovered_steps.keys():
                pipeline_id = slugify('{}-{}'.format(project_id, k))

                common_steps = [('add_metadata', {
                    'project': project_id,
                    'name': pipeline_id
                })]

                k_steps = discovered_steps[k](common_steps, pipeline_id,
                                              project_id, config)
                _steps = steps(*k_steps)
            else:
                log.warn('No {} pipeline generator available for {}'.format(
                    k, project_id))
                continue

            pipeline_details = {'pipeline': _steps}
            if schedule is not None:
                pipeline_details['schedule'] = {'crontab': schedule}

            yield pipeline_id, pipeline_details
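
For illustration, a hypothetical entry that _get_pipeline_steps() above might return for a label such as 'validate'; the label and the config keys are made up, but the call signature matches the k_steps call in the generator:

def validate_steps(common_steps, pipeline_id, project_id, config):
    # Hypothetical steps generator for the 'validate' label; the config keys
    # ('datapackage', 'resource') are illustrative assumptions.
    return common_steps + [
        ('load_resource', {'url': config['datapackage'],
                           'resource': config['resource']}),
        ('dump.to_path', {'out-path': 'data/{}'.format(pipeline_id)}),
    ]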
    def generate_pipeline(cls, source, base):

        all_pipeline_ids = []

        for flow in FLOWS:
            for pipeline_steps, deps, suffix in flow(source, base):
                pipeline_id = base + '/' + flow.__name__
                if suffix:
                    pipeline_id += '_' + suffix
                pipeline_details = {
                    'pipeline':
                    steps(*pipeline_steps),
                    'dependencies':
                    [dict(pipeline=base + '/' + dep) for dep in deps]
                }
                all_pipeline_ids.append(pipeline_id)
                yield pipeline_id, pipeline_details

        if not source.get('suppress-os', False):
            for flow in OS_FLOWS:
                for pipeline_steps, deps, suffix in flow(source, base):
                    pipeline_id = base + '/' + flow.__name__
                    if suffix:
                        pipeline_id += '_' + suffix
                    pipeline_details = {
                        'pipeline':
                        steps(*pipeline_steps),
                        'dependencies':
                        [dict(pipeline=base + '/' + dep) for dep in deps]
                    }
                    all_pipeline_ids.append(pipeline_id)
                    yield pipeline_id, pipeline_details

        # clean up dependencies if keep-artifacts is not True.
        if not source.get('keep-artifacts', False):
            dirs_to_clean = ["denormalized", "normalized", "final"]
            pipeline_id = base + '/' + 'cleanup-dependencies'
            pipeline_details = {
                'pipeline':
                steps(('fiscal.cleanup-dependencies', {
                    'dirs_to_clean': dirs_to_clean
                })),
                'dependencies': [{
                    'pipeline': dep
                } for dep in all_pipeline_ids]
            }
            yield pipeline_id, pipeline_details
 def get_db_dump_pipeline(cls, pipeline_id, pipeline):
     pipeline_steps = [
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/votes/votes/datapackage.json",
             "resource": "votes"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/committees/kns_committee/datapackage.json",
             "resource": "kns_committee"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/people/members/joined-mks/datapackage.json",
             "resource": "mk_individual"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/committee-meeting-attendees-mks-stats/datapackage.json",
             "resource": "mk_attendance"
         }),
         # remove positions and altnames because oknesset DB doesn't support jsonb
         # TODO: normalize altnames and positions to mk_individual or other tables
         ("set_types", {
             "resources": "mk_individual",
             "types": {
                 "positions": None,
                 "altnames": None
             }
         }),
         ("dump.to_sql", {
             "engine": "env://DPP_DB_ENGINE",
             "tables": {
                 "next_votes": {
                     "resource-name": "votes",
                     "mode": "rewrite"
                 },
                 "next_kns_committee": {
                     "resource-name": "kns_committee",
                     "mode": "rewrite"
                 },
                 "next_mk_individual": {
                     "resource-name": "mk_individual",
                     "mode": "rewrite"
                 },
                 "next_mk_attendance": {
                     "resource-name": "mk_attendance",
                     "mode": "rewrite"
                 },
             }
         })
     ]
     yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
 def filter_pipeline(cls, pipeline_id, pipeline):
     if pipeline.get("pipeline-type") == "knesset dataservice":
         yield from cls.get_knesset_dataservice_pipeline(
             pipeline_id, pipeline)
     elif pipeline.get("pipeline-type") == "all package":
         yield from cls.get_all_package_pipeline(pipeline_id, pipeline)
     else:
         pipeline["pipeline"] = steps(*[(step["run"],
                                         step.get("parameters", {}))
                                        for step in pipeline["pipeline"]])
         yield pipeline_id, pipeline
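
For reference, a hypothetical spec entry that the dispatcher above would route to get_knesset_dataservice_pipeline; the keys are the ones the code reads, while the values are purely illustrative:

example_pipeline = {
    'pipeline-type': 'knesset dataservice',          # dispatch key checked above
    'schemas-bucket': 'committees',                  # illustrative bucket name
    'dataservice-parameters': {'limit-rows': 1000},  # illustrative parameters
}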
    def generate_pipeline(cls, source, wp):
        pipeline_id = dataset_name = slugify(source['name'])
        host = source['udata-instance']
        action = source['data-kind']

        if action == 'datasets-list':
            schedule = SCHEDULE_MONTHLY
            pipeline_steps = steps(*[('udata.catalog', {
                'udata-instance': host
            }), ('add_metadata', {
                'name': dataset_name
            }), ('dump.to_zip', {
                'out-file': 'udata-list.zip'
            })])

            pipeline_details = {
                'pipeline': pipeline_steps,
                'schedule': {
                    'crontab': schedule
                }
            }

            yield pipeline_id, pipeline_details

        if action == 'dataset':

            pipeline_steps = steps(*[('udata.fetch_metadata', {
                'host': source['udata-instance'],
                'kind': 'dataset',
                'id': source['dataset']
            }), ('add_metadata', {
                'name': source['name']
            }),
                                     ('dump.to_path', {
                                         'handle-non-tabular': 'true',
                                         'pretty-descriptor': 'true'
                                     })])

            pipeline_details = {'pipeline': pipeline_steps}

            yield pipeline_id, pipeline_details
Example #7
 def get_all_package_pipeline(cls, pipeline_id, pipeline, base):
     assert pipeline['base-url'].startswith(
         'https://storage.googleapis.com/knesset-data-pipelines/')
     base_path = pipeline['base-url'].replace(
         'https://storage.googleapis.com/knesset-data-pipelines/', '')
     pipeline_steps = []
     dependencies = []
     for resource in pipeline["resources"]:
         pipeline_steps += [("load_resource", {
             "url":
             '../' + base_path + resource["name"] + "/datapackage.json",
             "resource":
             resource.get("resource", resource["name"])
         })]
         dependencies.append({
             'datapackage':
             base_path + resource["name"] + "/datapackage.json"
         })
         if resource.get("resource"):
             pipeline_steps += [("..rename_resource", {
                 "src": resource["resource"],
                 "dst": resource["name"]
             })]
         if resource.get('set_types'):
             pipeline_steps += [("set_types", {
                 "resources": resource["name"],
                 "types": resource['set_types']
             })]
     # pipeline_steps += [('dump.to_path',
     #                     {'out-path': pipeline["out-path"]})]
     pipeline_steps += [('dump.to_zip', {
         'out-file': pipeline["out-path"] + "/datapackage.zip",
         'pretty-descriptor': True
     })]
     storage_path = '{}all'.format(pipeline['base-url'].replace(
         'https://storage.googleapis.com/knesset-data-pipelines/', ''))
     storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
         storage_path)
     pipeline_steps += [(
         'knesset.dump_to_path',
         {
             'storage-url': storage_url,
             'out-path': '../{}'.format(storage_path)
         },
     )]
     yield os.path.join(base, pipeline_id), {
         'pipeline': steps(*pipeline_steps),
         'schedule': {
             'crontab': '10 1 * * *'
         },
         'dependencies': dependencies
     }
Example #8
    def generate_pipeline(cls, source):
        pipeline = []
        for action in source.get('actions', []):
            uuid, verb, options = action['uuid'], action['verb'], action['options']

            def step(processor, params):
                params['uuid'] = uuid
                params['revision'] = options['revision']
                return (processor,
                        params,
                        True)

            if verb == 'source':
                pipeline.append(step('datapipes.load_source',
                                     {'url': options['url'], 'res_name': uuid}))
            elif verb == 'skip':
                if options['kind'] == 'rows':
                    pipeline.append(step('datapipes.skip_rows',
                                         {'amount': options['amount']}))
                elif options['kind'] == 'columns':
                    pipeline.append(step('datapipes.skip_columns',
                                         {'amount': options['amount']}))
            elif verb == 'mutate':
                pipeline.append(step('datapipes.mutate',
                                     {'field': options['field'],
                                      'options': options['options']}))
            elif verb == 'filter':
                pipeline.append(step('datapipes.filter',
                                     {
                                         'field': options['field'],
                                         'op': options['op'],
                                         'arg': options['arg'],
                                     }
                                ))
                pipeline.append(step('datapipes.noop', {}))
            elif verb == 'headers':
                pipeline.append(step('datapipes.take_headers', {}))
            elif verb == 'noop':
                pipeline.append(step('datapipes.noop', {}))

        pipeline.append(('datapipes.noop', {'uuid': 'last'}, False))

        yield 'dp', {
            'pipeline':
                steps(
                    ('datapipes.init', ),
                    *pipeline,
                )
        }
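
A hypothetical source spec that the action loop above would consume; the field names ('actions', 'uuid', 'verb', 'options') come from the code, while the values are made up:

example_source = {
    'actions': [
        {'uuid': 'a1', 'verb': 'source',
         'options': {'url': 'data.csv', 'revision': 1}},
        {'uuid': 'a2', 'verb': 'skip',
         'options': {'kind': 'rows', 'amount': 2, 'revision': 1}},
    ]
}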
Example #9
 def get_knesset_dataservice_pipeline(cls, pipeline_id, pipeline):
     storage_path = "data/{}/{}".format(pipeline['schemas-bucket'],
                                        pipeline_id)
     storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
         storage_path)
     resource_name = pipeline_id
     if os.environ.get("DATASERVICE_LOAD_FROM_URL"):
         pipeline_steps = [
             ('load_resource', {
                 "url": "{}/datapackage.json".format(storage_url),
                 "resource": resource_name
             }),
         ]
     else:
         pipeline_steps = [
             ('..datapackage_pipelines_knesset.dataservice.processors.add_dataservice_collection_resource',
              pipeline["dataservice-parameters"]),
             ('..datapackage_pipelines_knesset.common.processors.throttle',
              {
                  'rows-per-page': 50
              }),
         ]
     pipeline_steps += [(
         'knesset.dump_to_path',
         {
             'storage-url': storage_url,
             'out-path': '../{}'.format(storage_path)
         },
     )]
     dump_to_sql = 'knesset.dump_to_sql'
     table_name = '{}_{}'.format(pipeline['schemas-bucket'],
                                 pipeline_id.replace('-', '_'))
     pipeline_steps += [(
         dump_to_sql,
         {
             'engine': 'env://DPP_DB_ENGINE',
             'tables': {
                 table_name: {
                     'resource-name': pipeline_id,
                     'mode': 'rewrite',
                 }
             }
         },
     )]
     yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
Example #10
 def get_all_package_pipeline(cls, pipeline_id, pipeline):
     pipeline_steps = []
     for resource in pipeline["resources"]:
         pipeline_steps += [("load_resource", {
             "url":
             pipeline["base-url"] + resource["name"] + "/datapackage.json",
             "resource":
             resource.get("resource", resource["name"])
         })]
         if resource.get("resource"):
             pipeline_steps += [("..rename_resource", {
                 "src": resource["resource"],
                 "dst": resource["name"]
             })]
         if resource.get('set_types'):
             pipeline_steps += [("set_types", {
                 "resources": resource["name"],
                 "types": resource['set_types']
             })]
     pipeline_steps += [('dump.to_path', {
         'out-path': pipeline["out-path"]
     })]
     pipeline_steps += [('dump.to_zip', {
         'out-file':
         pipeline["out-path"] + "/datapackage.zip"
     })]
     assert pipeline['base-url'].startswith(
         'https://storage.googleapis.com/knesset-data-pipelines/')
     storage_path = '{}all'.format(pipeline['base-url'].replace(
         'https://storage.googleapis.com/knesset-data-pipelines/', ''))
     storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
         storage_path)
     pipeline_steps += [(
         'knesset.dump_to_path',
         {
             'storage-url': storage_url,
             'out-path': '../{}'.format(storage_path)
         },
     )]
     yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
Example #11
 def get_all_package_pipeline(cls, pipeline_id, pipeline):
     pipeline_steps = []
     for resource in pipeline["resources"]:
         pipeline_steps += [("load_resource", {
             "url":
             pipeline["base-url"] + resource["name"] + "/datapackage.json",
             "resource":
             resource.get("resource", resource["name"])
         })]
         if resource.get("resource"):
             pipeline_steps += [("..rename_resource", {
                 "src": resource["resource"],
                 "dst": resource["name"]
             })]
     pipeline_steps += [('dump.to_path', {
         'out-path': pipeline["out-path"]
     })]
     pipeline_steps += [('dump.to_zip', {
         'out-file':
         pipeline["out-path"] + "/datapackage.zip"
     })]
     yield pipeline_id, {'pipeline': steps(*pipeline_steps)}
Example #12
    def planner_pipelines():
        planner_gen = planner(input, flow_id, spec.get('processing', []),
                              outputs, **config)
        inner_pipeline_id = None
        while True:
            inner_pipeline_id, pipeline_steps, dependencies, title, content_type = planner_gen.send(
                inner_pipeline_id)
            inner_pipeline_ids.append(inner_pipeline_id)

            pid_without_revision = inner_pipeline_id.replace(
                '/{}/'.format(revision), '/')

            pipeline_steps.insert(0, datahub_step)
            pipeline_steps.extend(
                dump_steps(pid_without_revision, content_type=content_type))
            dependencies = [dict(pipeline='./' + d) for d in dependencies]

            pipeline = {
                'pipeline': steps(*pipeline_steps),
                'dependencies': dependencies,
                'title': title
            }
            yield inner_pipeline_id, pipeline
            inner_pipeline_id = 'dependency://./' + inner_pipeline_id
Example #13
 def get_knesset_dataservice_pipeline(cls, pipeline_id, pipeline, base):
     storage_path = "data/{}/{}".format(pipeline['schemas-bucket'],
                                        pipeline_id)
     storage_url = "http://storage.googleapis.com/knesset-data-pipelines/{}".format(
         storage_path)
     if os.environ.get('KNESSET_PIPELINES_DATA_PATH'):
         storage_abspath = os.path.join(
             os.environ['KNESSET_PIPELINES_DATA_PATH'],
             pipeline['schemas-bucket'], pipeline_id)
     else:
         storage_abspath = None
     resource_name = pipeline_id
     pipeline_steps = []
     if os.environ.get('KNESSET_LOAD_FROM_URL'):
         if 'dependencies' in pipeline:
             del pipeline['dependencies']
         pipeline_steps += [
             ('load_resource', {
                 "url": "{}/datapackage.json".format(storage_url),
                 "resource": '.*',
                 'log-progress-rows': 10000
             }, True),
         ]
     else:
         for pre_step in pipeline.get('pre-steps', []):
             pipeline_steps.append(
                 (pre_step['run'], pre_step.get('parameters', {}),
                  pre_step.get('cache', False)))
         if os.environ.get("DATASERVICE_LOAD_FROM_URL"):
             pipeline_steps += [
                 ('load_resource', {
                     "url":
                     "{}/datapackage.json".format(storage_url),
                     "resource":
                     resource_name,
                     'log-progress-rows':
                     10000,
                     'limit-rows':
                     pipeline['dataservice-parameters'].get('limit-rows')
                 }, True),
             ]
         else:
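              # incremental mode: load the previously dumped data as a
              # 'last_<resource_name>' copy alongside the fresh dataservice
              # fetch, which is then sorted by the incremental field below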
             if ('incremental-field' in pipeline['dataservice-parameters']
                     and os.environ.get('KNESSET_DATASERVICE_INCREMENTAL')):
                 if not storage_abspath:
                      logging.error(
                          'please set the KNESSET_PIPELINES_DATA_PATH env var to the absolute path of the data directory to use incremental updates'
                      )
                      exit(1)
                 pipeline_steps += [('load_resource', {
                     "url":
                     "{}/datapackage.json".format(storage_abspath),
                     'required':
                     False,
                     "resources": {
                         resource_name: {
                             'name': 'last_' + resource_name,
                             'path': 'last_' + resource_name + '.csv'
                         }
                     }
                 })]
             pipeline_steps += [
                 ('..datapackage_pipelines_knesset.dataservice.processors.add_dataservice_collection_resource',
                  pipeline["dataservice-parameters"]),
                 ('..datapackage_pipelines_knesset.common.processors.throttle',
                  {
                      'rows-per-page': 50,
                      'resource': resource_name
                  }),
             ]
             if ('incremental-field' in pipeline['dataservice-parameters']
                     and os.environ.get('KNESSET_DATASERVICE_INCREMENTAL')):
                 pipeline_steps += [('sort', {
                     'resources':
                     resource_name,
                     'sort-by':
                     '{' +
                     pipeline['dataservice-parameters']['incremental-field']
                     + '}'
                 })]
         for additional_step in pipeline.get('additional-steps', []):
             pipeline_steps.append((additional_step['run'],
                                    additional_step.get('parameters', {}),
                                    additional_step.get('cache', False)))
     pipeline_steps += [(
         'knesset.dump_to_path',
         {
             'storage-url': storage_url,
             'out-path': '../{}'.format(storage_path)
         },
     )]
     dump_to_sql = 'knesset.dump_to_sql'
     table_name = '{}_{}'.format(pipeline['schemas-bucket'],
                                 pipeline_id.replace('-', '_'))
     tables = {table_name: pipeline_id}
     tables.update(pipeline.get('additional-sql-tables', {}))
     tables = {
         table_name: {
             'resource-name': resource_name,
             'mode': 'rewrite'
         }
         for table_name, resource_name in tables.items()
     }
     pipeline_steps += [(
         dump_to_sql,
         {
             'engine': 'env://DPP_DB_ENGINE',
             'tables': tables
         },
     )]
     output_pipeline = {
         'pipeline': steps(*pipeline_steps),
         'dependencies': pipeline.get('dependencies', [])
     }
     if pipeline.get('dependencies'):
         output_pipeline['dependencies'] = pipeline['dependencies']
     else:
         output_pipeline['schedule'] = {'crontab': '10 1 * * *'}
     yield os.path.join(base, pipeline_id), output_pipeline
Example #14
    def generate_pipeline(cls, source):
        for doc_type, parameters in source.items():
            if parameters['kind'] == 'indexer':
                snake_doc_type = doc_type.replace('-', '_')
                dependent_pipeline_id = parameters['dependent_pipeline']
                source_datapackage = parameters['source_datapackage']
                if os.environ.get("ES_LOAD_FROM_URL") == "1":
                    # this allows populating Elasticsearch data without running the dependent pipelines
                    source_datapackage = source_datapackage.replace("/var/datapackages", "http://next.obudget.org/datapackages")
                key_fields = parameters.get('key-fields', [])
                page_title_pattern = parameters.get('page-title-pattern')
                key_pattern = '/'.join([doc_type] + ['{%s}' % f for f in key_fields])
                key_pattern = parameters.get('key-pattern', key_pattern)
                pipeline_id = 'index_{}'.format(snake_doc_type)
                db_table = '_elasticsearch_mirror__{}'.format(snake_doc_type)
                revision = parameters.get('revision', 0)
                keep_history = parameters.get('keep-history', [])
                history_steps = []
                for kh in keep_history:
                    history_steps.extend(
                        cls.history_steps(doc_type, key_fields, kh['fields'], kh.get('key'))
                    )
                date_range_parameters = parameters.get('date-range', {})

                pipeline_steps = steps(*[
                    ('add_metadata', {
                        'name': pipeline_id,
                    }),
                    ('load_resource', {
                        'url': source_datapackage,
                        'resource': doc_type,
                    })]) + parameters.get('extra-steps', []) + steps(*[
                    ('set-revision', {'revision': revision}),
                    ('manage-revisions', {
                        'resource-name': doc_type,
                        'db-table': db_table,
                        'key-fields': key_fields
                    }),
                    ('dump.to_sql', {
                        'tables': {
                            db_table: {
                                'resource-name': doc_type,
                                'mode': 'update'
                            }
                        }
                    }),
                    ('filter', {
                        'resources': doc_type,
                        'in': [
                            {'__next_update_days': 1},
                            # {'__next_update_days': 2},
                        ]
                    }),
                ]) + history_steps + steps(*[
                    ('add_doc_id', {
                        'doc-id-pattern': key_pattern
                    }),
                    ('add_page_title', {
                        'page-title-pattern': page_title_pattern
                    }),
                    ('add_date_range', date_range_parameters),
                    ('dump_to_es', {
                        'indexes': {
                            'budgetkey': [
                                {'resource-name': doc_type,
                                 'doc-type': doc_type}
                            ]
                        }
                    }),
                    ('dpdumper', {
                        'out-path': '/var/datapackages/budgetkey/{}'.format(doc_type)
                    })                    
                ]) + parameters.get('document-steps', []) + steps(*[                   
                    'convert_to_key_value',
                    ('dump_to_es', {
                        'indexes': {
                            'budgetkey': [
                                {'resource-name': 'document',
                                 'doc-type': 'document'}
                            ]
                        }
                    }),
                ])

                if os.environ.get("ES_LIMIT_ROWS"):
                    dump_to_sql_indices = [i for i, s in enumerate(pipeline_steps) if s.get("run") == "dump.to_sql"]
                    assert len(dump_to_sql_indices) > 0
                    pipeline_steps.insert(
                        dump_to_sql_indices[0],
                        {"run": "limit_rows", "parameters": {"stop-after-rows": int(os.environ.get("ES_LIMIT_ROWS"))}}
                    )

                pipeline = {
                    'dependencies': [
                        {'pipeline': dependent_pipeline_id}
                    ],
                    'pipeline': pipeline_steps
                }
                if os.environ.get("ES_LOAD_FROM_URL") == "1":
                    del pipeline["dependencies"]
                yield pipeline_id, pipeline
Example #15
 def get_db_dump_pipeline(cls, pipeline_id, pipeline, base):
     pipeline_steps = [
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/members/presence/datapackage.json",
             "resource": "presence"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/knesset/kns_knessetdates/datapackage.json",
             "resource": "kns_knessetdates"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/votes/view_vote_mk_individual/datapackage.json",
             "resource": "view_vote_mk_individual"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/votes/view_vote_rslts_hdr_approved/datapackage.json",
             "resource": "view_vote_rslts_hdr_approved"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/votes/vote_result_type/datapackage.json",
             "resource": "vote_result_type"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/votes/vote_rslts_kmmbr_shadow/datapackage.json",
             "resource": "vote_rslts_kmmbr_shadow"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/committees/kns_committee/datapackage.json",
             "resource": "kns_committee"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/people/members/joined-mks/datapackage.json",
             "resource": "mk_individual"
         }),
         ("load_resource", {
             "url":
             "https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/committee-meeting-attendees-mks-stats/datapackage.json",
             "resource": "mk_attendance"
         }),
         # remove positions and altnames because oknesset DB doesn't support jsonb
         # TODO: normalize altnames and positions to mk_individual or other tables
         ("set_types", {
             "resources": "mk_individual",
             "types": {
                 "positions": None,
                 "altnames": None
             }
         }),
         ("dump.to_sql", {
             "engine": "env://DPP_DB_ENGINE",
             "tables": {
                 "next_members_presence": {
                     "resource-name": "presence",
                     "mode": "rewrite"
                 },
                 "next_kns_knessetdates": {
                     "resource-name": "kns_knessetdates",
                     "mode": "rewrite"
                 },
                 "next_view_vote_mk_individual": {
                     "resource-name": "view_vote_mk_individual",
                     "mode": "rewrite"
                 },
                 "next_view_vote_rslts_hdr_approved": {
                     "resource-name": "view_vote_rslts_hdr_approved",
                     "mode": "rewrite"
                 },
                 "next_vote_result_type": {
                     "resource-name": "vote_result_type",
                     "mode": "rewrite"
                 },
                 "next_vote_rslts_kmmbr_shadow": {
                     "resource-name": "vote_rslts_kmmbr_shadow",
                     "mode": "rewrite"
                 },
                 "next_kns_committee": {
                     "resource-name": "kns_committee",
                     "mode": "rewrite"
                 },
                 "next_mk_individual": {
                     "resource-name": "mk_individual",
                     "mode": "rewrite"
                 },
                 "next_mk_attendance": {
                     "resource-name": "mk_attendance",
                     "mode": "rewrite"
                 },
             }
         })
     ]
     yield os.path.join(base, pipeline_id), {
         'pipeline': steps(*pipeline_steps),
         'schedule': {
             'crontab': '10 1 * * *'
         }
     }
Example #16
 def handle_issue(cls, issue, issue_policy):
     pipeline_id_format = issue_policy.get('pipeline-id-format', 'issue/{issue-id:03}_{title-slug}')
     pipeline_steps = steps(['github.waiting-for-implementation'])
     yield pipeline_id_format, pipeline_steps
Example #17
 def history_steps(cls, resource_name, primary_key, fields, history_key=None):
     assert len(set(primary_key).intersection(set(fields))) == 0
     if history_key is None:
         history_key = '_'.join(sorted(fields))
     db_table = 'history_{}_{}'.format(resource_name, history_key).replace('-', '_')
     target_resource_name = db_table
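      # In outline: duplicate the source resource, keep only the key and tracked
      # fields, timestamp each row, reduce to the last value per key/timestamp,
      # filter against db_table, then append via dump.to_sql in 'update' mode and
      # drop the temporary resource.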
     return steps(*[
         ('duplicate', {
             'source': resource_name,
             'target-name': target_resource_name,
             'target-path': PATH_PLACEHOLDER
         }),
         ('concatenate', {
             'target': {
                 'name': target_resource_name,
                 'path': PATH_PLACEHOLDER
             },
             'sources': target_resource_name,
             'fields': dict((f, []) for f in primary_key + fields) 
         }),
         ('add_timestamp', {
             'resource': target_resource_name
         }),
         ('join', {
             'source': {
                 'name': target_resource_name,
                 'key': primary_key + ['__updated_timestamp'],
                 'delete': True
             },
             'target': {
                 'name': target_resource_name,
                 'key': None
             },
             'fields': dict(
                 (f, {
                     'aggregate': 'last'
                 } if f in fields else None)
                 for f in primary_key + ['__updated_timestamp'] + fields
             )
         }), 
         ('filter_updated_items', {
             'db_table': db_table,
             'resource': target_resource_name,
             'key_fields': primary_key,
             'value_fields': fields
         }),
         ('set_primary_key', {
             target_resource_name: primary_key + ['__updated_timestamp']
         }),
         ('dump.to_sql', {
             'tables': {
                 db_table: {
                     'resource-name': target_resource_name,
                     'mode': 'update'
                 }
             }
         }),
         ('drop_resource', {
             'resource': target_resource_name
         })
     ])
Example #18
    def generate_pipeline(cls, source):
        title = source['title']
        dataset_name = source.get('dataset-name', title)
        dataset_name = slugify(dataset_name).lower()
        pipeline_id = dataset_name
        resource_name = source.get('resource-name', dataset_name)

        for data_source in source['sources']:
            if data_source['url'].endswith('.csv'):
                data_source['mediatype'] = 'text/csv'
            if 'name' not in data_source:
                data_source['name'] = slugify(
                    os.path.basename(data_source['url'])
                )

        model_params = {
            'options': dict(
                (f['header'], f['options'])
                for f in source['fields']
                if 'options' in f
            ),
            'os-types': dict(
                (f['header'], f['osType'])
                for f in source['fields']
            ),
            'titles': dict(
                (f['header'], f['title'])
                for f in source['fields']
                if 'title' in f
            ),
        }
        extra_measures = []
        measure_handling = []
        if 'measures' in source:
            measures = source['measures']
            normalise_measures = ('fiscal.normalise_measures', {
                'measures': measures['mapping']
            })
            if 'title' in measures:
                normalise_measures[1]['title'] = measures['title']
            measure_handling.append(normalise_measures)
            model_params['os-types']['value'] = 'value'
            model_params['options']['value'] = {
                'currency': measures['currency']
            }
            extra_measures = [
                (measure, [])
                for measure in source['measures']['mapping'].keys()
            ]
            if 'currency-conversion' in measures:
                currency_conversion = measures['currency-conversion']
                date_measure = currency_conversion.get('date_measure')
                if date_measure is None:
                    date_measure = [
                        f['header']
                        for f in source['fields']
                        if f.get('osType', '').startswith('date:')
                    ][0]
                currencies = measures.get('currencies', ['USD'])
                normalise_currencies = ('fiscal.normalise_currencies', {
                    'measures': ['value'],
                    'date-field': date_measure,
                    'to-currencies': currencies,
                    'from-currency': measures['currency']
                })
                if 'title' in currency_conversion:
                    normalise_currencies[1]['title'] = measures['title']
                measure_handling.append(normalise_currencies)
                for currency in currencies:
                    measure_name = 'value_{}'.format(currency)
                    model_params['os-types'][measure_name] = 'value'
                    model_params['options'][measure_name] = {
                        'currency': currency
                    }

        deduplicate_lines = source.get('deduplicate') is True
        deduplicate_steps = []
        if deduplicate_lines:
            deduplicate_steps.append((
                'set_types',
                {
                    'types': dict(
                        (f['header'],
                         dict(
                            type='number',
                            **f.get('options', {})
                         )
                        )
                        for f in source['fields']
                        if f['osType'] == 'value'
                    )
                }
            ))
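            # group rows that share every non-value field and sum their value columns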
            deduplicate_steps.append((
                'join',
                {
                    'source': {
                        'name': resource_name,
                        'key': [
                            f['header']
                            for f in source['fields']
                            if f['osType'] != 'value'
                        ],
                        'delete': True
                    },
                    'target': {
                        'name': resource_name,
                        'key': None
                    },
                    'fields': dict(
                        (f['header'],
                         {
                             'name': f['header'],
                             'aggregate': 'any' if f['osType'] != 'value' else 'sum'
                         })
                        for f in source['fields']
                    )
                }
            ))


        partial_output_file = '{}.fdp.partial.zip'.format(pipeline_id)
        output_file = '{}.fdp.zip'.format(pipeline_id)
        pipeline_steps = [
            (
                'add_metadata',
                {
                   'title': title,
                   'name': dataset_name,
                }
            )
        ] + [
            ('add_resource', source)
            for source in source['sources']
        ] + [
            ('stream_remote_resources', {}, True),
            ('concatenate', {
                'target': {
                    'name': resource_name
                },
                'fields': dict(
                    [
                        (f['header'], f.get('aliases', []))
                        for f in source['fields']
                    ] + extra_measures
                )
            }),
        ] + deduplicate_steps + [
            (step['processor'], step.get('parameters', {}))
            for step in source.get('postprocessing', [])
        ] + measure_handling + [
            ('fiscal.model', model_params),
            ('dump.to_zip', {
                'out-file': partial_output_file,
            }),
            ('fiscal.split_resource_per_fiscal_year_and_dump_to_zip', {
                'in-file': partial_output_file,
                'out-file': output_file,
            }),
            ('fiscal.upload', {
                'in-file': output_file,
                'publish': True
            }),
        ]

        pipeline_details = {
            'pipeline': steps(*pipeline_steps),
        }
        yield pipeline_id, pipeline_details
Example #19
    def generate_pipeline(cls, source):
        pipeline_id = dataset_name = "estadisticasjudiciales"

        resources = []

        # find CSV files
        files = get_files("/mnt/datackan/provincias/", "csv")
        for f in files:
            obj = {
                "name": f["table"],
                "url": f["filename"],
                "format": "csv",
                "headers": 1
            }
            #
            # logging.info("len(resources)")
            # logging.info(len(resources))

            if len(resources) < 1:
                objlist = obj, True
            else:
                objlist = [obj]

            r = ["add_resource"]
            r += objlist
            resources += [r]

        logging.info("resources")
        logging.info(tuple(resources))

        pipeline_steps = steps(*[
            ("add_metadata", {
                "processed_by": "datapackage_pipelines_estadisticasjudiciales"
            }),
            *resources,  # unpack the accumulated add_resource steps
            #
            # ['add_resource',{'name': 'table', 'url': '/mnt/datackan/provincias/ARG-03-MPF/Dependencias.csv', 'format': 'csv', 'headers': 1}, True],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-03-MPF/Listado1(2016).csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-03-MPF/Listado1.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-03-MPF/Listado2(2016).csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-03-MPF/Listado2.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-03-MPF/TiposRoles.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-09-MPF/prueba unificacion/ARG-09-MPF-listado1.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-09-MPF/prueba unificacion/ARG-09-MPF-Listado1_1.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-09-MPF/prueba unificacion/ARG-09-MPF-Listado1_2.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-09-MPF/prueba unificacion/ARG-09-MPF-listado1_p1.csv', 'format': 'csv', 'headers': 1}],
            # ['add_resource', {'name': 'table', 'url': '/mnt/datackan/provincias/ARG-09-MPF/prueba unificacion/ARG-09-MPF-listado1_p2.csv', 'format': 'csv', 'headers': 1}]
            # ,
            ("stream_remote_resources", {
                "cache": True
            }),
            # dump to mysql
            # run tests
            ("dump.to_path", {
                "out-path": "testpath"
            }),
            # ("dump.to_mysql", {
            #     "out-path": "testpath"
            # }),
        ])

        pipeline_details = {
            "pipeline": pipeline_steps,
            "schedule": {
                "crontab": SCHEDULE_MONTHLY
            }
        }
        logging.info("pipeline_steps")
        logging.info(pipeline_steps)
        yield pipeline_id, pipeline_details
Example #20
    def generate_pipeline(cls, source, base):
        all_pipelines = []
        sitemap_params = []
        bumper = source.get('bumper', 0)
        today = datetime.date.today()
        weeks_bump = (today - REF_DATE).days // 7
        bumper += weeks_bump
        for doc_type, parameters in source.items():
            if not isinstance(parameters, dict):
                continue
            if 'kind' not in parameters:
                continue
            if parameters['kind'] == 'indexer':
                snake_doc_type = doc_type.replace('-', '_')
                dependent_pipeline_id = parameters['dependent_pipeline']
                source_datapackage = parameters['source_datapackage']
                if os.environ.get("ES_LOAD_FROM_URL") == "1":
                    # this allows populating Elasticsearch data without running the dependent pipelines
                    source_datapackage = source_datapackage.replace("/var/datapackages", "http://next.obudget.org/datapackages")
                key_fields = parameters.get('key-fields', [])
                page_title_pattern = parameters.get('page-title-pattern')
                key_pattern = '/'.join([doc_type] + ['{%s}' % f for f in key_fields])
                key_pattern = parameters.get('key-pattern', key_pattern)
                pipeline_id = os.path.join(base, 'index_{}'.format(snake_doc_type))
                db_table = '_elasticsearch_mirror__{}'.format(snake_doc_type)
                revision = parameters.get('revision', 0) + bumper

                if doc_type != 'people':
                    all_pipelines.append(pipeline_id)
                    sitemap_params.append({
                        'kind': doc_type,
                        'db-table': db_table,
                        'doc-id': key_pattern,
                        'page-title': page_title_pattern
                    })

                keep_history = parameters.get('keep-history', [])
                history_steps = []
                for kh in keep_history:
                    history_steps.extend(
                        cls.history_steps(doc_type, key_fields, kh['fields'], kh.get('key'))
                    )
                date_range_parameters = parameters.get('date-range', {})
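                # pipeline outline (steps below): load the source datapackage, manage
                # revisions in a SQL mirror table, keep only rows due for update, index
                # them into the 'budgetkey' Elasticsearch index, dump a datapackage
                # copy, then index a key/value 'document' view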

                pipeline_steps = steps(*[
                    ('update_package', {
                        'name': pipeline_id,
                    }),
                    ('load_big', {
                        'from': source_datapackage,
                        'resource': doc_type,
                    })]) + parameters.get('extra-steps', []) + steps(*[
                    ('set-revision', {'revision': revision}),
                    ('manage-revisions', {
                        'resource-name': doc_type,
                        'db-table': db_table,
                        'key-fields': key_fields
                    }),
                    ('dump.to_sql', {
                        'tables': {
                            db_table: {
                                'resource-name': doc_type,
                                'mode': 'update'
                            }
                        }
                    }),
                    ('set-revisions', {}),
                    ('filter', {
                        'resources': doc_type,
                        'in': [
                            {'__next_update_days': 1},
                            # {'__next_update_days': 2},
                        ]
                    }),
                ]) + history_steps + steps(*[
                    ('add_doc_id', {
                        'doc-id-pattern': key_pattern
                    }),
                    ('add_page_title', {
                        'page-title-pattern': page_title_pattern
                    }),
                    ('add_date_range', date_range_parameters),
                    ('dump_to_es', {
                        'indexes': {
                            'budgetkey': [
                                {'resource-name': doc_type,
                                 'doc-type': doc_type,
                                 'revision': revision}
                            ]
                        }
                    }),
                    ('dpdumper', {
                        'out-path': '/var/datapackages/budgetkey/{}'.format(doc_type)
                    })                    
                ]) + parameters.get('document-steps', []) + steps(*[                   
                    'convert_to_key_value',
                    ('dump_to_es', {
                        'indexes': {
                            'budgetkey': [
                                {'resource-name': 'document',
                                 'doc-type': 'document'}
                            ]
                        }
                    }),
                ])

                if os.environ.get("ES_LIMIT_ROWS"):
                    dump_to_sql_indices = [i for i, s in enumerate(pipeline_steps) if s.get("run") == "dump.to_sql"]
                    assert len(dump_to_sql_indices) > 0
                    pipeline_steps.insert(
                        dump_to_sql_indices[0],
                        {"run": "limit_rows", "parameters": {"stop-after-rows": int(os.environ.get("ES_LIMIT_ROWS"))}}
                    )

                pipeline = {
                    'dependencies': [
                        {'pipeline': dependent_pipeline_id}
                    ],
                    'pipeline': pipeline_steps
                }
                if os.environ.get("ES_LOAD_FROM_URL") == "1":
                    del pipeline["dependencies"]
                yield pipeline_id, pipeline
        
        sitemaps_pipeline = {
            'dependencies': [
                {'pipeline': pipeline_id}
                for pipeline_id in all_pipelines
            ],
            'pipeline': steps(*[
                ('build_sitemaps', params)
                for params in sitemap_params
            ] + [
                ('build_sitemaps_index', {})
            ])
        }
        yield os.path.join(base, 'sitemaps'), sitemaps_pipeline
Example #21
def _plan(revision, spec, **config):
    """Plan a flow according to spec"""
    meta = spec['meta']

    flow_id = '{ownerid}/{dataset}/{revision}'.format(**meta,
                                                      revision=revision)
    dataset_id = '{ownerid}/{dataset}'.format(**meta)

    ownerid = meta['ownerid']
    dataset = meta['dataset']
    owner = meta.get('owner')

    findability = meta.get('findability', 'published')
    acl = 'public-read'
    if findability == 'private':
        acl = 'private'

    update_time = meta.get('update_time')
    create_time = meta.get('create_time')

    inputs = spec.get('inputs', [])
    assert len(inputs) == 1, 'Only supporting one input atm'

    input = inputs[0]
    assert input[
        'kind'] == 'datapackage', 'Only supporting datapackage inputs atm'

    inner_pipeline_ids = []

    outputs = spec.get('outputs', [])
    zip_there = any(output['kind'] == 'zip' for output in outputs)
    if not zip_there:
        zip_output = {
            'kind': 'zip',
            'parameters': {
                'out-file': '%s.zip' % (meta['dataset'])
            }
        }
        outputs.append(zip_output)

    datahub_step = ('assembler.update_metadata', {
        'ownerid': ownerid,
        'owner': owner,
        'findability': findability,
        'flowid': flow_id,
        'modified': update_time,
        'created': create_time,
        'id': dataset_id
    })

    def planner_pipelines():
        planner_gen = planner(input, flow_id, spec.get('processing', []),
                              outputs, **config)
        inner_pipeline_id = None
        while True:
            inner_pipeline_id, pipeline_steps, dependencies, title, content_type = planner_gen.send(
                inner_pipeline_id)
            inner_pipeline_ids.append(inner_pipeline_id)

            pid_without_revision = inner_pipeline_id.replace(
                '/{}/'.format(revision), '/')

            pipeline_steps.insert(0, datahub_step)
            pipeline_steps.extend(
                dump_steps(pid_without_revision, content_type=content_type))
            dependencies = [dict(pipeline='./' + d) for d in dependencies]

            pipeline = {
                'pipeline': steps(*pipeline_steps),
                'dependencies': dependencies,
                'title': title
            }
            yield inner_pipeline_id, pipeline
            inner_pipeline_id = 'dependency://./' + inner_pipeline_id

    yield from planner_pipelines()

    dependencies = [dict(pipeline='./' + pid) for pid in inner_pipeline_ids]
    datapackage_descriptor = input['parameters']['descriptor']
    final_steps = [
        ('add_metadata',
         dict((k, v) for k, v in datapackage_descriptor.items()
              if k != 'resources')),
        datahub_step,
        ('assembler.load_modified_resources', {
            'urls': dependencies
        }),
    ]
    final_steps.extend(
        dump_steps(flow_id, content_type='application/json', final=True))
    if not os.environ.get('PLANNER_LOCAL'):
        final_steps.append(('aws.change_acl', {
            'bucket': os.environ['PKGSTORE_BUCKET'],
            'path': '{}/{}'.format(ownerid, dataset),
            'acl': acl
        }))
    pipeline = {
        'update_time': update_time,
        'dependencies': dependencies,
        'pipeline': steps(*final_steps),
        'title': 'Creating Package'
    }
    # print('yielding', pipeline_id(), pipeline)
    yield flow_id, pipeline
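
Finally, a minimal sketch of how the (pipeline_id, pipeline_details) pairs yielded by generators like _plan above are typically collected into a pipeline-spec mapping; collect_pipelines and toy_generator are illustrative names, not part of the examples:

def collect_pipelines(generator, *args, **kwargs):
    # Build a {pipeline_id: pipeline_details} mapping from a generator's output.
    return dict(generator(*args, **kwargs))

def toy_generator(source):
    # Stands in for any of the generate_pipeline variants shown above.
    yield 'example/pipeline', {'pipeline': [{'run': 'dump.to_path',
                                             'parameters': {'out-path': 'data/example'}}]}

print(collect_pipelines(toy_generator, {'name': 'example'}))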