Example #1
def __(pipeline: pipelines.Pipeline):
    return bootstrap.card(
        header_left='Nodes',
        header_right=[
            bootstrap.button(
                id='run-with-upstreams-button',
                label='Run with upstreams',
                icon='play',
                url=flask.url_for('data_integration.run_page',
                                  path=pipeline.url_path(),
                                  with_upstreams=True),
                title=
                f'Run selected nodes with all their upstreams in pipeline "{pipeline.id}"'
            ), '    ',
            bootstrap.button(id='run-button',
                             label='Run ',
                             icon='play',
                             url=flask.url_for('data_integration.run_page',
                                               path=pipeline.url_path(),
                                               with_upstreams=False),
                             title='Run selected nodes')
        ] if config.allow_run_from_web_ui() else [],
        body=html.asynchronous_content(
            url=flask.url_for('data_integration.pipeline_children_table',
                              path=pipeline.url_path())))
Example #2
def bc_reg_pipeline_bn_credential_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_bn_loader',
        description=
        'A pipeline that creates BN credentials for all existing corporations.'
    )

    sub_pipeline1_2 = Pipeline(
        id='load_existing_corps_no_bn',
        description='Load BC Reg corps with no BN credential')
    sub_pipeline1_2.add(
        Task(id='register_un_bned_corps',
             description='Register corps with no BN',
             commands=[ExecutePython('./bcreg/find-un-bned-corps.py')]))
    sub_pipeline1_2.add(
        Task(id='load_corp_bn_data',
             description='Load BN credentials from company data',
             commands=[
                 ExecutePython('./bcreg/process-corps-generate-bn-creds.py')
             ]), ['register_un_bned_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1
Example #3
    def run_pipeline_and_notify(pipeline: pipelines.Pipeline,
                                nodes: {pipelines.Node} = None):
        # requests, os and sys are used in several branches below
        import requests, os, sys

        if config.slack_token():
            message = (
                ':hatching_chick: *' +
                (os.environ.get('SUDO_USER') or os.environ.get('USER')
                 or os.getlogin()) + '* manually triggered run of ' +
                ('pipeline <' + config.base_url() + '/' +
                 '/'.join(pipeline.path()) + '|' + '/'.join(pipeline.path()) +
                 ' >' if pipeline.parent else 'root pipeline'))

            if nodes:
                message += ', nodes ' + ', '.join(
                    [f'`{node.id}`' for node in nodes])

            requests.post('https://hooks.slack.com/services/' +
                          config.slack_token(),
                          json={'text': message})

        if not run_pipeline(pipeline, nodes):
            if config.slack_token():
                requests.post('https://hooks.slack.com/services/' +
                              config.slack_token(),
                              json={'text': ':baby_chick: failed'})
            sys.exit(-1)

        if config.slack_token():
            requests.post('https://hooks.slack.com/services/' +
                          config.slack_token(),
                          json={'text': ':hatched_chick: succeeded'})
Example #4
def bc_reg_pipeline_initial_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_corp_loader',
        description=
        'A pipeline that does the initial data load and credentials for all corporations.'
    )

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_corps',
        description='Load Active BC Reg corps and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_corps',
             description='Register un-processed active corps',
             commands=[
                 ExecutePython('./bcreg/find-unprocessed-corps_actve.py')
             ]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_a',
             description='Load BC Registries data',
             commands=[
                 ExecutePython('./bcreg/process-corps-generate-creds.py')
             ]), ['register_un_processed_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1
def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')

    pipeline.add(Task(id='display_pipeline_status', description='Display status of the overall pipeline processing status',
                        commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))

    return pipeline
def bc_reg_populate_audit_table():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_populate_audit_table',
        description='Populate Audit Table.')

    pipeline.add(Task(id='populate_audit_table', description='Populate audit table.',
                        commands=[ExecutePython('./bcreg/populate_audit_table.py')]))

    return pipeline
def bc_init_test_data():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_data',
        description='A pipeline that initializes event processor database for testing.')

    pipeline.add(Task(id='register_test_corps', description='Insert some test data for processing',
                        commands=[ExecutePython('./bcreg/insert-test.py')]))

    return pipeline
def bc_reg_test_corps():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_corps',
        description='A pipeline that queues up a small set of test corporations.')

    pipeline.add(Task(id='register_test_corps', description='Register some test corps for processing',
                        commands=[ExecutePython('./bcreg/find-test-corps.py')]))

    return pipeline
Example #9
def von_list_mongo_data():
    import von_pipeline

    pipeline = Pipeline(id='von_list_mongo_data',
                        description='A pipeline that lists data in mongodb.')

    pipeline.add(
        Task(id='list_mongo_data',
             description='List data queued for processing',
             commands=[ExecutePython('./von_pipeline/list_mongo_data.py')]))

    return pipeline
Example #10
def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description=
        "Loads daily Euro exchange rates since 1999 from the European central bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql',
                            echo_queries=False)
             ]))

    pipeline.add(Task(
        id='load_exchange_rate',
        description='Loads exchange rates from the European central bank',
        commands=[
            ReadScriptOutput(file_name='load_exchange_rate.py',
                             target_table='euro_fx.exchange_rate',
                             db_alias='mdwh-etl')
        ]),
                 upstreams=['create_schema_and_table'])

    pipeline.add(Task(id="postprocess_exchange_rate",
                      description="Adds values for missing days",
                      commands=[
                          ExecuteSQL(
                              sql_file_name='postprocess_exchange_rate.sql',
                              echo_queries=False)
                      ]),
                 upstreams=['load_exchange_rate'])

    return pipeline
Example #11
def von_data_pipeline_status():
    import von_pipeline

    pipeline = Pipeline(id='von_data_pipeline_status',
                        description='Display overall event processing status.')

    pipeline.add(
        Task(id='display_pipeline_status',
             description=
             'Display status of the overall pipeline processing status',
             commands=[
                 ExecutePython('./von_pipeline/display_pipeline_status.py')
             ]))

    return pipeline
Example #12
def __(pipeline: pipelines.Pipeline):
    return [
        response.ActionButton(action=flask.url_for('data_integration.run_page',
                                                   path=pipeline.url_path(),
                                                   with_upstreams=False),
                              label='Run',
                              icon='play',
                              title='Run the pipeline')
    ]
def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials_a', description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(Task(id='submit_credentials_a', description='Submit credentials',
                          commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1
Example #14
def root_pipeline():
    import app.data_integration.pipelines.github
    import app.data_integration.pipelines.pypi
    import app.data_integration.pipelines.utils
    import app.data_integration.pipelines.python_projects

    pipeline = Pipeline(
        id='mara_example_project',
        description=
        'An example pipeline that integrates PyPI download stats with the Github activity of a project'
    )

    pipeline.add(app.data_integration.pipelines.utils.pipeline)
    pipeline.add(app.data_integration.pipelines.pypi.pipeline,
                 upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.github.pipeline,
                 upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.python_projects.pipeline,
                 upstreams=['pypi', 'github'])
    return pipeline
def add_schema_copying_to_pipeline(pipeline: Pipeline,
                                   schema_name,
                                   source_db_alias: str,
                                   target_db_alias: str,
                                   max_number_of_parallel_tasks: int = 4):
    """
    Adds schema copying to the end of a pipeline.

    When the pipeline already has a final node, then all of its commands except the last are run before the copying,
    and the last command after it.

    Args:
        pipeline: The pipeline to modify
        schema_name: The schema to copy
        source_db_alias: The alias of the PostgreSQL database to copy from
        target_db_alias: The alias of the PostgreSQL database to copy to
        max_number_of_parallel_tasks: How many operations to run in parallel at most.
    """
    task_id = "copy_schema"
    description = f"Copies the {schema_name} schema to the {target_db_alias} db"
    commands = []
    if pipeline.final_node:
        assert (isinstance(pipeline.final_node, Task))
        description = pipeline.final_node.description + ' + ' + description
        task_id = pipeline.final_node.id + '_and_' + task_id
        commands = pipeline.final_node.commands
        pipeline.remove(pipeline.final_node)

    pipeline.add_final(
        ParallelCopySchema(
            id=task_id,
            description=description,
            schema_name=schema_name,
            source_db_alias=source_db_alias,
            target_db_alias=target_db_alias,
            max_number_of_parallel_tasks=max_number_of_parallel_tasks,
            commands_before=commands[:-1],
            commands_after=commands[-1:]))
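A minimal usage sketch for the helper above, assuming a pipeline object `my_pipeline` and database aliases `'dwh'` and `'frontend'` that are registered in `mara_db.config.databases` (the pipeline name, schema name and aliases are illustrative, not taken from the examples in this listing):

# Hypothetical call: copy 'my_schema' to the frontend db after all other nodes of the pipeline have run.
add_schema_copying_to_pipeline(my_pipeline,
                               schema_name='my_schema',
                               source_db_alias='dwh',
                               target_db_alias='frontend',
                               max_number_of_parallel_tasks=4)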
Example #16
def von_data_pipeline_post_credentials():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_a',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1
def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials_a', description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(Task(id='submit_credentials_a', description='Submit credentials',
                          commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    # sub_pipeline1_4 = Pipeline(id='populate_evp_audit_table_a', description='Populate Event Processor Audit Table')
    # sub_pipeline1_4.add(Task(id='populate_audit_table_a', description='Populate Audit Table',
    #                       commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    # pipeline1.add(sub_pipeline1_4, ['submit_bc_reg_credentials_a'])

    return pipeline1
def db_init_pipeline():
    import bcreg

    pipeline = Pipeline(
      id = 'bc_reg_db_init',
      description = 'Initialize BC Registries Event Processor database')

    pipeline.add(Task(id='create_tables', description='Create event processing tables',
                        commands=[ExecutePython('./bcreg/create.py')]))
    pipeline.add(Task(id='initialize_tables', description='Insert configuration data',
                        commands=[ExecutePython('./bcreg/insert.py')]), ['create_tables'])

    return pipeline
Example #19
def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description=
        "Creates an utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(
                     sql_statement=
                     "DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql',
                            echo_queries=False,
                            replace={
                                'number_of_chunks':
                                lambda: config.number_of_chunks()
                            })
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension +
                                '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in [
            'consistency_checks', 'data_sets', 'partitioning',
            'indexes_and_constraints', 'schema_switching', 'enums'
    ]:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')

    pipeline.add(Task(id='display_pipeline_status', description='Display status of the overall pipeline processing status',
                        commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))
    # remove these from the pipeline due to issues connecting to DB's on openshift
    #pipeline.add(Task(id='display_pipeline_stats', description='Display stats of each stage in the pipeline processing',
    #                    commands=[ExecutePython('./bcreg/display_processed_corps_counts.py')]))
    pipeline.add(Task(id='display_event_processor_stats', description='Display stats of each event processor stage',
                        commands=[ExecutePython('./bcreg/display_event_processor_counts.py')]))

    return pipeline
Example #21
def db_init_pipeline():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_db_init',
        description='Initialize von_data Event Processor database')

    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./von_pipeline/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./von_pipeline/insert.py')]),
        ['create_tables'])

    return pipeline
Example #22
def von_root_pipeline():

    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description=
        'Holder for the different versions of the VON Data Pipeline.')

    parent_pipeline.add(von_data_pipeline())
    parent_pipeline.add(von_data_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')

    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(von_data_pipeline_initial_load())
    init_pipeline.add(von_data_pipeline_post_credentials())

    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(id='test_and_demo_tasks',
                             description='Holder for test and demo tasks.')

    test_pipeline.add(von_data_init_test_data())
    test_pipeline.add(von_data_test_registrations())
    test_pipeline.add(von_data_pipeline_single_thread())

    parent_pipeline.add(test_pipeline)

    return parent_pipeline
Example #23
def bc_reg_pipeline():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_event_processor',
        description=
        'A pipeline that processes BC Registries events and generates credentials.'
    )

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_data',
        description='Load BC Reg data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps.py')]),
        ['register_un_processed_events'])
    sub_pipeline1_2.add(
        Task(id='create_bc_reg_credentials',
             description='Create credentials',
             commands=[ExecutePython('./bcreg/generate-creds.py')]),
        ['load_bc_reg_data'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_bc_reg_data'])

    return pipeline1
Example #24
def bc_reg_pipeline_jsonbender():
    import bcreg

    pipeline2 = Pipeline(
        id='bc_reg_event_processor_json_transform_demo',
        description=
        'A demo pipeline that processes events and generates credentials using JSONBender.'
    )

    sub_pipeline2_2 = Pipeline(
        id='load_and_process_bc_reg_data',
        description='Load BC Reg data and generate credentials')
    sub_pipeline2_2.add(
        Task(id='register_un_processed_events',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline2_2.add(
        Task(id='load_bc_reg_data',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps.py')]),
        ['register_un_processed_events'])
    sub_pipeline2_2.add(
        Task(id='create_credentials_jsonbender',
             description='Create credentials using JSONBender transform',
             commands=[ExecutePython('./bcreg/generate-creds-bender.py')]),
        ['load_bc_reg_data'])
    pipeline2.add(sub_pipeline2_2)

    sub_pipeline2_3 = Pipeline(id='submit_bc_reg_credentials',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline2_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline2.add(sub_pipeline2_3, ['load_and_process_bc_reg_data'])

    return pipeline2
Example #25
def run_pipeline(pipeline: pipelines.Pipeline,
                 nodes: {pipelines.Node} = None,
                 with_upstreams: bool = False) -> [events.Event]:
    """
    Runs a pipeline in a forked sub process. Acts as a generator that yields events from the sub process.

    Using forking has two advantages:
    1. The pipeline is also forked and thus can be modified without affecting the original pipeline.
    2. It's possible to hand over control to the parent process while the pipeline is running, for example
       for sending output to a browser.

    Args:
        pipeline: The pipeline to run
        nodes: A set of pipeline children that should run
        with_upstreams: When true and `nodes` are provided, then all upstreams of `nodes` in `pipeline` are also run
    Yields:
        Events emitted during pipeline execution
    """
    # A queue for receiving events from forked sub processes
    event_queue = multiprocessing.Queue()

    # The function that is run in a sub process
    def run():

        # collect system stats in a separate Process
        statistics_process = multiprocessing.Process(
            target=lambda: system_statistics.generate_system_statistics(
                event_queue),
            name='system_statistics')
        statistics_process.start()

        try:
            # capture output of print statements and other unplanned output
            logger.redirect_output(event_queue, pipeline.path())

            # all nodes that have not run yet, ordered by priority
            node_queue: [pipelines.Node] = []

            # data needed for computing cost
            node_durations_and_run_times = node_cost.node_durations_and_run_times(
                pipeline.path())

            # Putting nodes into the node queue
            def queue(nodes: [pipelines.Node]):
                for node in nodes:
                    node_cost.compute_cost(node, node_durations_and_run_times)
                    node_queue.append(node)
                node_queue.sort(key=lambda node: node.cost, reverse=True)

            if nodes:  # only run a set of child nodes

                def with_all_upstreams(nodes: {pipelines.Node}):
                    """recursively find all upstreams of a list of nodes"""
                    return functools.reduce(
                        set.union,
                        [with_all_upstreams(node.upstreams)
                         for node in nodes], nodes)

                # when requested, include all upstreams of nodes, otherwise just use provided nodes
                nodes_to_run = with_all_upstreams(
                    set(nodes)) if with_upstreams else set(nodes)

                # remove everything from pipeline that should not be run
                # (that makes updating dependencies between nodes easier)
                for node in set(pipeline.nodes.values()) - nodes_to_run:
                    pipeline.remove(node)

                # queue remaining nodes
                queue(list((pipeline.nodes).values()))

            else:
                # remove dependencies to siblings
                pipeline.upstreams = set()
                pipeline.downstreams = set()
                # queue whole pipeline
                queue([pipeline])

            # book keeping
            run_start_time = datetime.datetime.now()
            # all nodes that already ran or that won't be run anymore
            processed_nodes: {pipelines.Node} = set()
            # running pipelines with start times and number of running children
            running_pipelines: {pipelines.Pipeline: [datetime.datetime, int]} = {}
            failed_pipelines: {pipelines.Pipeline} = set()  # pipelines with failed tasks
            running_task_processes: {pipelines.Task: TaskProcess} = {}

            def dequeue() -> pipelines.Node:
                """
                Finds the next runnable node in the queue:
                - without upstreams, or where all upstreams have already been run
                - whose parent pipeline has not yet reached its maximum number of parallel tasks
                """
                for node in node_queue:  # type: pipelines.Node
                    if ((not node.upstreams
                         or len(node.upstreams & processed_nodes) == len(
                             node.upstreams)) and
                        (not isinstance(node.parent, pipelines.Pipeline) or
                         (not node.parent.max_number_of_parallel_tasks) or
                         (not node.parent in running_pipelines) or
                         (running_pipelines[node.parent][1] <
                          node.parent.max_number_of_parallel_tasks))):
                        node_queue.remove(node)
                        if node.parent in failed_pipelines:
                            processed_nodes.add(
                                node
                            )  # if the parent pipeline failed, don't launch new nodes
                        else:
                            return node

            def track_finished_pipelines():
                """when all nodes of a pipeline have been processed, then emit events"""
                for running_pipeline, (start_time, running_children) \
                        in dict(running_pipelines).items():  # type: pipelines.Pipeline
                    if len(
                            set(running_pipeline.nodes.values())
                            & processed_nodes) == len(running_pipeline.nodes):
                        succeeded = running_pipeline not in failed_pipelines
                        event_queue.put(
                            events.Output(
                                node_path=running_pipeline.path(),
                                format=logger.Format.ITALICS,
                                is_error=not succeeded,
                                message=
                                f'{"succeeded" if succeeded else "failed"}, {logger.format_time_difference(run_start_time, datetime.datetime.now())}'
                            ))
                        event_queue.put(
                            events.NodeFinished(
                                node_path=running_pipeline.path(),
                                start_time=start_time,
                                end_time=datetime.datetime.now(),
                                is_pipeline=True,
                                succeeded=succeeded))
                        del running_pipelines[running_pipeline]
                        processed_nodes.add(running_pipeline)

            # announce run start
            event_queue.put(
                events.RunStarted(node_path=pipeline.path(),
                                  start_time=run_start_time,
                                  pid=os.getpid()))

            # run as long
            # - as task processes are still running
            # - as there is still stuff in the node queue
            while running_task_processes or node_queue:
                # don't do anything if the maximum number of parallel tasks is currently running
                if len(running_task_processes
                       ) < config.max_number_of_parallel_tasks():

                    next_node = dequeue(
                    )  # get the next runnable node from the queue

                    if next_node:
                        if isinstance(next_node, pipelines.Pipeline):
                            # connect pipeline nodes without upstreams to upstreams of pipeline
                            for upstream in next_node.upstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.upstreams:
                                        next_node.add_dependency(
                                            upstream, pipeline_node)

                            # connect pipeline nodes without downstreams to downstream of pipeline
                            for downstream in next_node.downstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.downstreams:
                                        next_node.add_dependency(
                                            pipeline_node, downstream)

                            # queue all child nodes
                            queue(list(next_node.nodes.values()))

                            # book keeping and event emission
                            pipeline_start_time = datetime.datetime.now()
                            running_pipelines[next_node] = [
                                pipeline_start_time, 0
                            ]
                            event_queue.put(
                                events.NodeStarted(next_node.path(),
                                                   pipeline_start_time, True))
                            event_queue.put(
                                events.Output(
                                    node_path=next_node.path(),
                                    format=logger.Format.ITALICS,
                                    message='★ ' + node_cost.format_duration(
                                        node_durations_and_run_times.get(
                                            tuple(next_node.path()),
                                            [0, 0])[0])))

                        elif isinstance(next_node, pipelines.ParallelTask):
                            # create sub tasks and queue them
                            try:
                                logger.redirect_output(event_queue,
                                                       next_node.path())
                                logger.log('☆ Launching tasks',
                                           format=logger.Format.ITALICS)
                                sub_pipeline = next_node.launch()
                                next_node.parent.replace(
                                    next_node, sub_pipeline)
                                queue([sub_pipeline])

                            except Exception as e:
                                logger.log(
                                    message='Could not launch parallel tasks',
                                    format=logger.Format.ITALICS,
                                    is_error=True)
                                logger.log(
                                    message=traceback.format_exc(),
                                    format=events.Output.Format.VERBATIM,
                                    is_error=True)
                                failed_pipelines.add(next_node.parent)
                            finally:
                                logger.redirect_output(event_queue,
                                                       pipeline.path())

                        else:
                            # run a task in a subprocess
                            if next_node.parent in running_pipelines:
                                running_pipelines[next_node.parent][1] += 1
                            event_queue.put(
                                events.NodeStarted(next_node.path(),
                                                   datetime.datetime.now(),
                                                   False))
                            event_queue.put(
                                events.Output(
                                    node_path=next_node.path(),
                                    format=logger.Format.ITALICS,
                                    message='★ ' + node_cost.format_duration(
                                        node_durations_and_run_times.get(
                                            tuple(next_node.path()),
                                            [0, 0])[0])))

                            status_queue = multiprocessing.Queue()
                            process = TaskProcess(next_node, event_queue,
                                                  status_queue)
                            process.start()
                            running_task_processes[next_node] = process

                # check whether some of the running processes finished
                for task_process in list(
                        running_task_processes.values()):  # type: TaskProcess
                    if not task_process.is_alive():
                        del running_task_processes[task_process.task]
                        if task_process.task.parent in running_pipelines:
                            running_pipelines[task_process.task.parent][1] -= 1

                        processed_nodes.add(task_process.task)

                        succeeded = not (task_process.status_queue.get()
                                         == False
                                         or task_process.exitcode != 0)
                        if not succeeded:
                            for parent in task_process.task.parents()[:-1]:
                                failed_pipelines.add(parent)

                        end_time = datetime.datetime.now()
                        event_queue.put(
                            events.Output(
                                task_process.task.path(),
                                ('succeeded' if succeeded else 'failed') +
                                ',  ' + logger.format_time_difference(
                                    task_process.start_time, end_time),
                                format=logger.Format.ITALICS,
                                is_error=not succeeded))
                        event_queue.put(
                            events.NodeFinished(task_process.task.path(),
                                                task_process.start_time,
                                                end_time, False, succeeded))

                # check if some pipelines finished
                track_finished_pipelines()

                # don't busy-wait
                time.sleep(0.001)

        except:
            event_queue.put(
                events.Output(node_path=pipeline.path(),
                              message=traceback.format_exc(),
                              format=logger.Format.ITALICS,
                              is_error=True))

        # run again because `dequeue` might have moved more nodes to `processed_nodes`
        track_finished_pipelines()

        # kill the stats process (joining or terminating does not work in gunicorn)
        os.kill(statistics_process.pid, signal.SIGKILL)
        statistics_process.join()

        # run finished
        event_queue.put(
            events.RunFinished(node_path=pipeline.path(),
                               end_time=datetime.datetime.now(),
                               succeeded=not failed_pipelines))

    # fork the process and run `run`
    run_process = multiprocessing.Process(target=run,
                                          name='pipeline-' +
                                          '-'.join(pipeline.path()))
    run_process.start()

    # todo: make event handlers configurable (e.g. for slack)
    event_handlers = [run_log.RunLogger()]

    if config.slack_token():
        event_handlers.append(slack.Slack())

    # process messages from forked child processes
    while True:
        try:
            while not event_queue.empty():
                event = event_queue.get(False)
                for event_handler in event_handlers:
                    event_handler.handle_event(event)
                yield event
        except queues.Empty:
            pass
        except:
            yield events.Output(node_path=pipeline.path(),
                                message=traceback.format_exc(),
                                format=logger.Format.ITALICS,
                                is_error=True)
        if not run_process.is_alive():
            break
        time.sleep(0.001)
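A minimal sketch of consuming the event stream that `run_pipeline` yields, assuming a pipeline object `my_pipeline` with a child node `'load'` and assuming that `events.NodeFinished` exposes its constructor arguments (`node_path`, `succeeded`, ...) as attributes; all of these names are illustrative:

# Hypothetical consumer: stream events and report failed nodes as they finish.
for event in run_pipeline(my_pipeline,
                          nodes={my_pipeline.nodes['load']},
                          with_upstreams=True):
    if isinstance(event, events.NodeFinished) and not event.succeeded:
        print('failed: ' + '/'.join(event.node_path))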
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        source_db = mara_db.dbs.db(self.source_db_alias)
        target_db = mara_db.dbs.db(self.target_db_alias)
        assert (isinstance(source_db, mara_db.dbs.PostgreSQLDB))
        assert (isinstance(target_db, mara_db.dbs.PostgreSQLDB))

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias) as cursor:
            pg_version = cursor.connection.server_version

        ddl_task = Task(
            id='create_tables_and_functions',
            description=
            'Re-creates the schema, tables structure and functions on the target db',
            commands=[
                # schema and table structure
                bash.RunBash(
                    command="(echo 'DROP SCHEMA IF EXISTS " +
                    self.schema_name + " CASCADE;';\\\n" +
                    "    pg_dump --username="******" --host=" +
                    source_db.host + " --schema=" + self.schema_name +
                    " --section=pre-data --no-owner --no-privileges " +
                    source_db.database + ") \\\n" + "  | " +
                    mara_db.shell.query_command(
                        self.target_db_alias, echo_queries=False) + ' --quiet'
                ),

                # function definitions
                bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid),';') AS def 
FROM (SELECT oid, * 
      FROM pg_proc p 
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc, pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
     AND nspname = '{self.schema_name}'" \\\n''' + "  | " +
                             mara_db.shell.copy_to_stdout_command(
                                 self.source_db_alias) + ' \\\n' + "  | " +
                             mara_db.shell.query_command(self.target_db_alias,
                                                         echo_queries=False))
            ])
        sub_pipeline.add(ddl_task)

        # copy content of tables
        number_of_chunks = self.max_number_of_parallel_tasks * 3
        table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
        current_size_per_table_copy_chunk = [0] * number_of_chunks
        table_types = {}

        with mara_db.postgresql.postgres_cursor_context(
                self.source_db_alias
        ) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(
                """
SELECT 
    pg_class.relname AS table,
    relkind,
    CASE WHEN relkind = 'f' 
         THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy 
         ELSE  pg_total_relation_size(pg_class.oid)
    END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name +
                """' AND relkind IN ('r', 'f') AND relhassubclass = 'f'
ORDER BY size DESC""")
            for table_name, type, size in cursor.fetchall():
                smallest_chunk_index = min(
                    range(len(current_size_per_table_copy_chunk)),
                    key=current_size_per_table_copy_chunk.__getitem__)
                current_size_per_table_copy_chunk[smallest_chunk_index] += size
                table_copy_chunks[smallest_chunk_index].append(table_name)
                table_types[table_name] = type

            copy_tasks = []
            for i, tables in table_copy_chunks.items():
                if tables:
                    task = Task(
                        id=f'copy_tables_{i}',
                        description='Copies table content to the frontend db',
                        commands=[
                            RunBash(
                                command=
                                f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                + '  | ' +
                                mara_db.shell.copy_to_stdout_command(
                                    self.source_db_alias) + ' \\\n' + '  | ' +
                                mara_db.shell.copy_from_stdin_command(
                                    self.target_db_alias,
                                    target_table=
                                    f'{self.schema_name}.{table_name}'))
                            for table_name in tables
                        ])
                    copy_tasks.append(task)
                    sub_pipeline.add(task, upstreams=[ddl_task])

            # create indexes
            index_chunks = {i: [] for i in range(0, number_of_chunks)}
            current_size_per_index_chunk = [0] * number_of_chunks

            with mara_db.postgresql.postgres_cursor_context(
                    self.source_db_alias) as cursor:
                cursor.execute(""" 
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
                for ddl, size in cursor.fetchall():
                    smallest_chunk_index = min(
                        range(len(current_size_per_index_chunk)),
                        key=current_size_per_index_chunk.__getitem__)
                    current_size_per_index_chunk[smallest_chunk_index] += size
                    index_chunks[smallest_chunk_index].append(ddl)

            for i, index_statements in index_chunks.items():
                if index_statements:
                    index_task = Task(
                        id=f'add_indexes_{i}',
                        description='Re-creates indexes on frontend db',
                        commands=[
                            ExecuteSQL(sql_statement=statement,
                                       db_alias=self.target_db_alias)
                            for statement in index_statements
                        ])
                    sub_pipeline.add(index_task, upstreams=copy_tasks)
def bc_reg_root_pipeline():
    import bcreg

    parent_pipeline = Pipeline(
        id = 'holder_for_pipeline_versions',
        description = 'Holder for the different versions of the BC Registries pipeline.')

    parent_pipeline.add(bc_reg_pipeline())
    parent_pipeline.add(bc_reg_pipeline_status())

    init_pipeline = Pipeline(
        id = 'initialization_and_load_tasks',
        description = 'One-time initialization and data load tasks')

    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(bc_reg_pipeline_initial_load())
    init_pipeline.add(bc_reg_pipeline_post_credentials())

    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(
        id = 'test_and_demo_tasks',
        description = 'Holder for test and demo tasks.')

    test_pipeline.add(bc_init_test_data())
    test_pipeline.add(bc_reg_test_corps())
    test_pipeline.add(bc_reg_pipeline_single_thread())
    test_pipeline.add(bc_reg_pipeline_jsonbender())

    parent_pipeline.add(test_pipeline)

    return parent_pipeline
Example #28
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

        ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL, 
    value     TEXT NOT NULL, 
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

        commands = []

        with mara_db.postgresql.postgres_cursor_context(self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
            cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT
      typname,
      nspname
    FROM pg_type
      JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
      JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
  )
SELECT column_name
FROM information_schema.columns
  LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
  WHERE table_schema = {'%s'}
      AND table_name = {'%s'}
      AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

            i = 0

            for column_name, in cursor.fetchall():
                i += 1
                ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
                commands.append(
                    ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i} 
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value 
   ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

        sub_pipeline.add_initial(
            Task(id='create_table', description='Creates the attributes table',
                 commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

        chunk_size = math.ceil(len(commands) / (2 * data_integration.config.max_number_of_parallel_tasks()))
        for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
            task = Task(id=str(n), description='Process a portion of the attributes')
            task.add_commands(chunk)
            sub_pipeline.add(task)
import os
from data_integration.commands.bash import RunBash
from data_integration.commands.python import ExecutePython
from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline, run_interactively
import mara_db.auto_migration
import mara_db.config
import mara_db.dbs
import data_integration
from bcreg.bcreg_pipelines import db_init_pipeline

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases \
    = lambda: {'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password, host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id = 'holder_for_pipeline_versions',
    description = 'Holder for the different versions of the BC Registries pipeline.')

parent_pipeline.add(db_init_pipeline())

run_pipeline(parent_pipeline)
Example #30
def von_data_pipeline_single_thread():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_pipeline_single_thread',
        description=
        'A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data_single_thread',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[
                 ExecutePython('./von_pipeline/find-unprocessed-events.py')
             ]))
    sub_pipeline1_2.add(
        Task(id='load_von_data_data_single_thread',
             description='Load von_data data',
             commands=[
                 ExecutePython('./von_pipeline/register_un_processed_events')
             ]), ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_single_thread',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[
                 ExecutePython('./von_pipeline/submit-creds-single-thread.py')
             ]))
    pipeline1.add(sub_pipeline1_3,
                  ['load_and_process_von_data_data_single_thread'])

    return pipeline1