def __(pipeline: pipelines.Pipeline):
    return bootstrap.card(
        header_left='Nodes',
        header_right=[
            bootstrap.button(
                id='run-with-upstreams-button',
                label='Run with upstreams',
                icon='play',
                url=flask.url_for('data_integration.run_page',
                                  path=pipeline.url_path(), with_upstreams=True),
                title=f'Run selected nodes with all their upstreams in pipeline "{pipeline.id}"'),
            ' ',
            bootstrap.button(
                id='run-button',
                label='Run ',
                icon='play',
                url=flask.url_for('data_integration.run_page',
                                  path=pipeline.url_path(), with_upstreams=False),
                title='Run selected nodes')
        ] if config.allow_run_from_web_ui() else [],
        body=html.asynchronous_content(
            url=flask.url_for('data_integration.pipeline_children_table',
                              path=pipeline.url_path())))
def bc_reg_pipeline_bn_credential_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_bn_loader',
        description='A pipeline that creates BN credentials for all existing corporations.')

    sub_pipeline1_2 = Pipeline(
        id='load_existing_corps_no_bn',
        description='Load BC Reg corps with no BN credential')
    sub_pipeline1_2.add(
        Task(id='register_un_bned_corps',
             description='Register corps with no BN',
             commands=[ExecutePython('./bcreg/find-un-bned-corps.py')]))
    sub_pipeline1_2.add(
        Task(id='load_corp_bn_data',
             description='Load BN credentials from company data',
             commands=[ExecutePython('./bcreg/process-corps-generate-bn-creds.py')]),
        ['register_un_bned_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1
def run_pipeline_and_notify(pipeline: pipelines.Pipeline, nodes: {pipelines.Node} = None):
    if config.slack_token():
        import requests, os, sys  # `sys` is needed below for sys.exit

        message = (':hatching_chick: *'
                   + (os.environ.get('SUDO_USER') or os.environ.get('USER') or os.getlogin())
                   + '* manually triggered run of '
                   + ('pipeline <' + config.base_url() + '/' + '/'.join(pipeline.path())
                      + '|' + '/'.join(pipeline.path()) + ' >'
                      if pipeline.parent else 'root pipeline'))
        if nodes:
            message += ', nodes ' + ', '.join([f'`{node.id}`' for node in nodes])

        requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                      json={'text': message})

        if not run_pipeline(pipeline, nodes):
            requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                          json={'text': ':baby_chick: failed'})
            sys.exit(-1)

        requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                      json={'text': ':hatched_chick: succeeded'})
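# Hedged usage sketch for the function above. Assumptions: the config functions can be
# replaced by plain attribute assignment in this setup (not necessarily the library's
# documented way of configuring things), and the token and URL are placeholders.
def notify_example(my_pipeline: pipelines.Pipeline):
    import data_integration.config as di_config

    di_config.slack_token = lambda: '<hypothetical/slack/webhook/token>'  # placeholder
    di_config.base_url = lambda: 'https://etl.example.com/data-integration'  # placeholder

    # posts a "run triggered" message, runs the pipeline, then posts the outcome
    run_pipeline_and_notify(my_pipeline)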
def bc_reg_pipeline_initial_load():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_corp_loader',
        description='A pipeline that does the initial data load and credentials for all corporations.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_corps',
        description='Load active BC Reg corps and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_corps',
             description='Register un-processed active corps',
             commands=[ExecutePython('./bcreg/find-unprocessed-corps_actve.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data_a',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps-generate-creds.py')]),
        ['register_un_processed_corps'])
    pipeline1.add(sub_pipeline1_2)

    return pipeline1
def bc_reg_populate_audit_table():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_populate_audit_table',
        description='Populate Audit Table.')

    pipeline.add(
        Task(id='populate_audit_table',
             description='Populate audit table.',
             commands=[ExecutePython('./bcreg/populate_audit_table.py')]))

    return pipeline
def bc_init_test_data():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_data',
        description='A pipeline that initializes the event processor database for testing.')

    pipeline.add(
        Task(id='register_test_corps',
             description='Insert some test data for processing',
             commands=[ExecutePython('./bcreg/insert-test.py')]))

    return pipeline
def bc_reg_test_corps():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_test_corps',
        description='A pipeline that queues up a small set of test corporations.')

    pipeline.add(
        Task(id='register_test_corps',
             description='Register some test corps for processing',
             commands=[ExecutePython('./bcreg/find-test-corps.py')]))

    return pipeline
def von_list_mongo_data():
    import von_pipeline

    pipeline = Pipeline(
        id='von_list_mongo_data',
        description='A pipeline that lists data in mongodb.')

    pipeline.add(
        Task(id='list_mongo_data',
             description='List data queued for processing',
             commands=[ExecutePython('./von_pipeline/list_mongo_data.py')]))

    return pipeline
def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description="Loads daily Euro exchange rates since 1999 from the European Central Bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates the currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql', echo_queries=False)
             ]))

    pipeline.add(
        Task(id='load_exchange_rate',
             description='Loads exchange rates from the European Central Bank',
             commands=[
                 ReadScriptOutput(file_name='load_exchange_rate.py',
                                  target_table='euro_fx.exchange_rate',
                                  db_alias=db_alias)  # use the parameter instead of a hard-coded alias
             ]),
        upstreams=['create_schema_and_table'])

    pipeline.add(
        Task(id="postprocess_exchange_rate",
             description="Adds values for missing days",
             commands=[
                 ExecuteSQL(sql_file_name='postprocess_exchange_rate.sql', echo_queries=False)
             ]),
        upstreams=['load_exchange_rate'])

    return pipeline
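# Hedged usage sketch: 'dwh' is a hypothetical database alias that would have to be
# registered in mara_db.config.databases; run_pipeline is the CLI helper used the same
# way in the loader script at the end of this section.
def load_euro_exchange_rates_example():
    from data_integration.ui.cli import run_pipeline

    # build the pipeline against the configured alias and run it synchronously
    return run_pipeline(euro_exchange_rates_pipeline(db_alias='dwh'))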
def von_data_pipeline_status():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_pipeline_status',
        description='Display overall event processing status.')

    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./von_pipeline/display_pipeline_status.py')]))

    return pipeline
def __(pipeline: pipelines.Pipeline):
    return [
        response.ActionButton(
            action=flask.url_for('data_integration.run_page',
                                 path=pipeline.url_path(), with_upstreams=False),
            label='Run', icon='play', title='Run the pipeline')
    ]
def root_pipeline():
    import app.data_integration.pipelines.github
    import app.data_integration.pipelines.pypi
    import app.data_integration.pipelines.utils
    import app.data_integration.pipelines.python_projects

    pipeline = Pipeline(
        id='mara_example_project',
        description='An example pipeline that integrates PyPI download stats with the Github activity of a project')

    pipeline.add(app.data_integration.pipelines.utils.pipeline)
    pipeline.add(app.data_integration.pipelines.pypi.pipeline, upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.github.pipeline, upstreams=['utils'])
    pipeline.add(app.data_integration.pipelines.python_projects.pipeline,
                 upstreams=['pypi', 'github'])

    return pipeline
def add_schema_copying_to_pipeline(pipeline: Pipeline, schema_name,
                                   source_db_alias: str, target_db_alias: str,
                                   max_number_of_parallel_tasks: int = 4):
    """
    Adds schema copying to the end of a pipeline.

    When the pipeline already has a final node, then all of its commands except the last
    are run before the copying, and the last command is run after it.

    Args:
        pipeline: The pipeline to modify
        schema_name: The schema to copy
        source_db_alias: The alias of the PostgreSQL database to copy from
        target_db_alias: The alias of the PostgreSQL database to copy to
        max_number_of_parallel_tasks: The maximum number of operations to run in parallel
    """
    task_id = "copy_schema"
    description = f"Copies the {schema_name} schema to the {target_db_alias} db"
    commands = []

    if pipeline.final_node:
        assert isinstance(pipeline.final_node, Task)
        description = pipeline.final_node.description + ' + ' + description
        task_id = pipeline.final_node.id + '_and_' + task_id
        commands = pipeline.final_node.commands
        pipeline.remove(pipeline.final_node)

    pipeline.add_final(
        ParallelCopySchema(id=task_id, description=description,
                           schema_name=schema_name,
                           source_db_alias=source_db_alias,
                           target_db_alias=target_db_alias,
                           max_number_of_parallel_tasks=max_number_of_parallel_tasks,
                           commands_before=commands[:-1],
                           commands_after=commands[-1:]))
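# Hedged usage sketch for the helper above: the aliases 'etl' and 'frontend' are
# illustrative and would need to exist in mara_db.config.databases; any pipeline that
# materializes tables into the given schema works.
def mirror_dim_schema_example(pipeline: Pipeline) -> Pipeline:
    # after this call, the pipeline ends with a ParallelCopySchema node that mirrors
    # the 'dim' schema from the ETL db into the frontend db
    add_schema_copying_to_pipeline(pipeline, schema_name='dim',
                                   source_db_alias='etl', target_db_alias='frontend',
                                   max_number_of_parallel_tasks=8)
    return pipeline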
def von_data_pipeline_post_credentials():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_a',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    return pipeline1
def bc_reg_pipeline_post_credentials():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_credential_poster',
        description='A pipeline that posts generated credentials to TOB.')

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials_a',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_a',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3)

    # sub_pipeline1_4 = Pipeline(id='populate_evp_audit_table_a', description='Populate Event Processor Audit Table')
    # sub_pipeline1_4.add(Task(id='populate_audit_table_a', description='Populate Audit Table',
    #                          commands=[ExecutePython('./bcreg/populate_audit_table.py')]))
    # pipeline1.add(sub_pipeline1_4, ['submit_bc_reg_credentials_a'])

    return pipeline1
def db_init_pipeline():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_db_init',
        description='Initialize BC Registries Event Processor database')

    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./bcreg/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./bcreg/insert.py')]),
        ['create_tables'])

    return pipeline
def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description="Creates a utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql', echo_queries=False,
                            replace={'number_of_chunks': lambda: config.number_of_chunks()})
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension + '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in ['consistency_checks', 'data_sets', 'partitioning',
                                        'indexes_and_constraints', 'schema_switching', 'enums']:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
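# Hedged usage sketch: mounting the parameterized utils pipeline into a root pipeline,
# mirroring the composition pattern of root_pipeline above (the ids are illustrative).
def example_root_pipeline_with_utils():
    pipeline = Pipeline(id='example_project',
                        description='Example root pipeline that starts with the utils schema')
    # enable the optional postgresql-hll tasks, skip the cstore_fdw ones
    pipeline.add(utils_pipeline(with_hll=True, with_cstore_fdw=False))
    return pipeline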
def bc_reg_pipeline_status():
    import bcreg

    pipeline = Pipeline(
        id='bc_reg_pipeline_status',
        description='Display overall event processing status.')

    pipeline.add(
        Task(id='display_pipeline_status',
             description='Display the overall pipeline processing status',
             commands=[ExecutePython('./bcreg/display_pipeline_status.py')]))

    # removed from the pipeline due to issues connecting to DBs on OpenShift:
    # pipeline.add(Task(id='display_pipeline_stats', description='Display stats of each stage in the pipeline processing',
    #                   commands=[ExecutePython('./bcreg/display_processed_corps_counts.py')]))

    pipeline.add(
        Task(id='display_event_processor_stats',
             description='Display stats of each event processor stage',
             commands=[ExecutePython('./bcreg/display_event_processor_counts.py')]))

    return pipeline
def db_init_pipeline():
    import von_pipeline

    pipeline = Pipeline(
        id='von_data_db_init',
        description='Initialize von_data Event Processor database')

    pipeline.add(
        Task(id='create_tables',
             description='Create event processing tables',
             commands=[ExecutePython('./von_pipeline/create.py')]))
    pipeline.add(
        Task(id='initialize_tables',
             description='Insert configuration data',
             commands=[ExecutePython('./von_pipeline/insert.py')]),
        ['create_tables'])

    return pipeline
def von_root_pipeline():
    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description='Holder for the different versions of the VON Data Pipeline.')

    parent_pipeline.add(von_data_pipeline())
    parent_pipeline.add(von_data_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')
    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(von_data_pipeline_initial_load())
    init_pipeline.add(von_data_pipeline_post_credentials())
    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(id='test_and_demo_tasks',
                             description='Holder for test and demo tasks.')
    test_pipeline.add(von_data_init_test_data())
    test_pipeline.add(von_data_test_registrations())
    test_pipeline.add(von_data_pipeline_single_thread())
    parent_pipeline.add(test_pipeline)

    return parent_pipeline
def bc_reg_pipeline():
    import bcreg

    pipeline1 = Pipeline(
        id='bc_reg_event_processor',
        description='A pipeline that processes BC Registries events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_bc_reg_data',
        description='Load BC Reg data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_bc_reg_data',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps.py')]),
        ['register_un_processed_events'])
    sub_pipeline1_2.add(
        Task(id='create_bc_reg_credentials',
             description='Create credentials',
             commands=[ExecutePython('./bcreg/generate-creds.py')]),
        ['load_bc_reg_data'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(id='submit_bc_reg_credentials',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_bc_reg_data'])

    return pipeline1
def bc_reg_pipeline_jsonbender():
    import bcreg

    pipeline2 = Pipeline(
        id='bc_reg_event_processor_json_transform_demo',
        description='A demo pipeline that processes events and generates credentials using JSONBender.')

    sub_pipeline2_2 = Pipeline(
        id='load_and_process_bc_reg_data',
        description='Load BC Reg data and generate credentials')
    sub_pipeline2_2.add(
        Task(id='register_un_processed_events',
             description='Register un-processed events',
             commands=[ExecutePython('./bcreg/find-unprocessed-events.py')]))
    sub_pipeline2_2.add(
        Task(id='load_bc_reg_data',
             description='Load BC Registries data',
             commands=[ExecutePython('./bcreg/process-corps.py')]),
        ['register_un_processed_events'])
    sub_pipeline2_2.add(
        Task(id='create_credentials_jsonbender',
             description='Create credentials using JSONBender transform',
             commands=[ExecutePython('./bcreg/generate-creds-bender.py')]),
        ['load_bc_reg_data'])
    pipeline2.add(sub_pipeline2_2)

    sub_pipeline2_3 = Pipeline(id='submit_bc_reg_credentials',
                               description='Submit BC Reg credentials to P-X')
    sub_pipeline2_3.add(
        Task(id='submit_credentials',
             description='Submit credentials',
             commands=[ExecutePython('./bcreg/submit-creds.py')]))
    pipeline2.add(sub_pipeline2_3, ['load_and_process_bc_reg_data'])

    return pipeline2
def run_pipeline(pipeline: pipelines.Pipeline, nodes: {pipelines.Node} = None,
                 with_upstreams: bool = False) -> [events.Event]:
    """
    Runs a pipeline in a forked sub process. Acts as a generator that yields events
    from the sub process.

    Using forking has two advantages:
    1. The pipeline is also forked and thus can be modified without affecting the
       original pipeline.
    2. It's possible to hand over control to the parent process while the pipeline
       is running, for example for sending output to a browser.

    Args:
        pipeline: The pipeline to run
        nodes: A list of pipeline children that should run
        with_upstreams: When true and `nodes` are provided, then all upstreams of
            `nodes` in `pipeline` are also run

    Yields:
        Events emitted during pipeline execution
    """

    # A queue for receiving events from forked sub processes
    event_queue = multiprocessing.Queue()

    # The function that is run in a sub process
    def run():
        # collect system stats in a separate process
        statistics_process = multiprocessing.Process(
            target=lambda: system_statistics.generate_system_statistics(event_queue),
            name='system_statistics')
        statistics_process.start()

        try:
            # capture output of print statements and other unplanned output
            logger.redirect_output(event_queue, pipeline.path())

            # all nodes that have not run yet, ordered by priority
            node_queue: [pipelines.Node] = []

            # data needed for computing cost
            node_durations_and_run_times = node_cost.node_durations_and_run_times(pipeline.path())

            # puts nodes into the node queue
            def queue(nodes: [pipelines.Node]):
                for node in nodes:
                    node_cost.compute_cost(node, node_durations_and_run_times)
                    node_queue.append(node)
                node_queue.sort(key=lambda node: node.cost, reverse=True)

            if nodes:  # only run a set of child nodes
                def with_all_upstreams(nodes: {pipelines.Node}):
                    """recursively find all upstreams of a list of nodes"""
                    return functools.reduce(
                        set.union,
                        [with_all_upstreams(node.upstreams) for node in nodes],
                        nodes)

                # when requested, include all upstreams of nodes, otherwise just use provided nodes
                nodes_to_run = with_all_upstreams(set(nodes)) if with_upstreams else set(nodes)

                # remove everything from the pipeline that should not be run
                # (that makes updating dependencies between nodes easier)
                for node in set(pipeline.nodes.values()) - nodes_to_run:
                    pipeline.remove(node)

                # queue remaining nodes
                queue(list(pipeline.nodes.values()))
            else:
                # remove dependencies to siblings
                pipeline.upstreams = set()
                pipeline.downstreams = set()
                # queue whole pipeline
                queue([pipeline])

            # book keeping
            run_start_time = datetime.datetime.now()
            # all nodes that already ran or that won't be run anymore
            processed_nodes: {pipelines.Node} = set()
            # running pipelines with start times and number of running children
            running_pipelines: {pipelines.Pipeline: [datetime.datetime, int]} = {}
            failed_pipelines: {pipelines.Pipeline} = set()  # pipelines with failed tasks
            running_task_processes: {pipelines.Task: TaskProcess} = {}

            def dequeue() -> pipelines.Node:
                """
                Finds the next task in the queue
                - without upstreams or where all upstreams have been run already
                - where the pipeline specific maximum number of parallel tasks per pipeline
                  is not reached
                """
                for node in node_queue:  # type: pipelines.Node
                    if ((not node.upstreams
                         or len(node.upstreams & processed_nodes) == len(node.upstreams))
                            and (not isinstance(node.parent, pipelines.Pipeline)
                                 or (not node.parent.max_number_of_parallel_tasks)
                                 or (node.parent not in running_pipelines)
                                 or (running_pipelines[node.parent][1]
                                     < node.parent.max_number_of_parallel_tasks))):
                        node_queue.remove(node)
                        if node.parent in failed_pipelines:
                            # if the parent pipeline failed, don't launch new nodes
                            processed_nodes.add(node)
                        else:
                            return node

            def track_finished_pipelines():
                """when all nodes of a pipeline have been processed, then emit events"""
                for running_pipeline, (start_time, running_children) \
                        in dict(running_pipelines).items():  # type: pipelines.Pipeline
                    if len(set(running_pipeline.nodes.values()) & processed_nodes) \
                            == len(running_pipeline.nodes):
                        succeeded = running_pipeline not in failed_pipelines
                        event_queue.put(events.Output(
                            node_path=running_pipeline.path(),
                            format=logger.Format.ITALICS,
                            is_error=not succeeded,
                            message=f'{"succeeded" if succeeded else "failed"}, '
                                    f'{logger.format_time_difference(run_start_time, datetime.datetime.now())}'))
                        event_queue.put(events.NodeFinished(
                            node_path=running_pipeline.path(), start_time=start_time,
                            end_time=datetime.datetime.now(), is_pipeline=True,
                            succeeded=succeeded))
                        del running_pipelines[running_pipeline]
                        processed_nodes.add(running_pipeline)

            # announce run start
            event_queue.put(events.RunStarted(node_path=pipeline.path(),
                                              start_time=run_start_time,
                                              pid=os.getpid()))

            # run as long
            # - as task processes are still running
            # - as there is still stuff in the node queue
            while running_task_processes or node_queue:
                # don't do anything if the maximum number of parallel tasks is currently running
                if len(running_task_processes) < config.max_number_of_parallel_tasks():
                    next_node = dequeue()  # get the next runnable node from the queue

                    if next_node:
                        if isinstance(next_node, pipelines.Pipeline):
                            # connect pipeline nodes without upstreams to upstreams of pipeline
                            for upstream in next_node.upstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.upstreams:
                                        next_node.add_dependency(upstream, pipeline_node)

                            # connect pipeline nodes without downstreams to downstreams of pipeline
                            for downstream in next_node.downstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.downstreams:
                                        next_node.add_dependency(pipeline_node, downstream)

                            # queue all child nodes
                            queue(list(next_node.nodes.values()))

                            # book keeping and event emission
                            pipeline_start_time = datetime.datetime.now()
                            running_pipelines[next_node] = [pipeline_start_time, 0]
                            event_queue.put(events.NodeStarted(next_node.path(), pipeline_start_time, True))
                            event_queue.put(events.Output(
                                node_path=next_node.path(), format=logger.Format.ITALICS,
                                message='★ ' + node_cost.format_duration(
                                    node_durations_and_run_times.get(tuple(next_node.path()), [0, 0])[0])))

                        elif isinstance(next_node, pipelines.ParallelTask):
                            # create sub tasks and queue them
                            try:
                                logger.redirect_output(event_queue, next_node.path())
                                logger.log('☆ Launching tasks', format=logger.Format.ITALICS)
                                sub_pipeline = next_node.launch()
                                next_node.parent.replace(next_node, sub_pipeline)
                                queue([sub_pipeline])
                            except Exception:
                                logger.log(message='Could not launch parallel tasks',
                                           format=logger.Format.ITALICS, is_error=True)
                                logger.log(message=traceback.format_exc(),
                                           format=events.Output.Format.VERBATIM, is_error=True)
                                failed_pipelines.add(next_node.parent)
                            finally:
                                logger.redirect_output(event_queue, pipeline.path())

                        else:
                            # run a task in a subprocess
                            if next_node.parent in running_pipelines:
                                running_pipelines[next_node.parent][1] += 1
                            event_queue.put(events.NodeStarted(next_node.path(), datetime.datetime.now(), False))
                            event_queue.put(events.Output(
                                node_path=next_node.path(), format=logger.Format.ITALICS,
                                message='★ ' + node_cost.format_duration(
                                    node_durations_and_run_times.get(tuple(next_node.path()), [0, 0])[0])))

                            status_queue = multiprocessing.Queue()
                            process = TaskProcess(next_node, event_queue, status_queue)
                            process.start()
                            running_task_processes[next_node] = process

                # check whether some of the running processes finished
                for task_process in list(running_task_processes.values()):  # type: TaskProcess
                    if task_process.is_alive():
                        pass
                    else:
                        del running_task_processes[task_process.task]
                        if task_process.task.parent in running_pipelines:
                            running_pipelines[task_process.task.parent][1] -= 1

                        processed_nodes.add(task_process.task)

                        succeeded = not (task_process.status_queue.get() == False
                                         or task_process.exitcode != 0)
                        if not succeeded:
                            for parent in task_process.task.parents()[:-1]:
                                failed_pipelines.add(parent)

                        end_time = datetime.datetime.now()
                        event_queue.put(events.Output(
                            task_process.task.path(),
                            ('succeeded' if succeeded else 'failed') + ', '
                            + logger.format_time_difference(task_process.start_time, end_time),
                            format=logger.Format.ITALICS, is_error=not succeeded))
                        event_queue.put(events.NodeFinished(
                            task_process.task.path(), task_process.start_time,
                            end_time, False, succeeded))

                # check if some pipelines finished
                track_finished_pipelines()

                # don't busy-wait
                time.sleep(0.001)

        except Exception:
            event_queue.put(events.Output(node_path=pipeline.path(),
                                          message=traceback.format_exc(),
                                          format=logger.Format.ITALICS, is_error=True))

        # run again because `dequeue` might have moved more nodes into `processed_nodes`
        track_finished_pipelines()

        # kill the stats process (joining or terminating does not work in gunicorn)
        os.kill(statistics_process.pid, signal.SIGKILL)
        statistics_process.join()

        # run finished
        event_queue.put(events.RunFinished(node_path=pipeline.path(),
                                           end_time=datetime.datetime.now(),
                                           succeeded=not failed_pipelines))

    # fork the process and run `run`
    run_process = multiprocessing.Process(target=run, name='pipeline-' + '-'.join(pipeline.path()))
    run_process.start()

    # todo: make event handlers configurable (e.g. for slack)
    event_handlers = [run_log.RunLogger()]
    if config.slack_token():
        event_handlers.append(slack.Slack())

    # process messages from forked child processes
    while True:
        try:
            while not event_queue.empty():
                event = event_queue.get(False)
                for event_handler in event_handlers:
                    event_handler.handle_event(event)
                yield event
        except queues.Empty:
            pass
        except Exception:
            yield events.Output(node_path=pipeline.path(),
                                message=traceback.format_exc(),
                                format=logger.Format.ITALICS, is_error=True)
        if not run_process.is_alive():
            break
        time.sleep(0.001)
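# Hedged usage sketch: because run_pipeline is a generator, a caller has to drain it for
# the run to be observed; this minimal consumer prints every event and reports the overall
# outcome (my_pipeline stands for any Pipeline built in this section).
def run_and_print_events(my_pipeline: pipelines.Pipeline) -> bool:
    succeeded = True
    for event in run_pipeline(my_pipeline):
        # events include RunStarted, NodeStarted, Output, NodeFinished and RunFinished
        print(event)
        if isinstance(event, events.RunFinished):
            succeeded = event.succeeded
    return succeeded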
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    source_db = mara_db.dbs.db(self.source_db_alias)
    target_db = mara_db.dbs.db(self.target_db_alias)
    assert isinstance(source_db, mara_db.dbs.PostgreSQLDB)
    assert isinstance(target_db, mara_db.dbs.PostgreSQLDB)

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        pg_version = cursor.connection.server_version

    ddl_task = Task(
        id='create_tables_and_functions',
        description='Re-creates the schema, table structure and functions on the target db',
        commands=[
            # schema and table structure
            bash.RunBash(
                command="(echo 'DROP SCHEMA IF EXISTS " + self.schema_name + " CASCADE;';\\\n"
                        + " pg_dump --username=" + source_db.user + " --host=" + source_db.host
                        + " --schema=" + self.schema_name
                        + " --section=pre-data --no-owner --no-privileges "
                        + source_db.database + ") \\\n"
                        + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False)
                        + ' --quiet'),
            # function definitions
            bash.RunBash(
                command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid), ';') AS def
FROM (SELECT oid, *
      FROM pg_proc p
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc,
     pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
  AND nspname = '{self.schema_name}'" \\\n'''
                        + " | " + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                        + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False))
        ])
    sub_pipeline.add(ddl_task)

    # copy content of tables
    number_of_chunks = self.max_number_of_parallel_tasks * 3
    table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_table_copy_chunk = [0] * number_of_chunks
    table_types = {}

    with mara_db.postgresql.postgres_cursor_context(
            self.source_db_alias) as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute("""
SELECT pg_class.relname AS table,
       relkind,
       CASE WHEN relkind = 'f'
            THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy
            ELSE pg_total_relation_size(pg_class.oid)
       END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name + """'
  AND relkind IN ('r', 'f')
  AND relhassubclass = 'f'
ORDER BY size DESC""")
        for table_name, type, size in cursor.fetchall():
            # greedily assign each table (largest first) to the currently smallest chunk
            smallest_chunk_index = min(range(len(current_size_per_table_copy_chunk)),
                                       key=current_size_per_table_copy_chunk.__getitem__)
            current_size_per_table_copy_chunk[smallest_chunk_index] += size
            table_copy_chunks[smallest_chunk_index].append(table_name)
            table_types[table_name] = type

    copy_tasks = []
    for i, tables in table_copy_chunks.items():
        if tables:
            task = Task(
                id=f'copy_tables_{i}',
                description='Copies table content to the frontend db',
                commands=[
                    RunBash(command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                    + ' | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                    + ' | ' + mara_db.shell.copy_from_stdin_command(
                                        self.target_db_alias,
                                        target_table=f'{self.schema_name}.{table_name}'))
                    for table_name in tables
                ])
            copy_tasks.append(task)
            sub_pipeline.add(task, upstreams=[ddl_task])

    # create indexes
    index_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_index_chunk = [0] * number_of_chunks

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        cursor.execute("""
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
        for ddl, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_index_chunk)),
                                       key=current_size_per_index_chunk.__getitem__)
            current_size_per_index_chunk[smallest_chunk_index] += size
            index_chunks[smallest_chunk_index].append(ddl)

    for i, index_statements in index_chunks.items():
        if index_statements:
            index_task = Task(id=f'add_indexes_{i}',
                              description='Re-creates indexes on frontend db',
                              commands=[
                                  ExecuteSQL(sql_statement=statement, db_alias=self.target_db_alias)
                                  for statement in index_statements
                              ])
            sub_pipeline.add(index_task, upstreams=copy_tasks)
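# Hedged illustration of the chunking heuristic used above for both tables and indexes:
# items are taken in descending order of size, and each one is assigned to the currently
# smallest chunk, which keeps the total work per parallel task roughly balanced.
# Standalone sketch, not part of the class above.
def balance_into_chunks(sizes_by_item: dict, number_of_chunks: int) -> dict:
    chunks = {i: [] for i in range(number_of_chunks)}
    chunk_sizes = [0] * number_of_chunks
    for item, size in sorted(sizes_by_item.items(), key=lambda kv: kv[1], reverse=True):
        smallest = min(range(number_of_chunks), key=chunk_sizes.__getitem__)
        chunk_sizes[smallest] += size
        chunks[smallest].append(item)
    return chunks

# e.g. balance_into_chunks({'a': 90, 'b': 60, 'c': 50, 'd': 10}, 2)
# -> {0: ['a', 'd'], 1: ['b', 'c']}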
def bc_reg_root_pipeline():
    import bcreg

    parent_pipeline = Pipeline(
        id='holder_for_pipeline_versions',
        description='Holder for the different versions of the BC Registries pipeline.')

    parent_pipeline.add(bc_reg_pipeline())
    parent_pipeline.add(bc_reg_pipeline_status())

    init_pipeline = Pipeline(
        id='initialization_and_load_tasks',
        description='One-time initialization and data load tasks')
    init_pipeline.add(db_init_pipeline())
    init_pipeline.add(bc_reg_pipeline_initial_load())
    init_pipeline.add(bc_reg_pipeline_post_credentials())
    parent_pipeline.add(init_pipeline)

    test_pipeline = Pipeline(
        id='test_and_demo_tasks',
        description='Holder for test and demo tasks.')
    test_pipeline.add(bc_init_test_data())
    test_pipeline.add(bc_reg_test_corps())
    test_pipeline.add(bc_reg_pipeline_single_thread())
    test_pipeline.add(bc_reg_pipeline_jsonbender())
    parent_pipeline.add(test_pipeline)

    return parent_pipeline
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

    ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL,
    value     TEXT NOT NULL,
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

    commands = []

    with mara_db.postgresql.postgres_cursor_context(
            self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT typname, nspname
    FROM pg_type
    JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
    JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
)
SELECT column_name
FROM information_schema.columns
LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
WHERE table_schema = {'%s'}
  AND table_name = {'%s'}
  AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

        i = 0
        for column_name, in cursor.fetchall():
            i += 1
            ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
            commands.append(
                ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i}
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value
    ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

    sub_pipeline.add_initial(
        Task(id='create_table', description='Creates the attributes table',
             commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

    chunk_size = math.ceil(len(commands)
                           / (2 * data_integration.config.max_number_of_parallel_tasks()))
    for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
        task = Task(id=str(n), description='Process a portion of the attributes')
        task.add_commands(chunk)
        sub_pipeline.add(task)
import os

from data_integration.commands.bash import RunBash
from data_integration.commands.python import ExecutePython
from data_integration.pipelines import Pipeline, Task
from data_integration.ui.cli import run_pipeline, run_interactively

import mara_db.auto_migration
import mara_db.config
import mara_db.dbs

import data_integration

from bcreg.bcreg_pipelines import db_init_pipeline

mara_host = os.environ.get('MARA_DB_HOST', 'bcregdb')
mara_database = os.environ.get('MARA_DB_DATABASE', 'mara_db')
mara_port = os.environ.get('MARA_DB_PORT', '5432')
mara_user = os.environ.get('MARA_DB_USER', 'mara_db')
mara_password = os.environ.get('MARA_DB_PASSWORD')

mara_db.config.databases = lambda: {
    'mara': mara_db.dbs.PostgreSQLDB(user=mara_user, password=mara_password,
                                     host=mara_host, database=mara_database, port=mara_port)}

parent_pipeline = Pipeline(
    id='holder_for_pipeline_versions',
    description='Holder for the different versions of the BC Registries pipeline.')

parent_pipeline.add(db_init_pipeline())

run_pipeline(parent_pipeline)
def von_data_pipeline_single_thread():
    import von_pipeline

    pipeline1 = Pipeline(
        id='von_data_pipeline_single_thread',
        description='A pipeline that processes von_data events and generates credentials.')

    sub_pipeline1_2 = Pipeline(
        id='load_and_process_von_data_data_single_thread',
        description='Load von_data data and generate credentials')
    sub_pipeline1_2.add(
        Task(id='register_un_processed_events_single_thread',
             description='Register un-processed events',
             commands=[ExecutePython('./von_pipeline/find-unprocessed-events.py')]))
    sub_pipeline1_2.add(
        Task(id='load_von_data_data_single_thread',
             description='Load von_data data',
             commands=[ExecutePython('./von_pipeline/register_un_processed_events')]),
        ['register_un_processed_events_single_thread'])
    pipeline1.add(sub_pipeline1_2)

    sub_pipeline1_3 = Pipeline(
        id='submit_von_data_credentials_single_thread',
        description='Submit von_data credentials to P-X')
    sub_pipeline1_3.add(
        Task(id='submit_credentials_single_thread',
             description='Submit credentials',
             commands=[ExecutePython('./von_pipeline/submit-creds-single-thread.py')]))
    pipeline1.add(sub_pipeline1_3, ['load_and_process_von_data_data_single_thread'])

    return pipeline1