def run_pipeline_and_notify(pipeline: pipelines.Pipeline, nodes: {pipelines.Node} = None):
    """Runs a pipeline and posts run start / failure / success messages to Slack"""
    import os
    import sys

    import requests

    def post_to_slack(text: str):
        """Posts a message to the configured Slack web hook (no-op without a token)"""
        if config.slack_token():
            requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                          json={'text': text})

    # announce who triggered which pipeline (and which nodes, when restricted)
    message = (':hatching_chick: *'
               + (os.environ.get('SUDO_USER') or os.environ.get('USER') or os.getlogin())
               + '* manually triggered run of '
               + ('pipeline <' + config.base_url() + '/' + '/'.join(pipeline.path())
                  + '|' + '/'.join(pipeline.path()) + ' >'
                  if pipeline.parent else 'root pipeline'))
    if nodes:
        message += ', nodes ' + ', '.join([f'`{node.id}`' for node in nodes])
    post_to_slack(message)

    if not run_pipeline(pipeline, nodes):
        post_to_slack(':baby_chick: failed')
        sys.exit(-1)

    post_to_slack(':hatched_chick: succeeded')
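
# A minimal usage sketch (an assumption, not part of the original module):
# `load_root_pipeline()` stands in for however a project obtains its root
# pipeline, and the node ids are illustrative.
def trigger_nightly_run():
    pipeline = load_root_pipeline()  # hypothetical helper
    run_pipeline_and_notify(pipeline,
                            nodes={pipeline.nodes['dim_customer'],
                                   pipeline.nodes['dim_product']})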
def handle_event(self, event: events.Event):
    """
    Sends the output of a node to Slack when the node failed.

    Args:
        event: The current event of interest
    """
    if isinstance(event, events.Output):
        # buffer the output of each node, separated into error and non-error messages
        key = tuple(event.node_path)
        if not self.node_output:
            self.node_output = {}
        if key not in self.node_output:
            self.node_output[key] = {True: [], False: []}
        self.node_output[key][event.is_error].append(event)

    elif isinstance(event, events.NodeFinished):
        key = tuple(event.node_path)
        if not event.succeeded and event.is_pipeline is False:
            message = {'text': '\n:baby_chick: Ooops, a hiccup in '
                               + '_ <' + config.base_url()
                               + flask.url_for('data_integration.node_page',
                                               path='/'.join(event.node_path))
                               + ' | ' + '/'.join(event.node_path) + ' > _',
                       'attachments': []}

            # regular output as a plain attachment
            if self.node_output[key][False]:
                message['attachments'].append(
                    {'text': self.format_output(self.node_output[key][False]),
                     'mrkdwn_in': ['text']})

            # error output as a red attachment
            if self.node_output[key][True]:
                message['attachments'].append(
                    {'text': self.format_output(self.node_output[key][True]),
                     'color': '#eb4d5c',
                     'mrkdwn_in': ['text']})

            response = requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                                     json=message)

            if response.status_code != 200:
                raise ValueError(f'Request to Slack returned an error {response.status_code}. '
                                 f'The response is:\n{response.text}')
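
# For reference, a failed node makes the handler above post a Slack web hook
# payload of this shape (all values illustrative):
#
# {'text': '\n:baby_chick: Ooops, a hiccup in _ <https://example.com/pipeline/node | pipeline/node > _',
#  'attachments': [{'text': '... captured output ...', 'mrkdwn_in': ['text']},
#                  {'text': '... captured error output ...', 'color': '#eb4d5c', 'mrkdwn_in': ['text']}]}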
def handle_event(self, event: events.Event):
    """
    Sends the output of failed nodes and the start / finish of interactively
    triggered runs to Slack.

    Args:
        event: The current event of interest
    """
    import requests

    if isinstance(event, pipeline_events.Output):
        # buffer the output of each node, separated into error and non-error messages
        key = tuple(event.node_path)
        if not self.node_output:
            self.node_output = {}
        if key not in self.node_output:
            self.node_output[key] = {True: [], False: []}
        self.node_output[key][event.is_error].append(event)

    elif isinstance(event, pipeline_events.NodeFinished):
        key = tuple(event.node_path)
        if not event.succeeded and event.is_pipeline is False:
            message = {'text': '\n:baby_chick: Ooops, a hiccup in '
                               + '_ <' + config.base_url() + '/' + '/'.join(event.node_path)
                               + ' | ' + '/'.join(event.node_path) + ' > _',
                       'attachments': []}

            # regular output as a plain attachment
            if self.node_output[key][False]:
                message['attachments'].append(
                    {'text': self.format_output(self.node_output[key][False]),
                     'mrkdwn_in': ['text']})

            # error output as a red attachment
            if self.node_output[key][True]:
                message['attachments'].append(
                    {'text': self.format_output(self.node_output[key][True]),
                     'color': '#eb4d5c',
                     'mrkdwn_in': ['text']})

            response = requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                                     json=message)
            if response.status_code != 200:
                raise ValueError(f'Request to Slack returned an error {response.status_code}. '
                                 f'The response is:\n{response.text}')

    elif isinstance(event, pipeline_events.RunStarted):
        # the default handler only reports interactively started runs
        if event.interactively_started:
            message = f':hatching_chick: *{event.user}* manually triggered run of '
            message += ('pipeline <' + config.base_url() + '/' + '/'.join(event.node_path)
                        + '|' + '/'.join(event.node_path) + ' >'
                        if not event.is_root_pipeline else 'root pipeline')
            if event.node_ids:
                message += ', nodes ' + ', '.join([f'`{id_}`' for id_ in event.node_ids])

            requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                          json={'text': message})

    elif isinstance(event, pipeline_events.RunFinished):
        # the default handler only reports interactively started runs
        if event.interactively_started:
            requests.post('https://hooks.slack.com/services/' + config.slack_token(),
                          json={'text': ':hatched_chick: succeeded' if event.succeeded
                                        else ':baby_chick: failed'})
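
# The same event-handler pattern can serve other notification channels. A
# minimal sketch, assuming an `events.EventHandler` base class with this
# method signature (the base class name and the log file name are assumptions):
class LogFileNotifier(events.EventHandler):
    """Appends failed nodes to a local log file instead of posting to Slack"""

    def handle_event(self, event: events.Event):
        # only react to failed non-pipeline nodes, ignore all other events
        if isinstance(event, pipeline_events.NodeFinished) \
                and not event.succeeded and not event.is_pipeline:
            with open('failed-nodes.log', 'a') as f:
                f.write('/'.join(event.node_path) + ' failed at '
                        + event.end_time.isoformat() + '\n')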
import datetime
import functools
import multiprocessing
import os
import signal
import time
import traceback

# (package-internal imports such as config, events, logger, node_cost, pipelines,
#  run_log, slack, system_statistics, TaskProcess and queues are assumed from the
#  surrounding module)


def run_pipeline(pipeline: pipelines.Pipeline, nodes: {pipelines.Node} = None,
                 with_upstreams: bool = False) -> [events.Event]:
    """
    Runs a pipeline in a forked sub process. Acts as a generator that yields events
    from the sub process.

    Using forking has two advantages:
    1. The pipeline is also forked and thus can be modified without affecting the
       original pipeline.
    2. It's possible to hand over control to the parent process while the pipeline
       is running, for example for sending output to a browser.

    Args:
        pipeline: The pipeline to run
        nodes: A list of pipeline children that should run
        with_upstreams: When true and `nodes` are provided, then all upstreams of
            `nodes` in `pipeline` are also run

    Yields:
        Events emitted during pipeline execution
    """

    # A queue for receiving events from forked sub processes
    event_queue = multiprocessing.Queue()

    # The function that is run in a sub process
    def run():
        # collect system stats in a separate process
        statistics_process = multiprocessing.Process(
            target=lambda: system_statistics.generate_system_statistics(event_queue),
            name='system_statistics')
        statistics_process.start()

        try:
            # capture output of print statements and other unplanned output
            logger.redirect_output(event_queue, pipeline.path())

            # all nodes that have not run yet, ordered by priority
            node_queue: [pipelines.Node] = []

            # data needed for computing cost
            node_durations_and_run_times = node_cost.node_durations_and_run_times(pipeline.path())

            def queue(nodes: [pipelines.Node]):
                """Puts nodes into the node queue, most expensive nodes first"""
                for node in nodes:
                    node_cost.compute_cost(node, node_durations_and_run_times)
                    node_queue.append(node)
                node_queue.sort(key=lambda node: node.cost, reverse=True)

            if nodes:  # only run a set of child nodes
                def with_all_upstreams(nodes: {pipelines.Node}):
                    """Recursively finds all upstreams of a set of nodes"""
                    return functools.reduce(
                        set.union,
                        [with_all_upstreams(node.upstreams) for node in nodes], nodes)

                # when requested, include all upstreams of nodes, otherwise just use provided nodes
                nodes_to_run = with_all_upstreams(set(nodes)) if with_upstreams else set(nodes)

                # remove everything from the pipeline that should not be run
                # (that makes updating dependencies between nodes easier)
                for node in set(pipeline.nodes.values()) - nodes_to_run:
                    pipeline.remove(node)

                # queue remaining nodes
                queue(list(pipeline.nodes.values()))

            else:
                # remove dependencies to siblings
                pipeline.upstreams = set()
                pipeline.downstreams = set()
                # queue whole pipeline
                queue([pipeline])

            # book keeping
            run_start_time = datetime.datetime.now()
            # all nodes that already ran or that won't be run anymore
            processed_nodes: {pipelines.Node} = set()
            # running pipelines with start times and number of running children
            running_pipelines: {pipelines.Pipeline: [datetime.datetime, int]} = {}
            failed_pipelines: {pipelines.Pipeline} = set()  # pipelines with failed tasks
            running_task_processes: {pipelines.Task: TaskProcess} = {}

            def dequeue() -> pipelines.Node:
                """
                Finds the next task in the queue
                - without upstreams or where all upstreams have been run already
                - where the pipeline specific maximum number of parallel tasks
                  per pipeline is not reached
                """
                for node in node_queue:  # type: pipelines.Node
                    if ((not node.upstreams
                         or len(node.upstreams & processed_nodes) == len(node.upstreams))
                            and (not isinstance(node.parent, pipelines.Pipeline)
                                 or not node.parent.max_number_of_parallel_tasks
                                 or node.parent not in running_pipelines
                                 or (running_pipelines[node.parent][1]
                                     < node.parent.max_number_of_parallel_tasks))):
                        node_queue.remove(node)
                        if node.parent in failed_pipelines:
                            # if the parent pipeline failed, don't launch new nodes
                            processed_nodes.add(node)
                        else:
                            return node

            def track_finished_pipelines():
                """When all nodes of a pipeline have been processed, then emit events"""
                for running_pipeline, (start_time, running_children) \
                        in dict(running_pipelines).items():  # type: pipelines.Pipeline
                    if (len(set(running_pipeline.nodes.values()) & processed_nodes)
                            == len(running_pipeline.nodes)):
                        succeeded = running_pipeline not in failed_pipelines
                        event_queue.put(events.Output(
                            node_path=running_pipeline.path(), format=logger.Format.ITALICS,
                            is_error=not succeeded,
                            message=f'{"succeeded" if succeeded else "failed"}, '
                                    f'{logger.format_time_difference(run_start_time, datetime.datetime.now())}'))
                        event_queue.put(events.NodeFinished(
                            node_path=running_pipeline.path(), start_time=start_time,
                            end_time=datetime.datetime.now(), is_pipeline=True,
                            succeeded=succeeded))
                        del running_pipelines[running_pipeline]
                        processed_nodes.add(running_pipeline)

            # announce run start
            event_queue.put(events.RunStarted(node_path=pipeline.path(),
                                              start_time=run_start_time, pid=os.getpid()))

            # run as long
            # - as task processes are still running
            # - as there is still stuff in the node queue
            while running_task_processes or node_queue:
                # don't do anything if the maximum number of parallel tasks is currently running
                if len(running_task_processes) < config.max_number_of_parallel_tasks():
                    next_node = dequeue()  # get the next runnable node from the queue

                    if next_node:
                        if isinstance(next_node, pipelines.Pipeline):
                            # connect pipeline nodes without upstreams to upstreams of pipeline
                            for upstream in next_node.upstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.upstreams:
                                        next_node.add_dependency(upstream, pipeline_node)

                            # connect pipeline nodes without downstreams to downstreams of pipeline
                            for downstream in next_node.downstreams:
                                for pipeline_node in next_node.nodes.values():
                                    if not pipeline_node.downstreams:
                                        next_node.add_dependency(pipeline_node, downstream)

                            # queue all child nodes
                            queue(list(next_node.nodes.values()))

                            # book keeping and event emission
                            pipeline_start_time = datetime.datetime.now()
                            running_pipelines[next_node] = [pipeline_start_time, 0]
                            event_queue.put(events.NodeStarted(
                                next_node.path(), pipeline_start_time, True))
                            event_queue.put(events.Output(
                                node_path=next_node.path(), format=logger.Format.ITALICS,
                                message='★ ' + node_cost.format_duration(
                                    node_durations_and_run_times.get(
                                        tuple(next_node.path()), [0, 0])[0])))

                        elif isinstance(next_node, pipelines.ParallelTask):
                            # create sub tasks and queue them
                            try:
                                logger.redirect_output(event_queue, next_node.path())
                                logger.log('☆ Launching tasks', format=logger.Format.ITALICS)
                                sub_pipeline = next_node.launch()
                                next_node.parent.replace(next_node, sub_pipeline)
                                queue([sub_pipeline])

                            except Exception:
                                logger.log(message='Could not launch parallel tasks',
                                           format=logger.Format.ITALICS, is_error=True)
                                logger.log(message=traceback.format_exc(),
                                           format=events.Output.Format.VERBATIM, is_error=True)
                                failed_pipelines.add(next_node.parent)
                            finally:
                                logger.redirect_output(event_queue, pipeline.path())

                        else:
                            # run a task in a sub process
                            if next_node.parent in running_pipelines:
                                running_pipelines[next_node.parent][1] += 1
                            event_queue.put(events.NodeStarted(
                                next_node.path(), datetime.datetime.now(), False))
                            event_queue.put(events.Output(
                                node_path=next_node.path(), format=logger.Format.ITALICS,
                                message='★ ' + node_cost.format_duration(
                                    node_durations_and_run_times.get(
                                        tuple(next_node.path()), [0, 0])[0])))

                            status_queue = multiprocessing.Queue()
                            process = TaskProcess(next_node, event_queue, status_queue)
                            process.start()
                            running_task_processes[next_node] = process

                # check whether some of the running processes finished
                for task_process in list(running_task_processes.values()):  # type: TaskProcess
                    if not task_process.is_alive():
                        del running_task_processes[task_process.task]
                        if task_process.task.parent in running_pipelines:
                            running_pipelines[task_process.task.parent][1] -= 1

                        processed_nodes.add(task_process.task)

                        succeeded = not (task_process.status_queue.get() == False
                                         or task_process.exitcode != 0)
                        if not succeeded:
                            # propagate the failure to all enclosing pipelines
                            for parent in task_process.task.parents()[:-1]:
                                failed_pipelines.add(parent)

                        end_time = datetime.datetime.now()
                        event_queue.put(events.Output(
                            task_process.task.path(),
                            ('succeeded' if succeeded else 'failed') + ', '
                            + logger.format_time_difference(task_process.start_time, end_time),
                            format=logger.Format.ITALICS, is_error=not succeeded))
                        event_queue.put(events.NodeFinished(
                            task_process.task.path(), task_process.start_time,
                            end_time, False, succeeded))

                # check if some pipelines finished
                track_finished_pipelines()

                # don't busy-wait
                time.sleep(0.001)

        except Exception:
            event_queue.put(events.Output(node_path=pipeline.path(),
                                          message=traceback.format_exc(),
                                          format=logger.Format.ITALICS, is_error=True))

        # run again because `dequeue` might have moved more nodes to `processed_nodes`
        track_finished_pipelines()

        # kill the stats process (joining or terminating does not work in gunicorn)
        os.kill(statistics_process.pid, signal.SIGKILL)
        statistics_process.join()

        # run finished
        event_queue.put(events.RunFinished(node_path=pipeline.path(),
                                           end_time=datetime.datetime.now(),
                                           succeeded=not failed_pipelines))

    # fork the process and run `run`
    run_process = multiprocessing.Process(target=run,
                                          name='pipeline-' + '-'.join(pipeline.path()))
    run_process.start()

    # todo: make event handlers configurable (e.g. for slack)
    event_handlers = [run_log.RunLogger()]
    if config.slack_token():
        event_handlers.append(slack.Slack())

    # process messages from forked child processes
    while True:
        try:
            while not event_queue.empty():
                event = event_queue.get(False)
                for event_handler in event_handlers:
                    event_handler.handle_event(event)
                yield event
        except queues.Empty:
            pass
        except Exception:
            yield events.Output(node_path=pipeline.path(), message=traceback.format_exc(),
                                format=logger.Format.ITALICS, is_error=True)
        if not run_process.is_alive():
            break
        time.sleep(0.001)
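
# Because `run_pipeline` is a generator, callers can react to events as they
# arrive. A minimal sketch of a console runner on top of it, grounded in the
# events emitted above (the printing format is illustrative):
def run_and_report(pipeline: pipelines.Pipeline) -> bool:
    """Runs a pipeline, prints its output and returns whether it succeeded"""
    succeeded = False
    for event in run_pipeline(pipeline):
        if isinstance(event, events.Output):
            print('/'.join(event.node_path) + ': ' + event.message)
        elif isinstance(event, events.RunFinished):
            succeeded = event.succeeded
    return succeeded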