Example 1
    def check_ping_files(self, print_more_warnings=False,
                         print_details=False, fix_problems=False):
        run_problems = list()
        queue_problems = list()
        check_queue = True
        
        try:
            stat_output = subprocess.check_output(
                [self.cc('stat')], stderr=subprocess.STDOUT)
        except (KeyError, OSError, subprocess.CalledProcessError):
            # no usable stat tool on this host: the tool may be unconfigured
            # (KeyError), missing (OSError), or the call itself may have
            # failed (CalledProcessError)
            check_queue = False
            
        if print_more_warnings and not check_queue:
            print("Attention, we cannot check stale queued ping files because "
                  "this host does not have %s." % self.cc('stat'))
            
        running_jids = set()
        
        if check_queue:
            for line in stat_output.decode('utf-8').splitlines():
                try:
                    jid = int(line.strip().split(' ')[0])
                    running_jids.add(str(jid))
                except ValueError:
                    # this is not a JID
                    pass
        
        now = datetime.datetime.now()
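        # each task touches an "executing" ping file periodically while it
        # runs and leaves a "queued" ping file while waiting in the cluster
        # queue; a stale mtime or an unknown job id means something died
        # without cleaning up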
        for task in self.all_tasks_topologically_sorted:
            exec_ping_file = task.get_run().get_executing_ping_file()
            queued_ping_file = task.get_run().get_queued_ping_file()
            if os.path.exists(exec_ping_file):
                with open(exec_ping_file, 'r') as f:
                    info = yaml.load(f, Loader=yaml.FullLoader)
                start_time = info['start_time']
                last_activity = datetime.datetime.fromtimestamp(
                    abstract_step.AbstractStep.fsc.getmtime(exec_ping_file))
                last_activity_difference = now - last_activity
                if last_activity_difference.total_seconds() > \
                   abstract_step.AbstractStep.PING_TIMEOUT:
                    run_problems.append((task, exec_ping_file,
                                         last_activity_difference,
                                         last_activity - start_time))

            if os.path.exists(queued_ping_file) and check_queue:
                with open(queued_ping_file, 'r') as f:
                    info = yaml.load(f, Loader=yaml.FullLoader)
                if str(info['job_id']) not in running_jids:
                    queue_problems.append((task, queued_ping_file,
                                           info['submit_time']))
           
        show_hint = False
        
        if len(run_problems) > 0:
            show_hint = True
            label = "Warning: There are %d stale run ping files." % len(run_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                run_problems = sorted(run_problems, key=itemgetter(2, 3), reverse=True)
                for problem in run_problems:
                    task = problem[0]
                    path = problem[1]
                    last_activity_difference = problem[2]
                    ran_for = problem[3]
                    print("dead since %13s, ran for %13s: %s" % (
                        misc.duration_to_str(last_activity_difference), 
                        misc.duration_to_str(ran_for), task))
                print("")
                    
        if len(queue_problems) > 0:
            show_hint = True
            label = "Warning: There are %d tasks marked as queued, but they do not seem to be queued." % len(queue_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                queue_problems = sorted(queue_problems, key=itemgetter(2), reverse=True)
                for problem in queue_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    print("submitted at %13s: %s" % (start_time, task))
                print("")
                
        if fix_problems:
            all_problems = run_problems + queue_problems
            for problem in all_problems:
                path = problem[1]
                print("Now deleting %s..." % path)
                os.unlink(path)
                
        if show_hint:
            if (print_more_warnings and not print_details) or not fix_problems:
                print("Hint: Run 'uap %s fix-problems --details' to see the "
                      "details."  % self.config_file_name)
            if not fix_problems:
                print("Hint: Run 'uap %s fix-problems --srsly' to fix these "
                      "problems (that is, delete all problematic ping files)."
                      % self.config_file_name)
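
For context, here is a minimal, self-contained sketch of the staleness test this method relies on, assuming a ping file is a small YAML document whose modification time is refreshed while the task runs; PING_TIMEOUT's value and the is_stale helper are illustrative, not uap's actual API:

# Hypothetical illustration of the staleness check used above.
import datetime
import os

PING_TIMEOUT = 300  # assumed value, in seconds

def is_stale(ping_file):
    """Return the inactivity duration if the ping file is stale, else None."""
    last_activity = datetime.datetime.fromtimestamp(os.path.getmtime(ping_file))
    inactive_for = datetime.datetime.now() - last_activity
    if inactive_for.total_seconds() > PING_TIMEOUT:
        return inactive_for
    return None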
Example 2
def main(args):

    def exit(signum, frame):
        sys.exit(1)
    signal.signal(signal.SIGPIPE, exit)

    if args.sources:
        # print all sources (i.e. instances of AbstractSourceStep)
        args.no_tool_checks = True
        pipeline.Pipeline(arguments=args).print_source_runs()

    elif args.job_ids:
        args.no_tool_checks = True
        p = pipeline.Pipeline(arguments=args)
        ids = p.get_cluster_job_ids()
        print(' '.join(ids))
        return

    elif args.run and not args.details:
        # print run infos of one or more specific tasks
        args.no_tool_checks = True
        p = pipeline.Pipeline(arguments=args)
        for task in p.get_task_with_list():
            print('## %s\n' % task)
            report = task.get_run().as_dict(commands=True)
            print(yaml.dump(report, Dumper=misc.UAPDumper,
                            default_flow_style=False))

    elif args.graph:
        p = pipeline.Pipeline(arguments=args)
        step_order = p.topological_step_order
        # every step pushes the steps below it to the right by two columns
        # per child, so the connectors drawn below have room
        indents = [0 for _ in step_order]
        for index, step_name in enumerate(step_order):
            child_count = len(p.steps[step_name].children_step_names)
            indent = child_count * 2
            for below in range(index + 1, len(step_order)):
                indents[below] += indent

        lines = list()
        for index, step_name in enumerate(step_order):
            lines.append(list(' ' * indents[index]))

        # draw horizontal line parts
        for index, step_name in enumerate(step_order):
            child_order = [
                _ for _ in step_order if _ in p.steps[step_name].children_step_names]
            for child_index, child in enumerate(child_order):
                x0 = indents[index] + 1 + child_index * 2
                y = step_order.index(child)
                x1 = indents[y]
                for x in range(x0, x1):
                    lines[y][x] = "─"
                lines[y][x0 - 1] = "└"

        # draw vertical line parts
        for index, step_name in enumerate(step_order):
            child_order = [
                _ for _ in step_order if _ in p.steps[step_name].children_step_names]
            for child_index, child in enumerate(child_order):
                x = indents[index] + child_index * 2
                y0 = index + 1
                y1 = step_order.index(child)
                for y in range(y0, y1):
                    lines[y][x] = "│"

        # write step names and states
        for index, line in enumerate(lines):
            original_step_name_label = ''
            step = p.steps[step_order[index]]
            if step.get_step_name() != step.get_step_type():
                original_step_name_label = ' (%s)' % step.get_step_type()

            if args.no_tool_checks:
                run_info = '%d runs' % len(step.get_runs())
            else:
                run_info = step.get_run_info_str(
                    progress=True, do_hash=args.hash)
            line = "%s%s%s [%s]" % (''.join(line).replace(
                "─└", "─┴"), step.get_step_name(), original_step_name_label, run_info)
            print(line)

    elif args.details:
        p = pipeline.Pipeline(arguments=args)
        n_per_state = dict()
        tasks = p.get_task_with_list()
        for i, task in enumerate(tasks):
            len_tag = '[%d/%d] ' % (i + 1, len(tasks))
            sys.stdout.write(len_tag)
            state = task.get_task_state(do_hash=args.hash)
            n_per_state.setdefault(state, 0)
            n_per_state[state] += 1
            if state == p.states.FINISHED:
                message = '%s is finished' % task
                if args.hash:
                    message += ' and sha256sum(s) correct'
                print(message)

            elif state == p.states.READY:
                print('%s has all inputs and is ready to start running' % task)

            elif state == p.states.EXECUTING:
                exec_ping_file = task.get_run().get_executing_ping_file()
                try:
                    with open(exec_ping_file, 'r') as buff:
                        info = yaml.load(buff,
                                         Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(exec_ping_file):
                        raise e
                    else:
                        print(
                            '%s is executing but seems to have stopped '
                            'just now' % task)
                else:
                    print('%s is executing since %s' %
                          (task, info['start_time']))

            elif state == p.states.QUEUED:
                queued_ping_file = task.get_run().get_queued_ping_file()
                try:
                    with open(queued_ping_file, 'r') as buff:
                        info = yaml.load(buff,
                                         Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(queued_ping_file):
                        raise e
                    print('%s is queued but seems to have stopped just now'
                          % task)
                else:
                    print('%s is queued with id %s since %s' %
                          (task, info['cluster job id'], info['submit_time']))

            elif state == p.states.VOLATILIZED:
                print('%s was volatilized and must be re-run if the data is '
                      'needed' % task)

            elif state == p.states.WAITING:
                parents = [str(parent) for parent in task.get_parent_tasks()
                           if parent.get_task_state() != p.states.FINISHED]
                print('%s is waiting for %s' % (task, parents))

            elif state == p.states.CHANGED:
                heading = 'changes in task %s' % task
                print(heading)
                print('-' * len(len_tag + heading))
                run = task.get_run()
                anno_data = run.written_anno_data()
                has_date_change = False
                has_only_date_change = True
                if not anno_data:
                    has_only_date_change = False
                    print('No annotation file.')
                else:
                    changes = run.get_changes()
                    if changes:
                        has_only_date_change = False
                        print(yaml.dump(dict(changes), Dumper=misc.UAPDumper,
                                        default_flow_style=False))
                    for line in run.file_changes(do_hash=args.hash,
                                                 report_correct=True):
                        if ' modification date after' in line:
                            has_date_change = True
                        else:
                            has_only_date_change = False
                        print(line)
                if has_date_change and has_only_date_change:
                    print("\nThis may be fixed with "
                          "'uap %s fix-problems --file-modification-date "
                          "--srsly'." %
                          args.config.name)
                print('')

            elif state == p.states.BAD:
                heading = 'errors of bad task %s' % task
                print(heading)
                print('-' * len(heading) + '\n')
                run = task.get_run()
                found_error = False
                anno_file = run.get_annotation_path()
                stale = run.is_stale()
                if stale:
                    print('Marked as executing but did not show activity '
                          'for %s.\n' % misc.duration_to_str(stale))
                anno_data = run.written_anno_data()
                if not anno_data:
                    print('No annotation file.')
                else:
                    failed = dict()
                    stderr = dict()
                    try:
                        procs = anno_data['pipeline_log'].get('processes', [])
                        for proc in procs:
                            if proc.get('exit_code', 0) == 0:
                                continue
                            failed[proc['name']] = {
                                'command': list2cmdline(proc['args']),
                                'exit code': proc['exit_code']
                            }
                            stderr[proc['name']] = proc['stderr_copy']['tail']
                            if isinstance(stderr[proc['name']], bytes):
                                stderr[proc['name']] = stderr[proc['name']]\
                                    .decode('utf-8')
                    except KeyError as e:
                        print('The annotation file "%s" seems badly '
                              'formated: %s\n' % (anno_file, e))
                    else:
                        time = anno_data.get('end_time', 'not available')
                        rd = anno_data.get('run', dict())
                        host = rd.get('hostname', 'not available')
                        job_id = rd.get('cluster job id', 'not available')
                        print('host: %s' % host)
                        print('time: %s' % time)
                        print('cluster job id: %s' % job_id)
                        print('')
                    if failed:
                        found_error = True
                        print('## FAILED COMMANDS ##')
                        for tool, failings in failed.items():
                            sys.stdout.write(yaml.dump({tool: failings},
                                                       Dumper=misc.UAPDumper,
                                                       default_flow_style=False))
                            err = stderr[tool].replace('\n', '\n    ')
                            print('  stderr: %s' % err)
                    else:
                        print('No failed commands found in the annotation file.')
                        print(
                            "Run 'uap %s fix-problems --first-error' to "
                            "investigate.\n" % args.config.name)
                    run_data = anno_data.get('run', [])
                    if 'error' in run_data:
                        found_error = True
                        print('## ERROR ##\n%s\n' % run_data['error'])
                    if not found_error:
                        print('No errors found.')
                        print('')

            else:
                print('%s has unknown state "%s"' % (task, state.lower()))
            sys.stdout.flush()

        stat_counts = ["%d %s" % (n_per_state.get(state, 0), state.lower())
                       for state in p.states.order]
        print("\nruns: %d total, %s" %
              (sum(n_per_state.values()), ', '.join(stat_counts)))

        if p.states.CHANGED in n_per_state.keys():
            print(
                "If you want to force overwrite of the changed tasks, run\n"
                "'uap %s run-locally --force' or 'uap %s submit-to-cluster --force'." %
                (args.config.name, args.config.name))
            print(
                "If you want to ignore the changes and consider the tasks finished, run\n"
                "'uap %s run-locally --ignore' or 'uap %s submit-to-cluster --ignore'." %
                (args.config.name, args.config.name))

        # now check ping files and print some warnings and instructions if
        # something's fishy
        p.check_ping_files(print_more_warnings=args.verbose > 0)

        # Now check whether we can volatilize files, but don't do it.
        p.check_volatile_files()

    else:
        # print all runs
        '''
        prints a summary of all runs, indicating whether each run is
        - ``[r]eady``
        - ``[w]aiting``
        - ``[q]ueued``
        - ``[e]xecuting``
        - ``[b]ad``
        - ``[f]inished``
        - ``[c]hanged``
        - ``[v]olatilized``
        '''
        p = pipeline.Pipeline(arguments=args)
        output = list()
        tasks_for_status = {}
        tasks = p.all_tasks_topologically_sorted

        task_iter = tqdm(
            tasks,
            desc='tasks',
            bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}')
        try:
            for task in task_iter:
                state = task.get_task_state(do_hash=args.hash)
                tasks_for_status.setdefault(state, list())
                tasks_for_status[state].append(task)
        except BaseException:
            task_iter.close()
            raise

        for status in p.states.order:
            if status not in tasks_for_status:
                continue
            heading = "%s runs" % string.capwords(status)
            output.append(heading)
            output.append('-' * len(heading))
            if args.summarize:
                step_count = dict()
                step_order = list()
                for task in tasks_for_status[status]:
                    if not str(task.step) in step_count:
                        step_count[str(task.step)] = 0
                        step_order.append(str(task.step))
                    step_count[str(task.step)] += 1
                for step_name in step_order:
                    output.append("[%s]%4d %s"
                                  % (status.lower()[0],
                                     step_count[step_name],
                                     step_name)
                                  )
            else:
                for task in tasks_for_status[status]:
                    output.append("[%s] %s"
                                  % (
                                      status[0].lower(),
                                      task))
                output.append('')
        output.append("runs: %d total, %s"
                      % (len(p.all_tasks_topologically_sorted),
                         ', '.join(["%d %s" % (
                             len(tasks_for_status[_]),
                             _.lower()) for _ in p.states.order
                             if _ in tasks_for_status])))
        pydoc.pager("\n".join(output))

        if p.states.CHANGED in tasks_for_status.keys() \
                or p.states.BAD in tasks_for_status.keys() \
                or p.states.WAITING in tasks_for_status.keys():
            print("\nRun 'uap %s status --details' to inspect states." %
                  args.config.name)

        # now check ping files and print some warnings and instructions if
        # something's fishy
        p.check_ping_files(print_more_warnings=args.verbose > 0)

        # Now check whether we can volatilize files, but don't do it.
        p.check_volatile_files()
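
The --graph branch draws the step tree by reserving two character columns per child edge and painting box-drawing connectors into pre-allocated character rows. A self-contained sketch of the same technique with a hard-coded toy step order and child map (the step names are illustrative only):

# Toy DAG drawn with the same indent/box-drawing scheme as the --graph branch.
steps = ['fastq_source', 'cutadapt', 'segemehl', 'counts']
children = {
    'fastq_source': ['cutadapt'],
    'cutadapt': ['segemehl', 'counts'],
    'segemehl': [],
    'counts': [],
}

# every step pushes the steps below it right by two columns per child
indents = [0] * len(steps)
for index, name in enumerate(steps):
    for below in range(index + 1, len(steps)):
        indents[below] += len(children[name]) * 2

lines = [list(' ' * indents[i]) for i in range(len(steps))]

for index, name in enumerate(steps):
    for child_index, child in enumerate(children[name]):
        y = steps.index(child)
        x0 = indents[index] + 1 + child_index * 2
        for x in range(x0, indents[y]):
            lines[y][x] = '─'        # horizontal run towards the child
        lines[y][x0 - 1] = '└'       # corner below the parent's column
        for yy in range(index + 1, y):
            lines[yy][x0 - 1] = '│'  # vertical run through rows in between

for index, name in enumerate(steps):
    print(''.join(lines[index]).replace('─└', '─┴') + name)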
Example 3
def render_pipeline_hash(log):
        
    def pid_hash(pid, suffix=''):
        hashtag = "%s/%s/%d/%s" % (log['step']['name'], 
                                   log['run']['run_id'], 
                                   pid, suffix)
        return misc.str_to_sha1(hashtag)
        
    def file_hash(path):
        if path in log['step']['known_paths']:
            if 'real_path' in log['step']['known_paths'][path]:
                path = log['step']['known_paths'][path]['real_path']
        return misc.str_to_sha1(path)
        
        
    hash = dict()
    hash['nodes'] = dict()
    hash['edges'] = dict()
    hash['clusters'] = dict()
    hash['graph_labels'] = dict()
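    # four dicts keyed by sha1 digests: nodes and edges for the rendered
    # graph, clusters to group one task's processes, graph_labels for the
    # task summary text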
        
    def add_file_node(path):
        if path not in log['step']['known_paths']:
            return
                
        if 'real_path' in log['step']['known_paths'][path]:
            path = log['step']['known_paths'][path]['real_path']
        label = log['step']['known_paths'][path]['label']
        color = '#ffffff'
        if log['step']['known_paths'][path]['type'] == 'fifo':
            color = '#c4f099'
        elif log['step']['known_paths'][path]['type'] == 'file':
            color = '#8ae234'
        elif log['step']['known_paths'][path]['type'] == 'step_file':
            color = '#97b7c8'
            if path in log['step']['known_paths']:
                if 'size' in log['step']['known_paths'][path]:
                    label += "\\n%s" % misc.bytes_to_str(
                        log['step']['known_paths'][path]['size'])
        hash['nodes'][misc.str_to_sha1(path)] = {
            'label': label,
            'fillcolor': color
        }
            
    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        # fall back to the PID when the process name is unknown
        name = proc_info.get('name', '(unknown)')
        label = proc_info.get('name', "PID %d" % pid)
        if 'writes' in proc_info['hints']:
            for path in proc_info['hints']['writes']:
                add_file_node(path)
        if 'args' in proc_info:
            stripped_args = []
            for arg in copy.deepcopy(proc_info['args']):
                if arg in log['step']['known_paths']:
                    add_file_node(arg)
                    if log['step']['known_paths'][arg]['type'] != 'step_file':
                        arg = log['step']['known_paths'][arg]['label']
                    else:
                        arg = os.path.basename(arg)
                else:
                    if not arg.startswith('/dev'):
                        arg = os.path.basename(arg)
                        if (len(arg) > 16) and re.match('^[A-Z]+$', arg):
                            arg = "%s[...]" % arg[:16]
                stripped_args.append(arg.replace('\t', '\\t').replace(
                    '\\', '\\\\'))
            tw = textwrap.TextWrapper(
                width=50,
                break_long_words=False,
                break_on_hyphens=False)
            label = "\\n".join(tw.wrap(' '.join(stripped_args)))
        if 'args' in proc_info:
            output_fifo = False
            for arg in proc_info['args']:
                fifo_type = None
                if name == 'cat4m' and arg == '-o':
                    output_fifo = True
                elif name == 'dd' and arg.startswith('of='):
                    output_fifo = True
                for known_path in log['step']['known_paths'].keys():
                    if known_path in arg:
                        add_file_node(known_path)
                        if name in ['cat4m', 'dd']:
                            if output_fifo:
                                fifo_type = 'output'
                            else:
                                fifo_type = 'input'
                        else:
                            # we can't know whether the fifo is for input or
                            # output, first look at the hints, then use the
                            # designation (if any was given)
                            if 'reads' in proc_info['hints'] and \
                               arg in proc_info['hints']['reads']:
                                fifo_type = 'input'
                            if 'writes' in proc_info['hints'] and \
                               arg in proc_info['hints']['writes']:
                                fifo_type = 'output'
                            if fifo_type is None:
                                fifo_type = log['step']['known_paths'][
                                    known_path]['designation']
                        if fifo_type == 'input':
                            # add edge from file to proc
                            hash['edges'][(file_hash(arg), pid_hash(pid))] \
                                = dict()
                        elif fifo_type == 'output':
                            # add edge from proc to file
                            hash['edges'][(pid_hash(pid), file_hash(arg))] \
                                = dict()
        if 'writes' in proc_info['hints']:
            for path in proc_info['hints']['writes']:
                hash['edges'][(pid_hash(pid), file_hash(path))] = dict()
        # add proc
        something_went_wrong = False
        if 'signal' in proc_info:
            something_went_wrong = True
        elif 'exit_code' in proc_info:
            if proc_info['exit_code'] != 0:
                something_went_wrong = True
        else:
            something_went_wrong = True
        color = "#fce94f"
        if something_went_wrong:
            if pid not in log['pipeline_log']['ok_to_fail']:
                color = "#d5291a"
            if 'signal' in proc_info:
                label = "%s\\n(received %s%s)" % (
                    label,
                    'friendly '
                    if pid in log['pipeline_log']['ok_to_fail'] else '',
                    proc_info['signal_name'] if 'signal_name' in proc_info
                    else 'signal %d' % proc_info['signal'])
            elif 'exit_code' in proc_info:
                if proc_info['exit_code'] != 0:
                    label = "%s\\n(failed with exit code %d)" % (
                        label, proc_info['exit_code'])
            else:
                label = "%s\\n(no exit code)" % label
                    
        if 'max' in log['pipeline_log']['process_watcher']:
            if pid in log['pipeline_log']['process_watcher']['max']:
                max_info = log['pipeline_log']['process_watcher']['max'][pid]
                label += "\\n%1.1f%% CPU, %s RAM (%1.1f%%)" % (
                    max_info['cpu_percent'],
                    misc.bytes_to_str(max_info['rss']),
                    max_info['memory_percent'])
                
        hash['nodes'][pid_hash(pid)] = {
            'label': label,
            'fillcolor': color
        }
            
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                if ('exit_code' in proc_info[key]) and \
                   (proc_info[key]['exit_code'] == 0) and \
                   ('length' in proc_info[key]) and \
                   (proc_info[key]['length'] == 0) and \
                   ('sink_full_path' not in proc_info[key]):
                    # skip this stdout/stderr box if it leads to nothing
                    continue
                size_label = '(empty)'
                if ('length' in proc_info[key]) and \
                   (proc_info[key]['length'] > 0):
                    speed = float(proc_info[key]['length']) / (
                        proc_info[key]['end_time'] - 
                        proc_info[key]['start_time']).total_seconds()
                    speed_label = "%s/s" % misc.bytes_to_str(speed)
                    size_label = "%s / %s lines (%s)" % (
                        misc.bytes_to_str(proc_info[key]['length']),
                        "{:,}".format(proc_info[key]['lines']),
                        speed_label)
                label = "%s\\n%s" % (which, size_label)
                
                something_went_wrong = False
                if 'signal' in proc_info[key]:
                    something_went_wrong = True
                elif 'exit_code' in proc_info[key]:
                    if proc_info[key]['exit_code'] != 0:
                        something_went_wrong = True
                else:
                    something_went_wrong = True
                color = "#fdf3a7"
                if something_went_wrong:
                    if pid not in log['pipeline_log']['ok_to_fail']:
                        color = "#d5291a"
                    if 'signal' in proc_info[key]:
                        label = "%s\\n(received %s%s)" % (
                            label,
                            "friendly " if pid in
                            log['pipeline_log']['ok_to_fail'] else '',
                            proc_info[key]['signal_name']
                            if 'signal_name' in proc_info[key]
                            else 'signal %d' % proc_info[key]['signal'])
                    elif 'exit_code' in proc_info[key]:
                        if proc_info[key]['exit_code'] != 0:
                            label = "%s\\n(failed with exit code %d)" % (
                                label, proc_info[key]['exit_code'])
                    else:
                        label = "%s\\n(no exit code)" % label
                        
                                
                # add proc_which
                hash['nodes'][pid_hash(pid, which)] = {
                    'label': label,
                    'fillcolor': color
                }
                if 'sink_full_path' in proc_info[key]:
                    path = proc_info[key]['sink_full_path']
                    add_file_node(path)

    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        if 'use_stdin_of' in proc_info:
            other_pid = proc_info['use_stdin_of']
            hash['edges'][(pid_hash(other_pid, 'stdout'), pid_hash(pid))] \
                = dict()
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                other_pid = proc_info[key]['pid']
                hash['edges'][(pid_hash(pid), pid_hash(pid, which))] = dict()
                if 'sink_full_path' in proc_info[key]:
                    hash['edges'][(
                        pid_hash(pid, which),
                        file_hash(proc_info[key]['sink_full_path']))] = dict()

    # define nodes which go into subgraph
    step_file_nodes = dict()
    for path, path_info in log['step']['known_paths'].items():
        if path_info['type'] == 'step_file':
            step_file_nodes[file_hash(path)] = path_info['designation']

    task_name = "%s/%s" % (log['step']['name'], log['run']['run_id'])
    cluster_hash = misc.str_to_sha1(task_name)
    hash['clusters'][cluster_hash] = dict()
    hash['clusters'][cluster_hash]['task_name'] = task_name
    hash['clusters'][cluster_hash]['group'] = list()
    for node in hash['nodes'].keys():
        if node not in step_file_nodes:
            hash['clusters'][cluster_hash]['group'].append(node)
                
    start_time = log['start_time']
    end_time = log['end_time']
    duration = end_time - start_time

    text = "Task: %s\\lHost: %s\\lDuration: %s\\l" % (
        task_name, socket.gethostname(),
        misc.duration_to_str(duration, long=True)
    )
    hash['graph_labels'][task_name] = text
    if 'max' in log['pipeline_log']['process_watcher']:
        watcher_max = log['pipeline_log']['process_watcher']['max']
        text = "CPU: %1.1f%%, %d cores requested, RAM: %s (%1.1f%%)\\l" % (
            watcher_max['sum']['cpu_percent'],
            log['step']['cores'],
            misc.bytes_to_str(watcher_max['sum']['rss']),
            watcher_max['sum']['memory_percent'])
        hash['graph_labels'][task_name] += text
    if 'signal' in log:
        hash['graph_labels'][task_name] += "Caught signal: %s\\l" % (
            process_pool.ProcessPool.SIGNAL_NAMES[log['signal']])
    hash['graph_labels'][task_name] += "\\l"
    return hash
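
render_pipeline_hash only builds the node, edge, cluster, and label dictionaries; serializing them to GraphViz happens elsewhere in uap. A hedged sketch of what a renderer for this structure could look like (render_dot and all attribute choices here are illustrative, not uap's actual code):

# Hypothetical GraphViz serializer for the dict built above.
def render_dot(graph):
    out = ['digraph pipeline {', '  node [style=filled];']
    for cluster_id, cluster in graph['clusters'].items():
        out.append('  subgraph cluster_%s {' % cluster_id)
        out.append('    label="%s";' % cluster['task_name'])
        for node_id in cluster['group']:
            out.append('    "%s";' % node_id)
        out.append('  }')
    for node_id, attrs in graph['nodes'].items():
        out.append('  "%s" [label="%s", fillcolor="%s"];'
                   % (node_id, attrs['label'], attrs['fillcolor']))
    for src, dst in graph['edges']:
        out.append('  "%s" -> "%s";' % (src, dst))
    out.append('}')
    return '\n'.join(out)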
Example 4
def create_hash_from_annotation(log):
        
    def pid_hash(pid, suffix=''):
        hashtag = "%s/%s/%d/%s" % (log['step']['name'], 
                                   log['run']['run_id'], 
                                   pid, suffix)
        return misc.str_to_sha1(hashtag)
        
    def file_hash(path):
        if path in log['step']['known_paths']:
            if 'real_path' in log['step']['known_paths'][path]:
                path = log['step']['known_paths'][path]['real_path']
        return misc.str_to_sha1(path)
        
        
    pipe_hash = dict()
    pipe_hash['nodes'] = dict()
    pipe_hash['edges'] = dict()
    pipe_hash['clusters'] = dict()
    pipe_hash['graph_labels'] = dict()
        
    def add_file_node(path):
        if path not in log['step']['known_paths']:
            return
                
        if 'real_path' in log['step']['known_paths'][path]:
            path = log['step']['known_paths'][path]['real_path']
        label = os.path.basename(path)
        color = '#ffffff'
        if log['step']['known_paths'][path]['type'] in ['fifo', 'directory']:
            color = '#c4f099'
        elif log['step']['known_paths'][path]['type'] == 'file':
            color = '#8ae234'
        elif log['step']['known_paths'][path]['type'] == 'step_file':
            color = '#97b7c8'
            label = log['step']['known_paths'][path]['label']
            if path in log['step']['known_paths']:
                if 'size' in log['step']['known_paths'][path]:
                    label += "\\n%s" % misc.bytes_to_str(
                        log['step']['known_paths'][path]['size'])
        pipe_hash['nodes'][misc.str_to_sha1(path)] = {
            'label': label,
            'fillcolor': color
        }
            
    for proc_info in log['pipeline_log']['processes']:
        pid = proc_info['pid']
        # Set name and label variable
        try:
            name = proc_info['name']
            label = "%s" % (proc_info['name'])
        except KeyError:
            name = '(unknown)'
            label = "PID %d" % pid
            
        try:
            # Add file nodes for every file in hints
            for path in proc_info['hints']['writes']:
                add_file_node(path)
                pipe_hash['edges'][(pid_hash(pid), file_hash(path))] = dict()
        except KeyError:
            pass

        try:
            # Add all the info for each process to pipe_hash
            stripped_args = []
            is_output_file = False
            for arg in proc_info['args']:
                # Try to investigate how fifos are integrated into the data
                # stream; this is where we decide whether a file is used for
                # input or output
                io_type = None
                if name == 'cat':
                    is_output_file = False
                elif name == 'dd' and arg.startswith('of='):
                    is_output_file = True
                elif name == 'dd' and arg.startswith('if='):
                    is_output_file = False
                elif name in ['mkdir', 'mkfifo']:
                    is_output_file = True
                for known_path in log['step']['known_paths'].keys():
                    # Check if arg contains a known path ...
                    if known_path in arg:
                        # ... if so add this file to the graph 
                        add_file_node(known_path)
                        # Is the process able to in-/output files?
                        if name in ['cat', 'dd', 'mkdir', 'mkfifo']:
                            if is_output_file:
                                io_type = 'output'
                            else:
                                io_type = 'input'
                        elif name == 'fix_cutadapt.py':
                            if arg == proc_info['args'][-2]:
                                io_type = 'input'
                            elif arg == proc_info['args'][-1]:
                                io_type = 'output'
                            elif proc_info['args'][
                                    proc_info['args'].index(arg) - 1] \
                                    == '--R2-in':
                                io_type = 'input'
                            elif proc_info['args'][
                                    proc_info['args'].index(arg) - 1] \
                                    == '--R2-out':
                                io_type = 'output'
                        else:
                            # we can't know whether the fifo is for input or
                            # output, first look at the hints, then use the
                            # designation (if any was given)
                            if 'reads' in proc_info['hints'] and \
                               arg in proc_info['hints']['reads']:
                                io_type = 'input'
                            if 'writes' in proc_info['hints'] and \
                               arg in proc_info['hints']['writes']:
                                io_type = 'output'
                            if io_type is None:
                                io_type = log['step']['known_paths'][known_path]\
                                            ['designation']
                                if io_type is None:
                                    io_type = 'input'

                                print('io_type: %s\nknown_path: %s' %
                                      (io_type, known_path))

                        if io_type == 'input':
                            # add edge from file to proc
                            pipe_hash['edges']\
                                [(file_hash(known_path),pid_hash(pid))] = dict()
                        elif io_type == 'output':
                            # add edge from proc to file
                            pipe_hash['edges']\
                                [(pid_hash(pid), file_hash(known_path))] = dict()

                        basename = os.path.basename(known_path)
                        arg = arg.replace(known_path, basename)
                    else:
                        if (len(arg) > 16) and re.match('^[A-Z]+$', arg):
                            arg = "%s[...]" % arg[:16]
                stripped_args.append(arg.replace('\t', '\\t').replace(
                    '\\', '\\\\'))
                
            tw = textwrap.TextWrapper(
                width=50,
                break_long_words=False,
                break_on_hyphens=False)
            label = "\\n".join(tw.wrap(' '.join(stripped_args)))

        # If any key wasn't around let's go on
        except KeyError:
            pass

        # add proc
        something_went_wrong = False
        if 'signal' in proc_info:
            something_went_wrong = True
        elif 'exit_code' in proc_info:
            if proc_info['exit_code'] != 0:
                something_went_wrong = True
        else:
            something_went_wrong = True
        color = "#fce94f"
        if something_went_wrong:
            if pid not in log['pipeline_log']['ok_to_fail']:
                color = "#d5291a"
            if 'signal' in proc_info:
                label = "%s\\n(received %s%s)" % (
                    label,
                    'friendly '
                    if pid in log['pipeline_log']['ok_to_fail'] else '',
                    proc_info['signal_name'] if 'signal_name' in proc_info
                    else 'signal %d' % proc_info['signal'])
            elif 'exit_code' in proc_info:
                if proc_info['exit_code'] != 0:
                    label = "%s\\n(failed with exit code %d)" % (
                        label, proc_info['exit_code'])
            else:
                label = "%s\\n(no exit code)" % label
                    
        if 'max' in log['pipeline_log']['process_watcher']:
            if pid in log['pipeline_log']['process_watcher']['max']:
                max_info = log['pipeline_log']['process_watcher']['max'][pid]
                label += "\\n%1.1f%% CPU, %s RAM (%1.1f%%)" % (
                    max_info['cpu_percent'],
                    misc.bytes_to_str(max_info['rss']),
                    max_info['memory_percent'])
                
        pipe_hash['nodes'][pid_hash(pid)] = {
            'label': label,
            'fillcolor': color,
            'start_time': proc_info['start_time']
        }
            
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                if ('exit_code' in proc_info[key]) and \
                   (proc_info[key]['exit_code'] == 0) and \
                   ('length' in proc_info[key]) and \
                   (proc_info[key]['length'] == 0) and \
                   ('sink_full_path' not in proc_info[key]):
                    # skip this stdout/stderr box if it leads to nothing
                    continue
                size_label = '(empty)'
                if ('length' in proc_info[key]) and \
                   (proc_info[key]['length'] > 0):
                    speed = float(proc_info[key]['length']) / (
                        proc_info[key]['end_time'] - 
                        proc_info[key]['start_time']).total_seconds()
                    speed_label = "%s/s" % misc.bytes_to_str(speed)
                    size_label = "%s / %s lines (%s)" % (
                        misc.bytes_to_str(proc_info[key]['length']),
                        "{:,}".format(proc_info[key]['lines']),
                        speed_label)
                label = "%s\\n%s" % (which, size_label)
                
                something_went_wrong = False
                if 'signal' in proc_info[key]:
                    something_went_wrong = True
                elif 'exit_code' in proc_info[key]:
                    if proc_info[key]['exit_code'] != 0:
                        something_went_wrong = True
                else:
                    something_went_wrong = True
                color = "#fdf3a7"
                if something_went_wrong:
                    if pid not in log['pipeline_log']['ok_to_fail']:
                        color = "#d5291a"
                    if 'signal' in proc_info[key]:
                        label = "%s\\n(received %s%s)" % (
                            label,
                            "friendly " if pid in
                            log['pipeline_log']['ok_to_fail'] else '',
                            proc_info[key]['signal_name']
                            if 'signal_name' in proc_info[key]
                            else 'signal %d' % proc_info[key]['signal'])
                    elif 'exit_code' in proc_info[key]:
                        if proc_info[key]['exit_code'] != 0:
                            label = "%s\\n(failed with exit code %d)" % (
                                label, proc_info[key]['exit_code'])
                    else:
                        label = "%s\\n(no exit code)" % label

                # add proc_which
                pipe_hash['nodes'][pid_hash(pid, which)] = {
                    'label': label,
                    'fillcolor': color
                }
                if 'sink_full_path' in proc_info[key]:
                    path = proc_info[key]['sink_full_path']
                    add_file_node(path)

    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        if 'use_stdin_of' in proc_info:
            other_pid = proc_info['use_stdin_of']
            pipe_hash['edges'][(pid_hash(other_pid, 'stdout'), pid_hash(pid))] \
                = dict()
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                other_pid = proc_info[key]['pid']
                pipe_hash['edges'][(pid_hash(pid), pid_hash(pid, which))] = dict()
                if 'sink_full_path' in proc_info[key]:
                    pipe_hash['edges'][(
                        pid_hash(pid, which),
                        file_hash(proc_info[key]['sink_full_path']))] = dict()

    # define nodes which go into subgraph
    step_file_nodes = dict()
    for path, path_info in log['step']['known_paths'].items():
        if path_info['type'] == 'step_file':
            step_file_nodes[file_hash(path)] = path_info['designation']

    task_name = "%s/%s" % (log['step']['name'], log['run']['run_id'])
    cluster_hash = misc.str_to_sha1(task_name)
    pipe_hash['clusters'][cluster_hash] = dict()
    pipe_hash['clusters'][cluster_hash]['task_name'] = task_name
    pipe_hash['clusters'][cluster_hash]['group'] = list()
    for node in pipe_hash['nodes'].keys():
        if node not in step_file_nodes:
            pipe_hash['clusters'][cluster_hash]['group'].append(node)
                
    start_time = log['start_time']
    end_time = log['end_time']
    duration = end_time - start_time

    text = "Task: %s\\lHost: %s\\lDuration: %s\\l" % (
        task_name, socket.gethostname(),
        misc.duration_to_str(duration, long=True)
    )
    pipe_hash['graph_labels'][task_name] = text
    if 'max' in log['pipeline_log']['process_watcher']:
        watcher_max = log['pipeline_log']['process_watcher']['max']
        text = "CPU: %1.1f%%, %d cores requested, RAM: %s (%1.1f%%)\\l" % (
            watcher_max['sum']['cpu_percent'],
            log['step']['cores'],
            misc.bytes_to_str(watcher_max['sum']['rss']),
            watcher_max['sum']['memory_percent'])
        pipe_hash['graph_labels'][task_name] += text
    if 'signal' in log:
        pipe_hash['graph_labels'][task_name] += "Caught signal: %s\\l" % (
            process_pool.ProcessPool.SIGNAL_NAMES[log['signal']])
    pipe_hash['graph_labels'][task_name] += "\\l"
    return pipe_hash
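
The io_type decision above mixes tool-specific rules (cat, dd, mkdir, mkfifo, fix_cutadapt.py) with generic hint and designation fallbacks. A hedged, standalone distillation of that fallback chain; classify_arg and its signature are illustrative, not part of uap:

# Hypothetical distillation of the input/output classification above.
def classify_arg(name, arg, hints, designation):
    """Guess whether `arg` is read or written by the process `name`."""
    if name == 'dd':
        return 'output' if arg.startswith('of=') else 'input'
    if name in ('mkdir', 'mkfifo'):
        return 'output'
    if name == 'cat':
        return 'input'
    # generic fallback: explicit hints first, then the path's designation
    if arg in hints.get('writes', []):
        return 'output'
    if arg in hints.get('reads', []):
        return 'input'
    return designation or 'input'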
Example 5
File: render.py Project: yigbt/uap
def create_hash_from_annotation(log):

    def pid_hash(pid, suffix=''):
        hashtag = "%s/%s/%d/%s" % (log['step']['name'],
                                   log['run']['run_id'],
                                   pid, suffix)
        return misc.str_to_sha256(hashtag)

    def file_hash(path):
        if path in log['step']['known_paths']:
            if 'real_path' in log['step']['known_paths'][path]:
                path = log['step']['known_paths'][path]['real_path']
        return misc.str_to_sha256(path)

    pipe_hash = dict()
    pipe_hash['nodes'] = dict()
    pipe_hash['edges'] = dict()
    pipe_hash['clusters'] = dict()
    pipe_hash['graph_labels'] = dict()

    def add_file_node(path):
        if path not in log['step']['known_paths']:
            return

        if 'real_path' in log['step']['known_paths'][path]:
            path = log['step']['known_paths'][path]['real_path']
        label = os.path.basename(path)
        color = '#ffffff'
        if log['step']['known_paths'][path]['type'] in ['fifo', 'directory']:
            color = '#c4f099'
        elif log['step']['known_paths'][path]['type'] == 'file':
            color = '#8ae234'
        elif log['step']['known_paths'][path]['type'] == 'step_file':
            color = '#97b7c8'
            label = log['step']['known_paths'][path]['label']
            if path in log['step']['known_paths']:
                if 'size' in log['step']['known_paths'][path]:
                    label += "\\nFilesize: %s" % misc.bytes_to_str(
                        log['step']['known_paths'][path]['size'])
        pipe_hash['nodes'][misc.str_to_sha256(path)] = {
            'label': label,
            'fillcolor': color
        }

    for proc_info in log['pipeline_log']['processes']:
        pid = proc_info['pid']
        # Set name and label variable
        try:
            name = proc_info['name']
            label = "%s" % (proc_info['name'])
        except KeyError:
            name = '(unknown)'
            label = "PID %d" % pid

        try:
            # Add file nodes for every file in hints
            for path in proc_info['hints']['writes']:
                add_file_node(path)
                pipe_hash['edges'][(pid_hash(pid), file_hash(path))] = dict()
        except KeyError:
            pass

        try:
            # Add all the info for each process to pipe_hash
            stripped_args = []
            is_output_file = False
            for arg in proc_info['args']:
                # Try to investigate how fifos are integrated into the data
                # stream; this is where we decide whether a file is used for
                # input or output
                io_type = None
                if name == 'cat':
                    is_output_file = False
                elif name == 'dd' and arg.startswith('of='):
                    is_output_file = True
                elif name == 'dd' and arg.startswith('if='):
                    is_output_file = False
                elif name in ['mkdir', 'mkfifo']:
                    is_output_file = True
                for known_path in log['step']['known_paths'].keys():
                    # Check if arg contains a known path ...
                    if known_path in arg:
                        # ... if so add this file to the graph
                        add_file_node(known_path)
                        # Is the process able to in-/output files?
                        if name in ['cat', 'dd', 'mkdir', 'mkfifo']:
                            if is_output_file:
                                io_type = 'output'
                            else:
                                io_type = 'input'
                        elif name == 'fix_cutadapt.py':
                            if arg == proc_info['args'][-2]:
                                io_type = 'input'
                            elif arg == proc_info['args'][-1]:
                                io_type = 'output'
                        elif proc_info['args'][
                                proc_info['args'].index(arg) - 1] \
                                == '--R2-in':
                            io_type = 'input'
                        elif proc_info['args'][
                                proc_info['args'].index(arg) - 1] \
                                == '--R2-out':
                            io_type = 'output'
                        else:
                            # we can't know whether the fifo is for input or
                            # output, first look at the hints, then use the
                            # designation (if any was given)
                            if 'reads' in proc_info['hints'] and \
                               arg in proc_info['hints']['reads']:
                                io_type = 'input'
                            if 'writes' in proc_info['hints'] and \
                               arg in proc_info['hints']['writes']:
                                io_type = 'output'
                            if io_type is None:
                                io_type = log['step']['known_paths'][known_path]['designation']
                                if io_type is None:
                                    io_type = 'input'

                                msg = 'io_type: %s\nknown_path: %s'
                                print(msg % (io_type, known_path))

                        if io_type == 'input':
                            # add edge from file to proc
                            key = (file_hash(known_path), pid_hash(pid))
                            pipe_hash['edges'][key] = dict()
                        elif io_type == 'output':
                            # add edge from proc to file
                            key = (pid_hash(pid), file_hash(known_path))
                            pipe_hash['edges'][key] = dict()

                        basename = os.path.basename(known_path)
                        arg = arg.replace(known_path, basename)
                    else:
                        if (len(arg) > 16) and re.match('^[A-Z]+$', arg):
                            arg = "%s[...]" % arg[:16]
                stripped_args.append(arg.replace('\t', '\\t').replace(
                    '\\', '\\\\'))

            tw = textwrap.TextWrapper(
                width=50,
                break_long_words=False,
                break_on_hyphens=False)
            label = "%s" % ("\\n".join(tw.wrap(' '.join(stripped_args))))

            start_time = proc_info['start_time']
            end_time = proc_info['end_time']
            duration = end_time - start_time
            runtime = misc.duration_to_str(duration, long=True)
            label += '\\nRuntime: %s' % runtime

        # If any key wasn't around let's go on
        except KeyError:
            pass

        # add proc
        something_went_wrong = False
        if 'signal' in proc_info:
            something_went_wrong = True
        elif 'exit_code' in proc_info:
            if proc_info['exit_code'] != 0:
                something_went_wrong = True
        else:
            # TODO: in this case False? so pass?
            something_went_wrong = True
        color = "#fce94f"
        if something_went_wrong:
            if pid not in log['pipeline_log']['ok_to_fail']:
                color = "#d5291a"
            if 'signal' in proc_info:
                label = "%s\\n(received %s%s)" % (
                    label,
                    'friendly '
                    if pid in log['pipeline_log']['ok_to_fail']
                    else '',
                    proc_info['signal_name']
                    if 'signal_name' in proc_info
                    else 'signal %d' % proc_info['signal'])
            elif 'exit_code' in proc_info:
                if proc_info['exit_code'] != 0:
                    label = "%s\\n(failed with exit code %d)" % (
                        label, proc_info['exit_code'])
            else:
                label = "%s\\n(no exit code)" % label

        if 'max' in log['pipeline_log']['process_watcher']:
            if pid in log['pipeline_log']['process_watcher']['max']:
                label += "\\n%1.1f%% CPU, %s RAM (%1.1f%%)" % (
                    log['pipeline_log']['process_watcher']['max'][pid]['cpu_percent'],
                    misc.bytes_to_str(
                        log['pipeline_log']['process_watcher']['max'][pid]['rss']
                    ),
                    log['pipeline_log']['process_watcher']['max'][pid]['memory_percent']
                )

        pipe_hash['nodes'][pid_hash(pid)] = {
            'label': label,
            'fillcolor': color,
            'start_time': proc_info['start_time']
        }

        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                if ('exit_code' in proc_info[key]) and \
                   (proc_info[key]['exit_code'] == 0) and \
                   ('length' in proc_info[key]) and \
                   (proc_info[key]['length'] == 0) and \
                   ('sink_full_path' not in proc_info[key]):
                    # skip this stdout/stderr box if it leads to nothing
                    continue
                size_label = '(empty)'
                if ('length' in proc_info[key]) and \
                   (proc_info[key]['length'] > 0):
                    speed = float(proc_info[key]['length']) / (
                        proc_info[key]['end_time'] -
                        proc_info[key]['start_time']).total_seconds()
                    speed_label = "%s/s" % misc.bytes_to_str(speed)
                    size_label = "%s / %s lines (%s)" % (
                        misc.bytes_to_str(proc_info[key]['length']),
                        "{:,}".format(proc_info[key]['lines']),
                        speed_label)
                label = "%s\\n%s" % (which, size_label)

                something_went_wrong = False
                if 'signal' in proc_info[key]:
                    something_went_wrong = True
                elif 'exit_code' in proc_info[key]:
                    if proc_info[key]['exit_code'] != 0:
                        something_went_wrong = True
                else:
                    something_went_wrong = True
                color = "#fdf3a7"
                if something_went_wrong:
                    if pid not in log['pipeline_log']['ok_to_fail']:
                        color = "#d5291a"
                    if 'signal' in proc_info[key]:
                        label = "%s\\n(received %s%s)" % (
                            label,
                            "friendly "
                            if pid in log['pipeline_log']['ok_to_fail']
                            else '',
                            proc_info[key]['signal_name']
                            if 'signal_name' in proc_info[key]
                            else 'signal %d' %
                            proc_info[key]['signal'])
                    elif 'exit_code' in proc_info[key]:
                        if proc_info[key]['exit_code'] != 0:
                            label = "%s\\n(failed with exit code %d)" % (
                                label, proc_info[key]['exit_code'])
                    else:
                        label = "%s\\n(no exit code)" % label

                # add a graph node for this process's stdout/stderr copy
                pipe_hash['nodes'][pid_hash(pid, which)] = {
                    'label': label,
                    'fillcolor': color
                }
                if 'sink_full_path' in proc_info[key]:
                    path = proc_info[key]['sink_full_path']
                    add_file_node(path)

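    # second pass: wire graph edges between processes, their stdout/stderr
    # copies, and any file sinks they write to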
    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        if 'use_stdin_of' in proc_info:
            other_pid = proc_info['use_stdin_of']
            key_value = (pid_hash(other_pid, 'stdout'), pid_hash(pid))
            pipe_hash['edges'][key_value] = dict()
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                other_pid = proc_info[key]['pid']
                key_value = (pid_hash(pid), pid_hash(pid, which))
                pipe_hash['edges'][key_value] = dict()
                if 'sink_full_path' in proc_info[key]:
                    pipe_hash['edges'][(
                        pid_hash(pid, which),
                        file_hash(proc_info[key]['sink_full_path']))] = dict()

    # define nodes which go into subgraph
    step_file_nodes = dict()
    for path, path_info in log['step']['known_paths'].items():
        if path_info['type'] == 'step_file':
            step_file_nodes[file_hash(path)] = path_info['designation']

    task_name = "%s/%s" % (log['step']['name'], log['run']['run_id'])
    cluster_hash = misc.str_to_sha256(task_name)
    pipe_hash['clusters'][cluster_hash] = dict()
    pipe_hash['clusters'][cluster_hash]['task_name'] = task_name
    pipe_hash['clusters'][cluster_hash]['group'] = list()
    for node in pipe_hash['nodes'].keys():
        if node not in step_file_nodes:
            pipe_hash['clusters'][cluster_hash]['group'].append(node)

    start_time = log['start_time']
    end_time = log['end_time']
    duration = end_time - start_time

    text = "Task: %s\\lHost: %s\\lDuration: %s\\l" % (
        task_name, socket.gethostname(),
        misc.duration_to_str(duration, long=True)
    )
    pipe_hash['graph_labels'][task_name] = text
    if 'max' in log['pipeline_log']['process_watcher']:
        text = "CPU: %1.1f%%, %d CORES_Requested , RAM: %s (%1.1f%%)\\l" % (
            log['pipeline_log']['process_watcher']['max']['sum']['cpu_percent'],
            log['step']['cores'],
            misc.bytes_to_str(
                log['pipeline_log']['process_watcher']['max']['sum']['rss']
            ),
            log['pipeline_log']['process_watcher']['max']['sum']['memory_percent'])
        pipe_hash['graph_labels'][task_name] += text
    if 'signal' in log:
        pipe_hash['graph_labels'][task_name] += "Caught signal: %s\\l" % (
            process_pool.ProcessPool.SIGNAL_NAMES[log['signal']])
    pipe_hash['graph_labels'][task_name] += "\\l"
    return pipe_hash
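
For orientation, a minimal sketch of the pipe_hash structure assembled above, assuming one process node and its stdout copy; the node hashes, task name, and all values are invented placeholders:

# Hypothetical illustration of the returned pipe_hash; hash keys and
# example values are made up, and only fields set by the code are shown.
pipe_hash = {
    'nodes': {
        '<pid_hash(pid)>': {
            'label': 'sort\\n(failed with exit code 1)',
            'fillcolor': '#d5291a',
            'start_time': '2021-01-01 12:00:00',
        },
    },
    'edges': {
        # (source node hash, target node hash) -> edge attributes
        ('<pid_hash(pid)>', '<pid_hash(pid, "stdout")>'): {},
    },
    'clusters': {
        '<misc.str_to_sha256(task_name)>': {
            'task_name': 'my_step/run-1',
            'group': ['<pid_hash(pid)>', '<pid_hash(pid, "stdout")>'],
        },
    },
    'graph_labels': {
        'my_step/run-1': 'Task: my_step/run-1\\lHost: node01\\l',
    },
}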
Example no. 6
    def run(self, run_id):
        '''
        Create a temporary output directory and execute a run. After the run
        has finished, it is checked that all output files are in place and
        the output files are moved to the final output location. Finally,
        YAML annotations are written.
        '''

        # this is the run we'll execute now
        run = self.get_run(run_id)

        # create the output directory if it doesn't exist yet
        if not os.path.isdir(run.get_output_directory()):
            os.makedirs(run.get_output_directory())

        # now write the run ping file
        executing_ping_path = run.get_executing_ping_file()

        if os.path.exists(executing_ping_path):
            raise UAPError("%s/%s seems to be already running, exiting..." %
                           (self, run_id))
        queued_ping_path = run.get_queued_ping_file()
        try:
            with open(queued_ping_path, 'r') as buff:
                info = yaml.load(buff, Loader=yaml.FullLoader)
            job_id = info['cluster job id']
        except (IOError, KeyError):
            job_id = None

        # create a temporary directory for the output files
        temp_directory = run.get_temp_output_directory()
        os.makedirs(temp_directory)

        # prepare known_paths
        known_paths = dict()
        for tag, tag_info in run.get_output_files_abspath().items():
            for output_path, input_paths in tag_info.items():
                # add the real output path
                if output_path is not None and input_paths is not None:
                    known_paths[output_path] = {
                        'designation': 'output',
                        'label': os.path.basename(output_path),
                        'type': 'step_file'
                    }
                    # ...and also add the temporary output path
                    known_paths[os.path.join(
                        temp_directory, os.path.basename(output_path))] = {
                            'designation': 'output',
                            'label':
                            "%s\\n(%s)" % (os.path.basename(output_path), tag),
                            'type': 'step_file',
                            'real_path': output_path
                        }
                    for input_path in input_paths:
                        if input_path is not None:
                            known_paths[input_path] = {
                                'designation': 'input',
                                'label': os.path.basename(input_path),
                                'type': 'step_file'
                            }

        # now write the run ping file
        executing_ping_info = dict()
        executing_ping_info['start_time'] = datetime.now()
        executing_ping_info['host'] = socket.gethostname()
        executing_ping_info['pid'] = os.getpid()
        executing_ping_info['user'] = pwd.getpwuid(os.getuid())[0]
        executing_ping_info['temp_directory'] = run.get_temp_output_directory()
        if job_id:
            executing_ping_info['cluster job id'] = job_id

        with open(executing_ping_path, 'w') as f:
            f.write(yaml.dump(executing_ping_info, default_flow_style=False))

        executing_ping_pid = os.fork()
        if executing_ping_pid == 0:
            # this is the child process
            try:
                signal.signal(signal.SIGTERM, signal.SIG_DFL)
                signal.signal(signal.SIGINT, signal.SIG_IGN)
                while True:
                    time.sleep(AbstractStep.PING_RENEW)
                    # if the executing ping file is gone and the touching
                    # operation fails, then SO BE IT!
                    os.utime(executing_ping_path, None)
            finally:
                os._exit(0)

        def kill_exec_ping():
            try:
                os.kill(executing_ping_pid, signal.SIGTERM)
                os.waitpid(executing_ping_pid, 0)
            except OSError:
                # if the ping process was already killed, it's gone anyway
                pass
            self.remove_ping_file(executing_ping_path)

        p = self.get_pipeline()

        def ping_on_term(signum, frame):
            logger.warning('Received SIGTERM, moving execution ping file...')
            kill_exec_ping()
            self.remove_ping_file(queued_ping_path, bad_copy=True)
            p.caught_signal = signum
            process_pool.ProcessPool.kill()
            raise UAPError('Received TERM signal (canceled job).')

        def ping_on_int(signum, frame):
            logger.warning('Received SIGINT, moving execution ping file...')
            kill_exec_ping()
            self.remove_ping_file(queued_ping_path, bad_copy=True)
            p.caught_signal = signum
            process_pool.ProcessPool.kill()
            raise UAPError('Received INT signal (keyboard interrupt).')

        original_term_handler = signal.signal(signal.SIGTERM, ping_on_term)
        original_int_handler = signal.signal(signal.SIGINT, ping_on_int)

        self.start_time = datetime.now()
        message = "[START] starting %s/%s on %s" % \
            (self, run_id, socket.gethostname())
        if job_id:
            message += " with job id %s" % job_id
        p.notify(message)
        caught_exception = None
        self._state = AbstractStep.states.EXECUTING
        base_working_dir = os.getcwd()
        os.chdir(run.get_temp_output_directory())
        try:
            self.execute(run_id, run)
        except BaseException:
            # Oh my. We have a situation. This is awkward. Tell the process
            # pool to wrap up. This way, we can try to get process stats before
            # shutting everything down.
            process_pool.ProcessPool.kill()
            # Store the exception, re-raise it later
            caught_exception = sys.exc_info()
            error = ''.join(
                traceback.format_exception(*caught_exception)[-2:]).strip()
            logger.debug(error)
        finally:
            signal.signal(signal.SIGTERM, original_term_handler)
            signal.signal(signal.SIGINT, original_int_handler)
            self._state = AbstractStep.states.DEFAULT  # changes relative paths
            os.chdir(base_working_dir)

        self.end_time = datetime.now()
        # the step has completed; invalidate the FS cache because things
        # have changed by now...
        run.reset_fsc()

        to_be_moved = dict()
        if not p.caught_signal and not caught_exception:
            # if we're here, we can assume the step has finished successfully
            # now log file stats

            try:
                for tag in run.get_output_files().keys():
                    for out_file in run.get_output_files()[tag].keys():
                        # don't try to rename files if they were not meant to exist
                        # in our temporary directory
                        # 1. out_file should not be None (empty output connection)
                        # 2. out_file should not contain a '/' (file belongs to a
                        #    source step)
                        if out_file is None or '/' in out_file:
                            continue
                        source_path = os.path.join(
                            run.get_temp_output_directory(),
                            os.path.basename(out_file))
                        new_path = os.path.join(run.get_output_directory(),
                                                os.path.basename(out_file))
                        # first, delete a possibly existing volatile placeholder
                        # file
                        path_volatile = new_path + AbstractStep.VOLATILE_SUFFIX
                        if os.path.exists(path_volatile):
                            logger.info("Now deleting: %s" % path_volatile)
                            os.unlink(path_volatile)
                        if os.path.exists(source_path):
                            known_paths.pop(source_path, None)
                            known_paths.setdefault(new_path, dict())
                            if known_paths[new_path][
                                    'designation'] == 'output':
                                to_be_moved[source_path] = new_path
                                size = run.fsc.getsize(source_path)
                                mtime = datetime.fromtimestamp(
                                    run.fsc.getmtime(source_path))
                                known_paths[new_path]['size'] = size
                                known_paths[new_path][
                                    'modification time'] = mtime
                            if known_paths[new_path]['type'] != 'step_file':
                                logger.debug(
                                    "Set %s 'type' info to 'step_file'" %
                                    new_path)
                                known_paths[new_path]['type'] = 'step_file'
                        else:
                            raise UAPError(
                                'The step failed to produce an '
                                'announced output file: "%s".\n'
                                'Source file doesn\'t exist: "%s"' %
                                (out_file, source_path))
            except BaseException:
                caught_exception = sys.exc_info()

        pool = None

        class SignalError(Exception):
            def __init__(self, signum):
                self.signum = signum
                m = "Recived signal %s during hashing!" % \
                    process_pool.ProcessPool.SIGNAL_NAMES[signum]
                super(SignalError, self).__init__(m)

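        # during hashing, TERM/INT are turned into SignalError (via stop()
        # below) so the worker pool can be shut down cleanly before re-raising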
        if caught_exception is None and to_be_moved:
            p.notify("[INFO] %s/%s hashing %d output file(s)." %
                     (str(self), run_id, len(to_be_moved)))
            # show a progress bar only on interactive shells when INFO
            # messages are suppressed (logging.INFO == 20)
            show_progress = (p.has_interactive_shell()
                             and logger.getEffectiveLevel() > 20)
            try:

                def stop(signum, frame):
                    raise SignalError(signum)

                original_term_handler = signal.signal(signal.SIGTERM, stop)
                original_int_handler = signal.signal(signal.SIGINT, stop)
                pool = multiprocessing.Pool(self.get_cores())
                total = len(to_be_moved)
                file_iter = pool.imap(misc.sha_and_file, to_be_moved.keys())
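                # misc.sha_and_file is expected to yield (hashsum, path)
                # tuples, which the loop below maps back onto known_paths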
                file_iter = tqdm(
                    file_iter,
                    total=total,
                    leave=False,
                    bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}',
                    disable=not show_progress,
                    desc='files')
                for i, (hashsum, path) in enumerate(file_iter):
                    run.fsc.sha256sum_of(to_be_moved[path], value=hashsum)
                    known_paths[to_be_moved[path]]['sha256'] = hashsum
                    if not show_progress:
                        logger.info("sha256 [%d/%d] %s %s" %
                                    (i + 1, total, hashsum, path))
            except BaseException:
                caught_exception = sys.exc_info()
                try:
                    # removing the progress bar
                    file_iter.close()
                except BaseException:
                    pass
                error = caught_exception[1]
                if caught_exception[0] is SignalError:
                    p.caught_signal = error.signum
                logger.error(error)
                if pool:
                    pool.terminate()
            else:
                pool.close()
            signal.signal(signal.SIGTERM, original_term_handler)
            signal.signal(signal.SIGINT, original_int_handler)

        run.add_known_paths(known_paths)
        if not p.caught_signal and not caught_exception:
            try:
                for source_path, new_path in to_be_moved.items():
                    logger.debug("Moving %s to %s." % (source_path, new_path))
                    os.rename(source_path, new_path)
            except BaseException:
                caught_exception = sys.exc_info()

        error = None
        if p.caught_signal is not None:
            signum = p.caught_signal
            signame = process_pool.ProcessPool.SIGNAL_NAMES[signum]
            error = 'Pipeline stopped because it caught signal %d - %s' % \
                    (signum, signame)
        elif caught_exception is not None:
            error = ''.join(
                traceback.format_exception(*caught_exception)[-2:]).strip()
        annotation_path = run.write_annotation_file(run.get_output_directory(),
                                                    error=error,
                                                    job_id=job_id)

        kill_exec_ping()
        self._state = AbstractStep.states.DEFAULT

        if error:
            message = "[BAD] %s/%s failed on %s after %s\n" % \
                      (str(self), run_id, socket.gethostname(),
                       misc.duration_to_str(self.end_time - self.start_time))
            message += "Here are the details: " + annotation_path + '\n'
            attachment = None
            if os.path.exists(annotation_path + '.png'):
                attachment = dict()
                attachment['name'] = 'details.png'
                attachment['data'] = open(annotation_path + '.png', 'rb').read()
            p.notify(message, attachment)
            self.remove_ping_file(queued_ping_path, bad_copy=True)
            if caught_exception is not None:
                raise caught_exception[1].with_traceback(caught_exception[2])

        else:
            # finally, remove the temporary directory if it's empty
            try:
                os.rmdir(temp_directory)
            except OSError as e:
                logger.info('Could not remove temp dir "%s": %s' %
                            (temp_directory, e))
            temp = os.path.normpath(os.path.join(temp_directory, '..'))
            try:
                os.rmdir(temp)
            except OSError:
                # there may still be tasks in process
                pass

            remaining_task_info = self.get_run_info_str()

            message = "[OK] %s/%s successfully finished on %s after %s\n" % \
                      (str(self), run_id, socket.gethostname(),
                       misc.duration_to_str(self.end_time - self.start_time))
            message += str(self) + ': ' + remaining_task_info + "\n"
            attachment = None
            if os.path.exists(annotation_path + '.png'):
                attachment = dict()
                attachment['name'] = 'details.png'
                attachment['data'] = open(annotation_path + '.png', 'rb').read()
            p.notify(message, attachment)
            self.remove_ping_file(queued_ping_path)

            self._reset()

        if pool is not None:
            pool.join()
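
For illustration, the executing ping file written near the top of run() is a small YAML document; a minimal sketch of its payload, with field names taken from the code above and every value invented:

# Minimal sketch of the executing ping file payload; values are invented.
import datetime
import yaml

executing_ping_info = {
    'start_time': datetime.datetime(2021, 1, 1, 12, 0, 0),
    'host': 'node01',                  # socket.gethostname()
    'pid': 12345,                      # os.getpid()
    'user': 'alice',                   # pwd.getpwuid(os.getuid())[0]
    'temp_directory': '/scratch/my_step/run-1.tmp',
    'cluster job id': '987654',        # only present for cluster jobs
}
print(yaml.dump(executing_ping_info, default_flow_style=False))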
Example no. 7
    def check_ping_files(self,
                         print_more_warnings=False,
                         print_details=False,
                         fix_problems=False):
        run_problems = list()
        queue_problems = list()
        check_queue = True

        try:
            stat_output = subprocess.check_output(
                [self.get_cluster_command('stat')],
                stderr=subprocess.STDOUT).decode('utf-8')
        except (KeyError, OSError, subprocess.CalledProcessError):
            # we don't have a stat tool here, if subprocess.CalledProcessError
            # is raised
            check_queue = False

        if print_more_warnings and not check_queue:
            try:
                ce = self.get_cluster_command('stat')
            except KeyError:
                ce = "a cluster engine"
            print(
                "Attention, we cannot check stale queued ping files because "
                "this host does not have %s." % ce)

        running_jids = set()

        if check_queue:
            for line in stat_output.split("\n"):
                try:
                    jid = int(line.strip().split(' ')[0])
                    running_jids.add(str(jid))
                except ValueError:
                    # this is not a JID
                    pass

        now = datetime.datetime.now()
        for task in self.all_tasks_topologically_sorted:
            exec_ping_file = task.get_run().get_executing_ping_file()
            queued_ping_file = task.get_run().get_queued_ping_file()
            if os.path.exists(exec_ping_file):
                info = yaml.load(open(exec_ping_file, 'r'),
                                 Loader=yaml.FullLoader)
                start_time = info['start_time']
                last_activity = datetime.datetime.fromtimestamp(
                    abstract_step.AbstractStep.fsc.getmtime(exec_ping_file))
                last_activity_difference = now - last_activity
                if last_activity_difference.total_seconds() > \
                   abstract_step.AbstractStep.PING_TIMEOUT:
                    run_problems.append(
                        (task, exec_ping_file, last_activity_difference,
                         last_activity - start_time))

            if os.path.exists(queued_ping_file) and check_queue:
                info = yaml.load(open(queued_ping_file, 'r'),
                                 Loader=yaml.FullLoader)
                if not str(info['job_id']) in running_jids:
                    queue_problems.append(
                        (task, queued_ping_file, info['submit_time']))

        show_hint = False

        if len(run_problems) > 0:
            show_hint = True
            label = "Warning: There are %d stale run ping files." % len(
                run_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                run_problems = sorted(run_problems,
                                      key=itemgetter(2, 3),
                                      reverse=True)
                for problem in run_problems:
                    task = problem[0]
                    path = problem[1]
                    last_activity_difference = problem[2]
                    ran_for = problem[3]
                    print("dead since %13s, ran for %13s: %s" %
                          (misc.duration_to_str(last_activity_difference),
                           misc.duration_to_str(ran_for), task))
                print("")

        if len(queue_problems) > 0:
            show_hint = True
            label = "Warning: There are %d tasks marked as queued, but they do not seem to be queued." % len(
                queue_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                queue_problems = sorted(queue_problems,
                                        key=itemgetter(2),
                                        reverse=True)
                for problem in queue_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    print("submitted at %13s: %s" % (start_time, task))
                print("")

        if fix_problems:
            all_problems = run_problems
            all_problems.extend(queue_problems)
            for problem in all_problems:
                path = problem[1]
                print("Now deleting %s..." % path)
                os.unlink(path)

        if show_hint:
            if (print_more_warnings and not print_details) or not fix_problems:
                print(
                    "Hint: Run 'uap %s fix-problems --details' to see the "
                    "details." % self.get_config_filepath())
            if not fix_problems:
                print(
                    "Hint: Run 'uap %s fix-problems --srsly' to fix these "
                    "problems (that is, delete all problematic ping files)." %
                    self.get_config_filepath())
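
The queue check above assumes the first whitespace-delimited token of each line of the cluster stat output is a numeric job ID. A standalone sketch of that parsing step, with invented sample output:

# Standalone sketch of the job-ID extraction used in check_ping_files;
# the stat output below is invented sample data.
sample_stat_output = ("JOBID NAME  STATE\n"
                      "1001 map    RUNNING\n"
                      "1002 sort   PENDING\n")

running_jids = set()
for line in sample_stat_output.split("\n"):
    try:
        jid = int(line.strip().split(' ')[0])
        running_jids.add(str(jid))
    except ValueError:
        # header, separator, or empty lines are not job IDs
        pass

print(running_jids)  # {'1001', '1002'} (set order may vary)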
Example no. 8
    def check_ping_files(self,
                         print_more_warnings=False,
                         print_details=False,
                         fix_problems=False):
        run_problems = list()
        queue_problems = list()
        bad_problems = list()
        check_queue = True

        try:
            stat_output = subprocess.check_output(
                [self.get_cluster_command('stat')],
                stderr=subprocess.STDOUT).decode('utf-8')
        except (KeyError, OSError, subprocess.CalledProcessError):
            # we don't have a stat tool here, if subprocess.CalledProcessError
            # is raised
            check_queue = False

        if print_more_warnings and not check_queue:
            try:
                ce = self.get_cluster_command('stat')
            except KeyError:
                ce = "a cluster engine"
            print("Attention, we cannot check stale queued ping files because "
                  "this host does not have %s." % ce)

        running_jids = set()

        if check_queue:
            for line in stat_output.split("\n"):
                if 'COMPLETING' in line:
                    # this is slurm-specific: skip jobs stuck in COMPLETING
                    continue
                try:
                    jid = int(line.strip().split(' ')[0].split('_')[0])
                    running_jids.add(str(jid))
                except ValueError:
                    # this is not a JID
                    pass

        for task in self.all_tasks_topologically_sorted:
            queued_ping_file = task.get_run().get_queued_ping_file()
            bad_queued_ping_file = queued_ping_file + '.bad'
            exec_ping_file = task.get_run().get_executing_ping_file()
            stale = task.get_run().is_stale()
            if stale:
                try:
                    info = yaml.load(open(exec_ping_file, 'r'),
                                     Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(exec_ping_file):
                        raise e
                else:
                    start_time = info['start_time']
                    last_activity = datetime.datetime.fromtimestamp(
                        task.get_run().fsc.getmtime(exec_ping_file))
                    run_problems.append((task, exec_ping_file, stale,
                                         last_activity - start_time))
            if check_queue:
                try:
                    info = yaml.load(open(queued_ping_file, 'r'),
                                     Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(queued_ping_file):
                        raise e
                else:
                    if not str(info['cluster job id']) in running_jids:
                        queue_problems.append(
                            (task, queued_ping_file, info['submit_time'],
                             info['cluster job id']))
            try:
                info = yaml.load(open(bad_queued_ping_file, 'r'),
                                 Loader=yaml.FullLoader)
            except IOError as e:
                if os.path.exists(bad_queued_ping_file):
                    raise e
            else:
                bad_problems.append(
                    (task, bad_queued_ping_file, info['submit_time'],
                     info['cluster job id']))

        show_hint = False

        if len(run_problems) > 0:
            show_hint = True
            label = "Warning: There are %d stale run ping files." % len(
                run_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                run_problems = sorted(run_problems,
                                      key=itemgetter(2, 3),
                                      reverse=True)
                for problem in run_problems:
                    task = problem[0]
                    path = problem[1]
                    last_activity_difference = problem[2]
                    ran_for = problem[3]
                    print("dead since %13s, ran for %13s: %s" %
                          (misc.duration_to_str(last_activity_difference),
                           misc.duration_to_str(ran_for), task))
                print("")

        if len(queue_problems) > 0:
            show_hint = True
            label = "Warning: There are %d tasks marked as queued, but they "\
                    "do not seem to be queued." % len(queue_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                queue_problems = sorted(queue_problems,
                                        key=itemgetter(2),
                                        reverse=True)
                for problem in queue_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    job_id = problem[3]
                    print("submitted job %s at %13s: %s" %
                          (job_id, start_time, task))
                print("")

        if len(bad_problems) > 0:
            label = "Info: Found %d queue files of failed tasks." % len(
                bad_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                bad_problems = sorted(bad_problems,
                                      key=itemgetter(2),
                                      reverse=True)
                for problem in bad_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    job_id = problem[3]
                    print("submitted job %s at %13s: %s" %
                          (job_id, start_time, task))
                print("")

        if fix_problems:
            all_problems = run_problems
            all_problems.extend(queue_problems)
            all_problems.extend(bad_problems)
            for problem in all_problems:
                path = problem[1]
                print("Now deleting %s..." % path)
                os.unlink(path)

        if show_hint:
            if (print_more_warnings and not print_details) or not fix_problems:
                print("Hint: Run 'uap %s fix-problems --details' to see the "
                      "details." % self.args.config.name)
            if print_more_warnings and not fix_problems:
                print("Hint: Run 'uap %s fix-problems --first-error' to "
                      "investigate what happended." % self.args.config.name)
            if not fix_problems:
                print(
                    "Hint: Run 'uap %s fix-problems --srsly' to fix these "
                    "problems (that is, delete all problematic ping files)." %
                    self.args.config.name)
        else:
            print('No problematic ping files were found.')
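
A hedged usage sketch: assuming pipeline is an initialized object that provides the method above, one call reports the problems and a second call deletes the offending ping files.

# Hypothetical usage; pipeline stands in for an initialized object
# exposing check_ping_files() as defined above.
pipeline.check_ping_files(print_more_warnings=True,
                          print_details=True,
                          fix_problems=False)  # report only

# once the report looks right, delete the problematic ping files
pipeline.check_ping_files(fix_problems=True)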