def _get_all_pages(self):
    got = 0
    last_id = 0
    filters = [
        ['object_uuid', '=', self.job_uuid],
        ['event_type', 'in', self.event_types]]
    try:
        while True:
            page = arvados.api().logs().index(
                limit=1000,
                order=['id asc'],
                filters=filters + [['id', '>', str(last_id)]],
                select=['id', 'properties'],
            ).execute(num_retries=2)
            got += len(page['items'])
            logger.debug(
                '%s: received %d of %d log events',
                self.job_uuid, got,
                got + page['items_available'] - len(page['items']))
            for i in page['items']:
                for line in i['properties']['text'].split('\n'):
                    self._queue.put(line + '\n')
                last_id = i['id']
            if (len(page['items']) == 0 or
                    len(page['items']) >= page['items_available']):
                break
    finally:
        self._queue.put(self.EOF)
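# The loop above pages through the logs API with an id cursor rather than
# an offset: each request asks only for rows whose id is greater than the
# last id already seen, so paging stays consistent even while new log rows
# arrive. A minimal standalone sketch of the same pattern, where
# fetch_page is a hypothetical stand-in for the paged API call:
def paginate_by_id(fetch_page):
    last_id = 0
    while True:
        page = fetch_page(filters=[['id', '>', str(last_id)]],
                          order=['id asc'], limit=1000)
        for item in page['items']:
            yield item
            last_id = item['id']
        if (len(page['items']) == 0 or
                len(page['items']) >= page['items_available']):
            break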
def __init__(self, logdata, label=None, skip_child_jobs=False):
    self._logdata = logdata
    self.label = label
    self.starttime = None
    self.finishtime = None
    self._skip_child_jobs = skip_child_jobs

    # stats_max: {category: {stat: val}}
    self.stats_max = collections.defaultdict(
        functools.partial(collections.defaultdict, lambda: 0))
    # task_stats: {task_id: {category: {stat: val}}}
    self.task_stats = collections.defaultdict(
        functools.partial(collections.defaultdict, dict))

    self.seq_to_uuid = {}
    self.tasks = collections.defaultdict(Task)

    # We won't bother recommending new runtime constraints if the
    # constraints given when running the job are known to us and
    # are already suitable.  If applicable, the subclass
    # constructor will overwrite this with something useful.
    self.existing_constraints = {}

    logger.debug("%s: logdata %s", self.label, logdata)
def _get_all_pages(self):
    got = 0
    last_id = 0
    filters = [
        ['object_uuid', '=', self.job_uuid],
        ['event_type', '=', 'stderr']]
    try:
        while True:
            page = arvados.api().logs().index(
                limit=1000,
                order=['id asc'],
                filters=filters + [['id', '>', str(last_id)]],
                select=['id', 'properties'],
            ).execute(num_retries=2)
            got += len(page['items'])
            logger.debug(
                '%s: received %d of %d log events',
                self.job_uuid, got,
                got + page['items_available'] - len(page['items']))
            for i in page['items']:
                for line in i['properties']['text'].split('\n'):
                    self._queue.put(line + '\n')
                last_id = i['id']
            if (len(page['items']) == 0 or
                    len(page['items']) >= page['items_available']):
                break
    finally:
        self._queue.put(self.EOF)
def __init__(self, logdata, label=None, skip_child_jobs=False,
             uuid=None, **kwargs):
    self._logdata = logdata
    self.uuid = uuid
    self.label = label
    self.starttime = None
    self.finishtime = None
    self._skip_child_jobs = skip_child_jobs

    # stats_max: {category: {stat: val}}
    self.stats_max = collections.defaultdict(
        functools.partial(collections.defaultdict, lambda: 0))
    # task_stats: {task_id: {category: {stat: val}}}
    self.task_stats = collections.defaultdict(
        functools.partial(collections.defaultdict, dict))

    self.seq_to_uuid = {}
    self.tasks = collections.defaultdict(Task)

    # We won't bother recommending new runtime constraints if the
    # constraints given when running the job are known to us and
    # are already suitable.  If applicable, the subclass
    # constructor will overwrite this with something useful.
    self.existing_constraints = {}

    logger.debug("%s: logdata %s", self.label, logdata)
def __init__(self, collection_id):
    logger.debug('load collection %s', collection_id)
    collection = arvados.collection.CollectionReader(collection_id)
    filenames = [filename for filename in collection]
    if len(filenames) != 1:
        raise ValueError(
            "collection {} has {} files; need exactly one".format(
                collection_id, len(filenames)))
    self._reader = collection.open(filenames[0])
    self._label = "{}/{}".format(collection_id, filenames[0])
def __iter__(self):
    logger.debug('load collection %s', self._collection_id)
    collection = arvados.collection.CollectionReader(self._collection_id)
    filenames = [filename for filename in collection]
    if len(filenames) == 1:
        filename = filenames[0]
    else:
        filename = 'crunchstat.txt'
    self._label = "{}/{}".format(self._collection_id, filename)
    self._reader = collection.open(filename)
    return iter(self._reader)
def __iter__(self):
    logger.debug('load collection %s', self._collection_id)
    collection = arvados.collection.CollectionReader(self._collection_id)
    filenames = [filename for filename in collection]
    # Crunch2 has multiple stats files
    if len(filenames) > 1:
        filenames = ['crunchstat.txt', 'arv-mount.txt']
    for filename in filenames:
        try:
            self._readers.append(collection.open(filename))
        except IOError:
            logger.warning('Unable to open %s', filename)
    self._label = "{}/{}".format(self._collection_id, filenames[0])
    return itertools.chain(*[iter(reader) for reader in self._readers])
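# This variant opens every known stats file and splices them into a
# single line stream with itertools.chain. A minimal illustration of
# the same splicing, using io.StringIO objects as stand-ins for the
# collection's file readers:
import io
import itertools

readers = [io.StringIO('a\nb\n'), io.StringIO('c\n')]
for line in itertools.chain(*[iter(r) for r in readers]):
    print(line, end='')  # prints a, b, c in file order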
def __init__(self, root, skip_child_jobs=False, **kwargs):
    arv = arvados.api('v1', model=OrderedJsonModel())

    label = kwargs.pop('label', None) or root.get('name') or root['uuid']
    root['name'] = label

    children = collections.OrderedDict()
    todo = collections.deque((root,))
    while len(todo) > 0:
        current = todo.popleft()
        label = current['name']
        sort_key = current['created_at']
        if current['uuid'].find('-xvhdp-') > 0:
            current = arv.containers().get(
                uuid=current['container_uuid']).execute()

        summer = ContainerSummarizer(current, label=label, **kwargs)
        summer.sort_key = sort_key
        children[current['uuid']] = summer

        page_filters = []
        while True:
            child_crs = arv.container_requests().index(
                order=['uuid asc'],
                filters=page_filters + [
                    ['requesting_container_uuid', '=', current['uuid']]],
            ).execute()
            if not child_crs['items']:
                break
            elif skip_child_jobs:
                logger.warning(
                    '%s: omitting stats from %d child containers'
                    ' because --skip-child-jobs flag is on',
                    label, child_crs['items_available'])
                break
            page_filters = [['uuid', '>', child_crs['items'][-1]['uuid']]]
            for cr in child_crs['items']:
                if cr['container_uuid']:
                    logger.debug('%s: container req %s',
                                 current['uuid'], cr['uuid'])
                    cr['name'] = cr.get('name') or cr['uuid']
                    todo.append(cr)

    sorted_children = collections.OrderedDict()
    for uuid in sorted(list(children.keys()),
                       key=lambda uuid: children[uuid].sort_key):
        sorted_children[uuid] = children[uuid]

    super(ContainerTreeSummarizer, self).__init__(
        children=sorted_children, label=root['name'], **kwargs)
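# ContainerTreeSummarizer walks the container-request tree breadth-first:
# a deque of pending requests, each visit resolving its container and
# queueing the children returned by the paged container_requests lookup.
# A minimal sketch of just the traversal shape, where children_of is a
# hypothetical stand-in for that paged query:
import collections

def walk_tree(root, children_of):
    todo = collections.deque((root,))
    visited = []
    while todo:
        current = todo.popleft()
        visited.append(current)
        todo.extend(children_of(current))
    return visited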
def run(self):
    logger.debug("%s: parsing logdata %s", self.label, self._logdata)
    with self._logdata as logdata:
        self._run(logdata)
def __init__(self, job_uuid):
    self.job_uuid = job_uuid
    self.event_types = (['stderr'] if '-8i9sb-' in job_uuid
                        else ['crunchstat', 'arv-mount'])
    logger.debug('load %s events for job %s',
                 self.event_types, self.job_uuid)
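# The '-8i9sb-' test above relies on Arvados uuids embedding a type
# infix ('-8i9sb-' for crunch1 jobs, '-xvhdp-' for container requests,
# as also checked in ContainerTreeSummarizer). A hedged sketch of that
# dispatch; the infix table below is illustrative, not exhaustive:
UUID_INFIXES = {'8i9sb': 'job', 'xvhdp': 'container request'}

def uuid_kind(uuid):
    parts = uuid.split('-')
    return UUID_INFIXES.get(parts[1], 'unknown') if len(parts) == 3 else None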
def __init__(self, job_uuid):
    logger.debug('load stderr events for job %s', job_uuid)
    self.job_uuid = job_uuid
def run(self):
    logger.debug("%s: parsing logdata %s", self.label, self._logdata)
    for line in self._logdata:
        m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) job_task (?P<task_uuid>\S+)$', line)
        if m:
            seq = int(m.group('seq'))
            uuid = m.group('task_uuid')
            self.seq_to_uuid[seq] = uuid
            logger.debug('%s: seq %d is task %s', self.label, seq, uuid)
            continue

        m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) (success in|failure \(#., permanent\) after) (?P<elapsed>\d+) seconds', line)
        if m:
            task_id = self.seq_to_uuid[int(m.group('seq'))]
            elapsed = int(m.group('elapsed'))
            self.task_stats[task_id]['time'] = {'elapsed': elapsed}
            if elapsed > self.stats_max['time']['elapsed']:
                self.stats_max['time']['elapsed'] = elapsed
            continue

        m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) stderr Queued job (?P<uuid>\S+)$', line)
        if m:
            uuid = m.group('uuid')
            if self._skip_child_jobs:
                logger.warning('%s: omitting stats from child job %s'
                               ' because --skip-child-jobs flag is on',
                               self.label, uuid)
                continue
            logger.debug('%s: follow %s', self.label, uuid)
            child_summarizer = JobSummarizer(uuid)
            child_summarizer.stats_max = self.stats_max
            child_summarizer.task_stats = self.task_stats
            child_summarizer.tasks = self.tasks
            child_summarizer.starttime = self.starttime
            child_summarizer.run()
            logger.debug('%s: done %s', self.label, uuid)
            continue

        m = re.search(r'^(?P<timestamp>[^\s.]+)(\.\d+)? (?P<job_uuid>\S+) \d+ (?P<seq>\d+) stderr crunchstat: (?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n', line)
        if not m:
            continue

        if self.label is None:
            self.label = m.group('job_uuid')
            logger.debug('%s: using job uuid as label', self.label)
        if m.group('category').endswith(':'):
            # "stderr crunchstat: notice: ..."
            continue
        elif m.group('category') in ('error', 'caught'):
            continue
        elif m.group('category') == 'read':
            # "stderr crunchstat: read /proc/1234/net/dev: ..."
            # (crunchstat formatting fixed, but old logs still say this)
            continue
        task_id = self.seq_to_uuid[int(m.group('seq'))]
        task = self.tasks[task_id]

        # Use the first and last crunchstat timestamps as
        # approximations of starttime and finishtime.
        timestamp = datetime.datetime.strptime(
            m.group('timestamp'), '%Y-%m-%d_%H:%M:%S')
        if not task.starttime:
            task.starttime = timestamp
            logger.debug('%s: task %s starttime %s',
                         self.label, task_id, timestamp)
        task.finishtime = timestamp

        if not self.starttime:
            self.starttime = timestamp
        self.finishtime = timestamp

        this_interval_s = None
        for group in ['current', 'interval']:
            if not m.group(group):
                continue
            category = m.group('category')
            words = m.group(group).split(' ')
            stats = {}
            for val, stat in zip(words[::2], words[1::2]):
                try:
                    if '.' in val:
                        stats[stat] = float(val)
                    else:
                        stats[stat] = int(val)
                except ValueError as e:
                    raise ValueError(
                        'Error parsing {} stat in "{}": {!r}'.format(
                            stat, line, e))
            if 'user' in stats or 'sys' in stats:
                stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
            if 'tx' in stats or 'rx' in stats:
                stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
            for stat, val in stats.items():
                if group == 'interval':
                    if stat == 'seconds':
                        this_interval_s = val
                        continue
                    elif not (this_interval_s > 0):
                        logger.error(
                            "BUG? interval stat given with duration {!r}".
                            format(this_interval_s))
                        continue
                    else:
                        stat = stat + '__rate'
                        val = val / this_interval_s
                        if stat in ['user+sys__rate', 'tx+rx__rate']:
                            task.series[category, stat].append(
                                (timestamp - self.starttime, val))
                else:
                    if stat in ['rss']:
                        task.series[category, stat].append(
                            (timestamp - self.starttime, val))
                    self.task_stats[task_id][category][stat] = val
                if val > self.stats_max[category][stat]:
                    self.stats_max[category][stat] = val
    logger.debug('%s: done parsing', self.label)

    self.job_tot = collections.defaultdict(
        functools.partial(collections.defaultdict, int))
    for task_id, task_stat in self.task_stats.items():
        for category, stat_last in task_stat.items():
            for stat, val in stat_last.items():
                if stat in ['cpus', 'cache', 'swap', 'rss']:
                    # meaningless stats like 16 cpu cores x 5 tasks = 80
                    continue
                self.job_tot[category][stat] += val
    logger.debug('%s: done totals', self.label)
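# crunchstat reports stats as alternating value/name word pairs (the
# sample line quoted in the comments below contains "0 put 2576 get"),
# and the zip(words[::2], words[1::2]) idiom above pairs each value
# with its stat name. A standalone illustration:
words = '0 put 2576 get'.split(' ')
stats = {stat: int(val) for val, stat in zip(words[::2], words[1::2])}
assert stats == {'put': 0, 'get': 2576}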
def _run(self, logdata):
    self.detected_crunch1 = False
    for line in logdata:
        if not self.detected_crunch1 and '-8i9sb-' in line:
            self.detected_crunch1 = True

        if self.detected_crunch1:
            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) job_task (?P<task_uuid>\S+)$', line)
            if m:
                seq = int(m.group('seq'))
                uuid = m.group('task_uuid')
                self.seq_to_uuid[seq] = uuid
                logger.debug('%s: seq %d is task %s', self.label, seq, uuid)
                continue

            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) (success in|failure \(#., permanent\) after) (?P<elapsed>\d+) seconds', line)
            if m:
                task_id = self.seq_to_uuid[int(m.group('seq'))]
                elapsed = int(m.group('elapsed'))
                self.task_stats[task_id]['time'] = {'elapsed': elapsed}
                if elapsed > self.stats_max['time']['elapsed']:
                    self.stats_max['time']['elapsed'] = elapsed
                continue

            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) stderr Queued job (?P<uuid>\S+)$', line)
            if m:
                uuid = m.group('uuid')
                if self._skip_child_jobs:
                    logger.warning('%s: omitting stats from child job %s'
                                   ' because --skip-child-jobs flag is on',
                                   self.label, uuid)
                    continue
                logger.debug('%s: follow %s', self.label, uuid)
                child_summarizer = ProcessSummarizer(uuid)
                child_summarizer.stats_max = self.stats_max
                child_summarizer.task_stats = self.task_stats
                child_summarizer.tasks = self.tasks
                child_summarizer.starttime = self.starttime
                child_summarizer.run()
                logger.debug('%s: done %s', self.label, uuid)
                continue

            # 2017-12-02_17:15:08 e51c5-8i9sb-mfp68stkxnqdd6m 63676 0 stderr crunchstat: keepcalls 0 put 2576 get -- interval 10.0000 seconds 0 put 2576 get
            m = re.search(r'^(?P<timestamp>[^\s.]+)(\.\d+)? (?P<job_uuid>\S+) \d+ (?P<seq>\d+) stderr (?P<crunchstat>crunchstat: )(?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n$', line)
            if not m:
                continue
        else:
            # crunch2
            # 2017-12-01T16:56:24.723509200Z crunchstat: keepcalls 0 put 3 get -- interval 10.0000 seconds 0 put 3 get
            m = re.search(r'^(?P<timestamp>\S+) (?P<crunchstat>crunchstat: )?(?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n$', line)
            if not m:
                continue

        if self.label is None:
            try:
                self.label = m.group('job_uuid')
            except IndexError:
                self.label = 'container'
        if m.group('category').endswith(':'):
            # "stderr crunchstat: notice: ..."
            continue
        elif m.group('category') in ('error', 'caught'):
            continue
        elif m.group('category') in ('read', 'open', 'cgroup', 'CID', 'Running'):
            # "stderr crunchstat: read /proc/1234/net/dev: ..."
            # (old logs are less careful with unprefixed error messages)
            continue

        if self.detected_crunch1:
            task_id = self.seq_to_uuid[int(m.group('seq'))]
        else:
            task_id = 'container'
        task = self.tasks[task_id]

        # Use the first and last crunchstat timestamps as
        # approximations of starttime and finishtime.
        timestamp = m.group('timestamp')
        if timestamp[10:11] == '_':
            timestamp = datetime.datetime.strptime(
                timestamp, '%Y-%m-%d_%H:%M:%S')
        elif timestamp[10:11] == 'T':
            timestamp = datetime.datetime.strptime(
                timestamp[:19], '%Y-%m-%dT%H:%M:%S')
        else:
            raise ValueError(
                "Cannot parse timestamp {!r}".format(timestamp))

        if task.starttime is None:
            logger.debug('%s: task %s starttime %s',
                         self.label, task_id, timestamp)
        if task.starttime is None or timestamp < task.starttime:
            task.starttime = timestamp
        if task.finishtime is None or timestamp > task.finishtime:
            task.finishtime = timestamp

        if self.starttime is None or timestamp < self.starttime:
            self.starttime = timestamp
        if self.finishtime is None or timestamp > self.finishtime:
            self.finishtime = timestamp

        if (not self.detected_crunch1) and task.starttime is not None and task.finishtime is not None:
            elapsed = (task.finishtime - task.starttime).seconds
            self.task_stats[task_id]['time'] = {'elapsed': elapsed}
            if elapsed > self.stats_max['time']['elapsed']:
                self.stats_max['time']['elapsed'] = elapsed

        this_interval_s = None
        for group in ['current', 'interval']:
            if not m.group(group):
                continue
            category = m.group('category')
            words = m.group(group).split(' ')
            stats = {}
            try:
                for val, stat in zip(words[::2], words[1::2]):
                    if '.' in val:
                        stats[stat] = float(val)
                    else:
                        stats[stat] = int(val)
            except ValueError as e:
                # If the line doesn't start with 'crunchstat:' we
                # might have mistaken an error message for a
                # structured crunchstat line.
                if m.group("crunchstat") is None or m.group("category") == "crunchstat":
                    logger.warning("%s: log contains message\n %s",
                                   self.label, line)
                else:
                    logger.warning(
                        '%s: Error parsing value %r (stat %r, category %r): %r',
                        self.label, val, stat, category, e)
                    logger.warning('%s', line)
                continue
            if 'user' in stats or 'sys' in stats:
                stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
            if 'tx' in stats or 'rx' in stats:
                stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
            for stat, val in stats.items():
                if group == 'interval':
                    if stat == 'seconds':
                        this_interval_s = val
                        continue
                    elif not (this_interval_s > 0):
                        logger.error(
                            "BUG? interval stat given with duration {!r}".
                            format(this_interval_s))
                        continue
                    else:
                        stat = stat + '__rate'
                        val = val / this_interval_s
                        if stat in ['user+sys__rate', 'tx+rx__rate']:
                            task.series[category, stat].append(
                                (timestamp - self.starttime, val))
                else:
                    if stat in ['rss']:
                        task.series[category, stat].append(
                            (timestamp - self.starttime, val))
                    self.task_stats[task_id][category][stat] = val
                if val > self.stats_max[category][stat]:
                    self.stats_max[category][stat] = val
    logger.debug('%s: done parsing', self.label)

    self.job_tot = collections.defaultdict(
        functools.partial(collections.defaultdict, int))
    for task_id, task_stat in self.task_stats.items():
        for category, stat_last in task_stat.items():
            for stat, val in stat_last.items():
                if stat in ['cpus', 'cache', 'swap', 'rss']:
                    # meaningless stats like 16 cpu cores x 5 tasks = 80
                    continue
                self.job_tot[category][stat] += val
    logger.debug('%s: done totals', self.label)
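# The timestamp handling above accepts two shapes: crunch1 logs use
# '%Y-%m-%d_%H:%M:%S' (an underscore at index 10), while crunch2 logs
# use RFC3339 timestamps whose sub-second digits are trimmed away by
# keeping only the first 19 characters. A quick standalone check of
# both branches, using the sample timestamps quoted in the comments:
import datetime

for ts in ('2017-12-02_17:15:08', '2017-12-01T16:56:24.723509200Z'):
    if ts[10:11] == '_':
        parsed = datetime.datetime.strptime(ts, '%Y-%m-%d_%H:%M:%S')
    else:
        parsed = datetime.datetime.strptime(ts[:19], '%Y-%m-%dT%H:%M:%S')
    print(parsed.isoformat())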