def sort_by_size(self, group_limit=None, discard_others=False,
                 others_label='others'):
    """ Sort the groups by the number of elements they contain, descending.

        Also has an option to limit the number of groups. If this option is
        chosen, the remaining elements are placed into another group with
        the name specified in others_label. If discard_others is True, the
        others group is removed instead.
    """
    # sort groups by number of elements
    self.groups = OrderedDict(
        sorted(self.groups.iteritems(), key=lambda x: len(x[1]),
               reverse=True))

    # if group-limit is provided, combine remaining groups
    if group_limit is not None:
        # now group together all groups that did not make the limit
        if not discard_others:
            group_keys = self.groups.keys()[group_limit - 1:]
            self.groups.setdefault(others_label, list())
        else:
            group_keys = self.groups.keys()[group_limit:]

        # only go to second last (-1), since the 'others' group is now last
        for g in group_keys:
            if not discard_others:
                self.groups[others_label].extend(self.groups[g])
            del self.groups[g]

        # remove if empty
        if (others_label in self.groups and
                len(self.groups[others_label]) == 0):
            del self.groups[others_label]

    # remove others group regardless of limit if requested
    if discard_others and others_label in self.groups:
        del self.groups[others_label]
class ChooseOperator(BaseOperator):

    dict_format = True

    names = ['$choose']
    defaults = OrderedDict([('from', []), ('weights', None)])

    def __call__(self, options=None):
        # options can be an arbitrarily long list; store it as "from"
        # in the options dictionary
        if isinstance(options, list):
            options = {'from': options}
        options = self._parse_options(options)

        # decode weights
        weights = self._decode(options['weights'])

        if not weights:
            # pick one choice, uniformly distributed, but don't evaluate yet
            return choice(options['from'])
        else:
            assert len(weights) == len(options['from'])

            # accumulate weights, then pick an item via a uniform draw
            total_weight = 0
            acc_weight_items = []
            for item, weight in zip(options['from'], weights):
                total_weight += weight
                acc_weight_items.append((total_weight, item))

            pick = random() * total_weight
            for weight, item in acc_weight_items:
                if weight >= pick:
                    return item
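# The weighted branch above is cumulative-sum sampling: a uniform draw in
# [0, total_weight) lands in exactly one item's interval, so each item is
# picked proportionally to its weight. A minimal standalone sketch of the
# same technique (function name and test values are illustrative, not part
# of the operator):

from random import random

def weighted_choice(items, weights):
    # running totals: [(w0, i0), (w0 + w1, i1), ...]
    total = 0
    acc = []
    for item, weight in zip(items, weights):
        total += weight
        acc.append((total, item))

    # a uniform draw in [0, total) falls into exactly one bucket
    pick = random() * total
    for bound, item in acc:
        if pick < bound:
            return item

# 'b' comes back roughly three times as often as 'a'
print(weighted_choice(['a', 'b'], [1, 3]))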
class DateTimeOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$datetime', '$date']
    defaults = OrderedDict([('min', 0), ('max', int(time.time()))])

    def _parse_dt(self, input):
        """ Parse input, either int (epoch) or date string (use dateutil
            parser).
        """
        if isinstance(input, str):
            # string needs conversion, try parsing with dateutil's parser
            try:
                dt = parser.parse(input)
            except Exception:
                raise SystemExit("can't parse date/time format for %s."
                                 % input)

            td = dt - datetime.utcfromtimestamp(0)
            return int((td.microseconds +
                        (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6)
        else:
            return int(input)

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode min and max and convert time formats to epochs
        mintime = self._parse_dt(self._decode(options['min']))
        maxtime = self._parse_dt(self._decode(options['max']))

        # generate random epoch number
        epoch = randint(mintime, maxtime)
        return datetime.fromtimestamp(epoch)
class ObjectIdOperator(DateTimeOperator):
    """ With no parameters, just generate a new ObjectId. If min and/or max
        are provided, handle like DateTimeOperator and replace the timestamp
        portion of the ObjectId with the random date and time.
    """

    names = ['$objectid', '$oid']
    defaults = OrderedDict([('min', None), ('max', None)])

    def __call__(self, options=None):
        options = self._parse_options(options)
        mintime = self._decode(options['min'])
        maxtime = self._decode(options['max'])

        if mintime is None and maxtime is None:
            return ObjectId()

        # decode min and max and convert time formats to epochs
        mintime = self._parse_dt(mintime or 0)
        maxtime = self._parse_dt(maxtime or time.time())
        assert mintime <= maxtime

        # generate random epoch number and overwrite the timestamp portion
        epoch = randint(mintime, maxtime)
        oid = struct.pack(">i", int(epoch)) + ObjectId().binary[4:]
        return ObjectId(oid)
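# The byte surgery above works because the first four bytes of a BSON
# ObjectId are a big-endian epoch timestamp. A small standalone
# demonstration (values made up; requires pymongo's bson package):

import struct
import time

from bson import ObjectId

# back-date an ObjectId by one hour: pack a past epoch into the leading
# 4 timestamp bytes, keep the remaining 8 bytes of a fresh ObjectId
epoch = int(time.time()) - 3600
oid = ObjectId(struct.pack(">i", epoch) + ObjectId().binary[4:])

# generation_time reads those 4 bytes back as a UTC datetime
print(oid.generation_time)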
def run(self):
    """Run this section and print out information."""
    grouping = Grouping(
        group_by=lambda x: (x.datetime, x.cursorid, x.reapedtime))
    logfile = self.mloginfo.logfile

    if logfile.start and logfile.end:
        progress_start = self.mloginfo._datetime_to_epoch(logfile.start)
        progress_total = (self.mloginfo._datetime_to_epoch(logfile.end)
                          - progress_start)
    else:
        self.mloginfo.progress_bar_enabled = False

    for i, le in enumerate(logfile):
        # update progress bar every 1000 lines
        if self.mloginfo.progress_bar_enabled and (i % 1000 == 0):
            if le.datetime:
                progress_curr = self.mloginfo._datetime_to_epoch(le.datetime)
                if progress_total:
                    (self.mloginfo
                     .update_progress(float(progress_curr - progress_start)
                                      / progress_total))

        if 'Cursor id' in le.line_str:
            lt = LogTuple(le.datetime, le.cursor, le._reapedtime)
            grouping.add(lt)

    grouping.sort_by_size()

    # clear progress bar again
    if self.mloginfo.progress_bar_enabled:
        self.mloginfo.update_progress(1.0)

    # no cursor information in the log file
    if not len(grouping):
        print('no cursor information found.')
        return

    titles = ['datetime', 'cursorid', 'reapedtime']
    table_rows = []

    # using only important key-values
    for g in grouping:
        # calculate statistics for this group
        datetime, cursorid, reapedtime = g

        stats = OrderedDict()
        stats['datetime'] = str(datetime)
        stats['cursorid'] = str(cursorid)
        stats['reapedtime'] = str(reapedtime)
        table_rows.append(stats)

    print_table(table_rows, titles, uppercase_headers=True)
    print('')
def __init__(self, args=None, unknown_args=None):
    self.args = args
    self.unknown_args = unknown_args
    self.groups = OrderedDict()
    self.empty = True
    self.limits = None

    if self.args["optime_start"]:
        self.xlabel = "time (start of ops)"
    else:
        self.xlabel = "time (end of ops)"
def __init__(self, args=None, unknown_args=None):
    self.args = args
    self.unknown_args = unknown_args
    self.groups = OrderedDict()
    self.empty = True
    self.limits = None

    if self.args['optime_start']:
        self.xlabel = 'time (start of ops)'
    else:
        self.xlabel = 'time (end of ops)'
class ArrayOperator(BaseOperator):

    dict_format = True

    names = ['$array']
    defaults = OrderedDict([('of', None), ('number', 10)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # evaluate number
        number = self._decode(options['number'])

        # build array of 'of' elements, but don't evaluate them yet
        return [options['of']] * number
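# Because the 'of' spec is returned unevaluated, the surrounding decoder
# expands each element on its own. An illustrative mgenerate-style template,
# written here as a Python dict (the exact output values are random):

template = {
    "measurements": {
        "$array": {"of": {"$number": [0, 100]}, "number": 3}
    }
}
# a possible result after decoding: {"measurements": [42, 7, 88]},
# i.e. three independent draws, not one number repeated three times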
def group(self):
    """ (Re-)group all loglines by the given group. """
    if hasattr(self, "group_by"):
        group_by = self.group_by
    else:
        group_by = self.default_group_by
        if self.args["group"] is not None:
            group_by = self.args["group"]

    groups = OrderedDict()

    for logline in self.loglines:
        # if group_by is a function, call on logline
        if hasattr(group_by, "__call__"):
            key = group_by(logline)
        # if the logline has an attribute of group_by, use that as key
        elif group_by and hasattr(logline, group_by):
            key = getattr(logline, group_by)
        # if the PlotType has a method with the name of group_by,
        # call that on logline
        elif group_by and hasattr(self, group_by):
            f = getattr(self, group_by)
            key = f(logline)
        # if a --label was given, use that as key
        elif self.args and self.args["label"]:
            key = self.args["label"]
        # else key is None
        else:
            key = None

        # special case: group together all connections
        if group_by == "thread" and key and key.startswith("conn"):
            key = "conn####"

        groups.setdefault(key, list()).append(logline)

    self.groups = groups
class ZipfOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$zipf', '$zeta']
    defaults = OrderedDict([('alpha', 2.0)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode distribution parameter
        alpha = self._decode(options['alpha'])

        val = zipf(alpha) - 1
        return val
def group(self):
    """ (Re-)group all loglines by the given group. """
    if hasattr(self, 'group_by'):
        group_by = self.group_by
    else:
        group_by = self.default_group_by
        if self.args['group'] is not None:
            group_by = self.args['group']

    groups = OrderedDict()

    for logline in self.loglines:
        # if group_by is a function, call on logline
        if hasattr(group_by, '__call__'):
            key = group_by(logline)
        # if the logline has an attribute of group_by, use that as key
        elif group_by and hasattr(logline, group_by):
            key = getattr(logline, group_by)
        # if the PlotType has a method with the name of group_by,
        # call that on logline
        elif group_by and hasattr(self, group_by):
            f = getattr(self, group_by)
            key = f(logline)
        # if a --label was given, use that as key
        # elif self.args and self.args['label']:
        #     key = self.args['label']
        # else key is None
        else:
            key = None

        # special case: group together all connections
        # if group_by == "thread" and key and key.startswith("conn"):
        #     key = "conn####"

        groups.setdefault(key, list()).append(logline)

    self.groups = groups
class PointOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$point']
    defaults = OrderedDict([('long_lim', [-180, 180]),
                            ('lat_lim', [-90, 90])])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # evaluate limits
        long_lim = self._decode(options['long_lim'])
        lat_lim = self._decode(options['lat_lim'])

        # return coordinate by using random numbers between limits
        return {"type": "Point",
                "coordinates": {"$coord": [long_lim, lat_lim]}}
def run(self):
    """Run this section and print out information."""
    titles = ['date', 'host', 'state/message']
    table_rows = []

    for host, state, logevent in self.mloginfo.logfile.rs_state:
        stats = OrderedDict()
        stats['date'] = logevent.datetime.strftime("%b %d %H:%M:%S")
        stats['host'] = host
        stats['state/message'] = state
        table_rows.append(stats)

    print_table(table_rows, titles, uppercase_headers=False)

    if len(self.mloginfo.logfile.rs_state) == 0:
        print(" no rs state changes found")
class CoordinateOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$coordinates', '$coordinate', '$coord', '$geo']
    defaults = OrderedDict([('long_lim', [-180, 180]),
                            ('lat_lim', [-90, 90])])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # evaluate limits
        long_lim = self._decode(options['long_lim'])
        lat_lim = self._decode(options['lat_lim'])

        # return coordinate by using random numbers between limits
        return [{"$float": long_lim}, {"$float": lat_lim}]
class NumberOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$number', '$num']
    defaults = OrderedDict([('min', 0), ('max', 100)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode min and max first
        minval = self._decode(options['min'])
        maxval = self._decode(options['max'])
        assert minval <= maxval

        return randint(minval, maxval)
class GaussOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$gauss', '$normal']
    defaults = OrderedDict([('mean', 0.0), ('std', 1.0)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode mean and standard deviation
        mu = self._decode(options['mean'])
        sigma = self._decode(options['std'])

        val = gauss(mu, sigma)
        return val
class FloatOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$float']
    defaults = OrderedDict([('min', 0.0), ('max', 1.0)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode min and max first
        minval = self._decode(options['min'])
        maxval = self._decode(options['max'])
        assert minval <= maxval

        val = random() * (maxval - minval) + minval
        return val
class PickOperator(BaseOperator):

    dict_format = True
    string_format = False

    names = ['$pick']
    defaults = OrderedDict([('array', []), ('element', 0)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode array and element index
        array = self._decode(options['array'])
        element = self._decode(options['element'])

        if len(array) <= element:
            return '$missing'

        return array[element]
class IncOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$inc']
    defaults = OrderedDict([('start', 0), ('step', 1)])

    def __init__(self, decode_method):
        self.counter = None
        BaseOperator.__init__(self, decode_method)

    def __call__(self, options=None):
        options = self._parse_options(options)

        # initialize counter on first use (not thread-safe!)
        if self.counter is None:
            self.counter = itertools.count(options['start'],
                                           options['step'])

        return self.counter.next()
class MissingOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$missing']
    defaults = OrderedDict([('percent', 100), ('ifnot', None)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # evaluate percent
        percent = self._decode(options['percent'])

        if randint(1, 100) <= percent:
            return '$missing'
        else:
            # ifnot is not yet evaluated, leave that up to another operator
            return options['ifnot']
class ConcatOperator(BaseOperator):

    dict_format = True

    names = ['$concat']
    defaults = OrderedDict([('items', []), ('sep', '')])

    def __call__(self, options=None):
        # options can be an arbitrarily long list; store it as "items"
        # in the options dictionary
        if isinstance(options, list):
            options = {'items': options}
        options = self._parse_options(options)

        # evaluate items and separator
        items = self._decode(options['items'])
        sep = self._decode(options['sep'])

        # return concatenated string
        return sep.join(str(i) for i in items)
class BinaryOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$bin']
    defaults = OrderedDict([('length', 10), ('type', 0)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # evaluate length and BSON binary subtype
        length = self._decode(options['length'])
        bintype = self._decode(options['type'])

        # generate random binary data of the requested length
        assert length > 0
        bindata = ''.join(choice(string.ascii_letters + string.digits)
                          for i in xrange(length))

        return Binary(bindata, bintype)
class StringOperator(BaseOperator):

    dict_format = True
    string_format = True

    names = ['$string', '$str']
    defaults = OrderedDict([('length', 10), ('mask', None)])

    def __call__(self, options=None):
        options = self._parse_options(options)

        # decode length and mask
        length = self._decode(options['length'])
        mask = self._decode(options['mask'])

        if mask is None:
            mask = '.' * length

        assert length > 0
        result = ''.join(choice(string.ascii_letters + string.digits)
                         for i in xrange(length))

        return result
class BaseOperator(object):

    names = []
    dict_format = False
    string_format = False
    defaults = OrderedDict()

    def __init__(self, decode_method):
        self._decode = decode_method

    def _parse_options(self, options={}):
        parsed = self.defaults.copy()

        if isinstance(options, list):
            # positional options fill the defaults in declaration order
            parsed.update(zip(self.defaults.keys(), options))
        elif isinstance(options, dict):
            parsed.update(options)

        for k, v in parsed.iteritems():
            if isinstance(v, unicode):
                parsed[k] = v.encode('utf-8')

        return parsed
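# The merge in _parse_options is what lets operators accept either a
# positional list or a keyword dict. A standalone illustration of the same
# logic (names and values are mine, not part of the module):

from collections import OrderedDict

defaults = OrderedDict([('min', 0), ('max', 100)])

def parse_options(options):
    # same merge as BaseOperator._parse_options: a list fills the
    # defaults in declaration order, a dict overrides by name
    parsed = defaults.copy()
    if isinstance(options, list):
        parsed.update(zip(defaults.keys(), options))
    elif isinstance(options, dict):
        parsed.update(options)
    return parsed

print(parse_options([5]))            # {'min': 5, 'max': 100}
print(parse_options({'max': 10}))    # {'min': 0, 'max': 10}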
class DateTimeFilter(BaseFilter):
    """ This filter has two parser arguments: --from and --to, both are
        optional. All possible values for --from and --to can be described
        as:

        [DATE] [TIME] [OFFSET] in that order, separated by a space.

        [DATE] can be any of
            - a 3-letter weekday (Mon, Tue, Wed, ...)
            - a date as 3-letter month, 1-2 digits day (Sep 5, Jan 31, Aug 08)
            - the words: today, now, start, end

        [TIME] can be any of
            - hours and minutes (20:15, 04:00, 3:00)
            - hours, minutes and seconds (13:30:01, 4:55:55)

        [OFFSET] consists of [OPERATOR][VALUE][UNIT] (no spaces in between)

        [OPERATOR] can be + or - (note that - can only be used if the whole
            "[DATE] [TIME] [OFFSET]" is in quotation marks, otherwise it
            would be confused with a separate parameter)

        [VALUE] can be any number

        [UNIT] can be any of s, sec, m, min, h, hours, d, days, w, weeks,
            mo, months, y, years

        The [OFFSET] is added/subtracted to/from the specified [DATE] [TIME].

        For the --from parameter, the default is the same as 'start'
        (0001-01-01 00:00:00). If _only_ an [OFFSET] is given, it is added
        to 'start' (which is not very useful).

        For the --to parameter, the default is the same as 'end'
        (9999-12-31 23:59:59). If _only_ an [OFFSET] is given, however, it
        is added to [FROM].

        Examples:
            --from Sun 10:00
                goes from last Sunday 10:00:00am to the end of the file

            --from Sep 29
                goes from Sep 29 00:00:00 to the end of the file

            --to today 15:00
                goes from the beginning of the file to today at 15:00:00

            --from today --to +1h
                goes from today's date 00:00:00 to today's date 01:00:00

            --from 20:15 --to +3m
                goes from today's date at 20:15:00 to today's date at
                20:18:00
    """
    filterArgs = [
        ('--from', {'action': 'store', 'type': custom_parse_dt,
                    'nargs': '*', 'default': 'start',
                    'help': 'output starting at FROM', 'dest': 'from'}),
        ('--to', {'action': 'store', 'type': custom_parse_dt,
                  'nargs': '*', 'default': 'end',
                  'help': 'output up to TO', 'dest': 'to'})
    ]

    timeunits = ['s', 'sec', 'm', 'min', 'h', 'hours', 'd', 'days', 'w',
                 'weeks', 'mo', 'months', 'y', 'years']
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
              'Oct', 'Nov', 'Dec']

    dtRegexes = OrderedDict([
        # weekdays: see above
        ('weekday', r'|'.join(weekdays)),
        # month + day: Jan 5, Oct 13, Sep 03, ...
        ('date', '(' + '|'.join(months) + ')' + r'\s+\d{1,2}'),
        ('word', r'now|start|end|today'),
        # 11:59, 1:13, 00:00, ...
        ('time2', r'\d{1,2}:\d{2,2}'),
        # 11:59:00, 1:13:12, 00:00:59, ...
        ('time3', r'\d{1,2}:\d{2,2}:\d{2,2}'),
        # offsets: +3min, -20s, +7days, ...
        ('offset', r'[\+-]\d+(' + '|'.join(timeunits) + ')'),
    ])

    def __init__(self, commandLineArgs):
        BaseFilter.__init__(self, commandLineArgs)
        self.fromReached = False
        self.toReached = False

        if 'from' in self.commandLineArgs or 'to' in self.commandLineArgs:
            self.active = True

    def setup(self):
        """ Get start and end date of logfile before starting to parse. """
        logfile = self.commandLineArgs['logfile']
        seekable = False
        if logfile:
            seekable = logfile.name != "<stdin>"

        if not seekable:
            # assume this year (we have no other info)
            now = datetime.now()
            self.startDateTime = datetime(now.year, 1, 1)
            self.endDateTime = datetime(MAXYEAR, 12, 31)
            # self.fromDateTime = datetime(MINYEAR, 1, 1)
            # self.toDateTime = datetime(MAXYEAR, 12, 31)

        else:
            # get start datetime
            for line in logfile:
                logline = LogLine(line)
                date = logline.datetime
                if date:
                    break
            self.startDateTime = date

            # get end datetime (lines are at most 10k, go back 15k at most
            # to make sure)
            logfile.seek(0, 2)
            file_size = logfile.tell()
            logfile.seek(-min(file_size, 15000), 2)

            for line in reversed(logfile.readlines()):
                logline = LogLine(line)
                date = logline.datetime
                if date:
                    break
            self.endDateTime = date

            # if there was a roll-over, subtract 1 year from start time
            if self.endDateTime < self.startDateTime:
                self.startDateTime = self.startDateTime.replace(
                    year=self.startDateTime.year - 1)

            # reset logfile
            logfile.seek(0)

        # now parse for further changes to from and to datetimes
        dtbound = DateTimeBoundaries(self.startDateTime, self.endDateTime)
        self.fromDateTime, self.toDateTime = dtbound(
            self.commandLineArgs['from'] or None,
            self.commandLineArgs['to'] or None)

    def accept(self, logline):
        dt = logline.datetime

        # if logline has no datetime, accept if between --from and --to
        if dt is None:
            return self.fromReached

        if self.fromDateTime <= dt <= self.toDateTime:
            self.toReached = False
            self.fromReached = True
            return True
        elif dt > self.toDateTime:
            self.toReached = True
            return False
        else:
            return False

    def skipRemaining(self):
        return self.toReached
class BasePlotType(object):

    colors = ['k', 'b', 'g', 'r', 'c', 'm', 'y']
    color_index = 0
    markers = ['o', 's', '<', 'D']
    marker_index = 0

    sort_order = 0
    plot_type_str = 'base'
    default_group_by = None

    # set group_by in sub-classes to force a group_by, as below
    # group_by = 'example'

    def __init__(self, args=None, unknown_args=None):
        self.args = args
        self.unknown_args = unknown_args
        self.groups = OrderedDict()
        self.empty = True
        self.limits = None

    def accept_line(self, logline):
        """ Return True if this PlotType can plot this line. """
        return True

    def add_line(self, logline):
        """ Append log line to this plot type. """
        key = None
        self.empty = False
        self.groups.setdefault(key, list()).append(logline)

    @property
    def loglines(self):
        """ Iterator yielding all loglines from the groups dictionary. """
        for key in self.groups:
            for logline in self.groups[key]:
                yield logline

    @classmethod
    def color_map(cls, group):
        color = cls.colors[cls.color_index]
        cls.color_index += 1

        marker = cls.markers[cls.marker_index]
        if cls.color_index >= len(cls.colors):
            cls.marker_index += 1
            cls.marker_index %= len(cls.markers)
            # reset the color index once all colors have been used
            cls.color_index %= len(cls.colors)

        return color, marker

    def group(self):
        """ (Re-)group all loglines by the given group. """
        if hasattr(self, 'group_by'):
            group_by = self.group_by
        else:
            group_by = self.default_group_by
            if self.args['group'] is not None:
                group_by = self.args['group']

        groups = OrderedDict()

        for logline in self.loglines:
            # if group_by is a function, call on logline
            if hasattr(group_by, '__call__'):
                key = group_by(logline)
            # if the logline has an attribute of group_by, use that as key
            elif group_by and hasattr(logline, group_by):
                key = getattr(logline, group_by)
            # if the PlotType has a method with the name of group_by,
            # call that on logline
            elif group_by and hasattr(self, group_by):
                f = getattr(self, group_by)
                key = f(logline)
            # if a --label was given, use that as key
            # elif self.args and self.args['label']:
            #     key = self.args['label']
            # else key is None
            else:
                key = None

            # special case: group together all connections
            # if group_by == "thread" and key and key.startswith("conn"):
            #     key = "conn####"

            groups.setdefault(key, list()).append(logline)

        self.groups = groups

    def plot_group(self, group, idx, axis):
        raise NotImplementedError(
            "BasePlotType can't plot. Use a derived class instead")

    def plot(self, axis, ith_plot, total_plots, limits):
        self.limits = limits

        artists = []
        print self.plot_type_str.upper(), "plot"
        print "%5s %9s %s" % ("id", " #points", "group")

        for idx, group in enumerate(self.groups):
            print "%5s %9s %s" % (idx + 1, len(self.groups[group]), group)
            group_artists = self.plot_group(group, idx + ith_plot, axis)
            if isinstance(group_artists, list):
                artists.extend(group_artists)
            else:
                artists.append(group_artists)

        print
        return artists
def __init__(self, args=None):
    self.args = args
    self.groups = OrderedDict()
    self.empty = True
def run(self):
    """ Run this section and print out information. """
    grouping = Grouping(group_by=lambda x: (x.namespace, x.pattern))
    logfile = self.mloginfo.logfile

    if logfile.start and logfile.end:
        progress_start = self.mloginfo._datetime_to_epoch(logfile.start)
        progress_total = (self.mloginfo._datetime_to_epoch(logfile.end)
                          - progress_start)
    else:
        self.progress_bar_enabled = False

    for i, le in enumerate(logfile):
        # update progress bar every 1000 lines
        if self.progress_bar_enabled and (i % 1000 == 0):
            if le.datetime:
                progress_curr = self.mloginfo._datetime_to_epoch(le.datetime)
                self.mloginfo.update_progress(
                    float(progress_curr - progress_start) / progress_total)

        if le.operation in ['query', 'update', 'remove']:
            grouping.add(le)

    grouping.sort_by_size()

    # clear progress bar again
    self.mloginfo.update_progress(1.0)

    titles = ['namespace', 'pattern', 'count', 'min (ms)', 'max (ms)',
              'mean (ms)', 'sum (ms)']
    table_rows = []

    for g in grouping:
        # calculate statistics for this group
        namespace, pattern = g
        group_events = [le.duration for le in grouping[g]
                        if le.duration is not None]

        stats = OrderedDict()
        stats['namespace'] = namespace
        stats['pattern'] = pattern
        stats['count'] = len(group_events)
        stats['min'] = min(group_events) if group_events else '-'
        stats['max'] = max(group_events) if group_events else '-'
        stats['mean'] = 0
        stats['sum'] = sum(group_events) if group_events else '-'
        stats['mean'] = (stats['sum'] / stats['count']
                         if group_events else '-')

        if self.mloginfo.args['verbose']:
            stats['example'] = grouping[g][0]
            titles.append('example')

        table_rows.append(stats)

    table_rows = sorted(table_rows, key=itemgetter('sum'), reverse=True)
    print_table(table_rows, titles, uppercase_headers=False)
    print
def __init__(self, args=None, unknown_args=None):
    self.args = args
    self.unknown_args = unknown_args
    self.groups = OrderedDict()
    self.empty = True
    self.limits = None
class BasePlotType(object):

    # 14 most distinguishable colors, according to
    # http://stackoverflow.com/questions/309149/generate-distinctly-different-rgb-colors-in-graphs
    colors = ['#000000', '#00FF00', '#0000FF', '#FF0000', '#01FFFE',
              '#FFA6FE', '#FFDB66', '#006401', '#010067', '#95003A',
              '#007DB5', '#FF00F6', '#FFEEE8', '#774D00']
    color_index = 0
    markers = ['o', 's', '<', 'D']
    marker_index = 0

    sort_order = 0
    plot_type_str = 'base'
    default_group_by = None
    date_range = (datetime(MAXYEAR, 12, 31), datetime(MINYEAR, 1, 1))

    # set group_by in sub-classes to force a group_by, as below
    # group_by = 'example'

    def __init__(self, args=None, unknown_args=None):
        self.args = args
        self.unknown_args = unknown_args
        self.groups = OrderedDict()
        self.empty = True
        self.limits = None

    def accept_line(self, logline):
        """ Return True if this PlotType can plot this line. """
        return True

    def add_line(self, logline):
        """ Append log line to this plot type. """
        key = None
        self.empty = False
        self.groups.setdefault(key, list()).append(logline)

    @property
    def loglines(self):
        """ Iterator yielding all loglines from the groups dictionary. """
        for key in self.groups:
            for logline in self.groups[key]:
                yield logline

    @classmethod
    def color_map(cls, group):
        color = cls.colors[cls.color_index]
        cls.color_index += 1

        marker = cls.markers[cls.marker_index]
        if cls.color_index >= len(cls.colors):
            cls.marker_index += 1
            cls.marker_index %= len(cls.markers)
            # reset the color index once all colors have been used
            cls.color_index %= len(cls.colors)

        return color, marker

    def group(self):
        """ (Re-)group all loglines by the given group. """
        if hasattr(self, 'group_by'):
            group_by = self.group_by
        else:
            group_by = self.default_group_by
            if self.args['group'] is not None:
                group_by = self.args['group']

        groups = OrderedDict()

        for logline in self.loglines:
            if self.args['optime_start']:
                self.xlabel = 'time (start of ops)'
            else:
                self.xlabel = 'time (end of ops)'

            # if group_by is a function, call on logline
            if hasattr(group_by, '__call__'):
                key = group_by(logline)
            # if the logline has an attribute of group_by, use that as key
            elif group_by and hasattr(logline, group_by):
                key = getattr(logline, group_by)
            # if the PlotType has a method with the name of group_by,
            # call that on logline
            elif group_by and hasattr(self, group_by):
                f = getattr(self, group_by)
                key = f(logline)
            # if a --label was given, use that as key
            # elif self.args and self.args['label']:
            #     key = self.args['label']
            # else key is None
            else:
                key = None

            # try to match as regular expression
            if type(group_by) == types.StringType:
                match = re.search(group_by, logline.line_str)
                if match:
                    if len(match.groups()) > 0:
                        key = match.group(1)
                    else:
                        key = match.group()

            # special case: group together all connections
            # if group_by == "thread" and key and key.startswith("conn"):
            #     key = "conn####"

            groups.setdefault(key, list()).append(logline)

        # sort groups by number of data points
        groups = OrderedDict(
            sorted(groups.iteritems(), key=lambda x: len(x[1]),
                   reverse=True))

        # if --group-limit is provided, combine remaining groups
        if self.args['group_limit']:
            group_label = 'all others combined'

            # now group together all groups that did not make the limit
            groups[group_label] = []

            # only go to second last (-1), since the 'other' group
            # is now last
            for other_group in groups.keys()[self.args['group_limit']:-1]:
                groups[group_label].extend(groups[other_group])
                del groups[other_group]

            # remove if empty
            if len(groups[group_label]) == 0:
                del groups[group_label]

        self.groups = groups

    def plot_group(self, group, idx, axis):
        raise NotImplementedError(
            "BasePlotType can't plot. Use a derived class instead")

    def plot(self, axis, ith_plot, total_plots, limits):
        self.limits = limits

        artists = []
        print self.plot_type_str.upper(), "plot"
        print "%5s %9s %s" % ("id", " #points", "group")

        for idx, group in enumerate(self.groups):
            print "%5s %9s %s" % (idx + 1, len(self.groups[group]), group)
            group_artists = self.plot_group(group, idx + ith_plot, axis)
            if isinstance(group_artists, list):
                artists.extend(group_artists)
            else:
                artists.append(group_artists)

        print
        return artists
class BasePlotType(object):

    colors = ["k", "b", "g", "r", "c", "m", "y"]
    color_index = 0
    markers = ["o", "s", "<", "D"]
    marker_index = 0

    sort_order = 0
    plot_type_str = "base"
    default_group_by = None

    # set group_by in sub-classes to force a group_by, as below
    # group_by = 'example'

    def __init__(self, args=None):
        self.args = args
        self.groups = OrderedDict()
        self.empty = True

    def accept_line(self, logline):
        """ Return True if this PlotType can plot this line. """
        return True

    def add_line(self, logline):
        """ Append log line to this plot type. """
        key = None
        self.empty = False
        self.groups.setdefault(key, list()).append(logline)

    @property
    def loglines(self):
        """ Iterator yielding all loglines from the groups dictionary. """
        for key in self.groups:
            for logline in self.groups[key]:
                yield logline

    @classmethod
    def color_map(cls, group):
        color = cls.colors[cls.color_index]
        cls.color_index += 1

        marker = cls.markers[cls.marker_index]
        if cls.color_index >= len(cls.colors):
            cls.marker_index += 1
            cls.marker_index %= len(cls.markers)
            # reset the color index once all colors have been used
            cls.color_index %= len(cls.colors)

        return color, marker

    def group(self):
        """ (Re-)group all loglines by the given group. """
        if hasattr(self, "group_by"):
            group_by = self.group_by
        else:
            group_by = self.default_group_by
            if self.args["group"] is not None:
                group_by = self.args["group"]

        groups = OrderedDict()

        for logline in self.loglines:
            # if group_by is a function, call on logline
            if hasattr(group_by, "__call__"):
                key = group_by(logline)
            # if the logline has an attribute of group_by, use that as key
            elif group_by and hasattr(logline, group_by):
                key = getattr(logline, group_by)
            # if the PlotType has a method with the name of group_by,
            # call that on logline
            elif group_by and hasattr(self, group_by):
                f = getattr(self, group_by)
                key = f(logline)
            # if a --label was given, use that as key
            elif self.args and self.args["label"]:
                key = self.args["label"]
            # else key is None
            else:
                key = None

            # special case: group together all connections
            if group_by == "thread" and key and key.startswith("conn"):
                key = "conn####"

            groups.setdefault(key, list()).append(logline)

        self.groups = groups

    def plot_group(self, group, idx, axis):
        raise NotImplementedError(
            "BasePlotType can't plot. Use a derived class instead")

    def plot(self, axis, i):
        artists = []
        print self.plot_type_str.upper(), "plot"
        print "%5s %9s %s" % ("id", " #points", "group")

        for idx, group in enumerate(self.groups):
            print "%5s %9s %s" % (idx + 1, len(self.groups[group]), group)
            group_artists = self.plot_group(group, idx + i, axis)
            if isinstance(group_artists, list):
                artists.extend(group_artists)
            else:
                artists.append(group_artists)

        print
        return artists
class Grouping(object):
    """Grouping object and related functions."""

    def __init__(self, iterable=None, group_by=None):
        """Init object."""
        self.groups = {}
        self.group_by = group_by

        if iterable:
            for item in iterable:
                self.add(item, group_by)

    def add(self, item, group_by=None):
        """General purpose method to group items by certain criteria."""
        key = None

        if not group_by:
            group_by = self.group_by

        if group_by:
            # if group_by is a function, use it with item as argument
            if hasattr(group_by, '__call__'):
                key = group_by(item)

            # if the item has an attribute named group_by, use that as key
            elif isinstance(group_by, str) and hasattr(item, group_by):
                key = getattr(item, group_by)

            else:
                key = None
                # try to match str(item) with regular expression
                if isinstance(group_by, str):
                    match = re.search(group_by, str(item))
                    if match:
                        if len(match.groups()) > 0:
                            key = match.group(1)
                        else:
                            key = match.group()

        self.groups.setdefault(key, list()).append(item)

    def __getitem__(self, key):
        """Return item corresponding to key."""
        return self.groups[key]

    def __iter__(self):
        """Iterate over the group keys."""
        for key in self.groups:
            yield key

    def __len__(self):
        """Return number of groups."""
        return len(self.groups)

    def keys(self):
        """Return keys in group."""
        return self.groups.keys()

    def values(self):
        """Return values in group."""
        return self.groups.values()

    def items(self):
        """Return items in group."""
        return self.groups.items()

    def regroup(self, group_by=None):
        """Regroup items."""
        if not group_by:
            group_by = self.group_by

        groups = self.groups
        self.groups = {}

        for g in groups:
            for item in groups[g]:
                self.add(item, group_by)

    def move_items(self, from_group, to_group):
        """Take all elements from the from_group and add them to the
        to_group."""
        if from_group not in self.keys() or len(self.groups[from_group]) == 0:
            return

        self.groups.setdefault(to_group, list()).extend(
            self.groups.get(from_group, list()))
        if from_group in self.groups:
            del self.groups[from_group]

    def sort_by_size(self, group_limit=None, discard_others=False,
                     others_label='others'):
        """
        Sort the groups by the number of elements they contain, descending.

        Also has an option to limit the number of groups. If this option
        is chosen, the remaining elements are placed into another group
        with the name specified in others_label. If discard_others is
        True, the others group is removed instead.
        """
        # sort groups by number of elements
        self.groups = OrderedDict(sorted(six.iteritems(self.groups),
                                         key=lambda x: len(x[1]),
                                         reverse=True))

        # if group-limit is provided, combine remaining groups
        if group_limit is not None:
            # now group together all groups that did not make the limit
            if not discard_others:
                group_keys = list(self.groups.keys())[group_limit - 1:]
                self.groups.setdefault(others_label, list())
            else:
                group_keys = list(self.groups.keys())[group_limit:]

            # only go to second last (-1), since the 'others' group
            # is now last
            for g in group_keys:
                if not discard_others:
                    self.groups[others_label].extend(self.groups[g])
                del self.groups[g]

            # remove if empty
            if (others_label in self.groups and
                    len(self.groups[others_label]) == 0):
                del self.groups[others_label]

        # remove others group regardless of limit if requested
        if discard_others and others_label in self.groups:
            del self.groups[others_label]
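# A quick usage sketch for the class above (data and keys are made up;
# assumes the Grouping class as defined here, with its module imports
# such as re, six, and OrderedDict, is in scope):

words = ['apple', 'avocado', 'banana', 'cherry', 'citrus']

# group by first letter: {'a': [...], 'b': [...], 'c': [...]}
grouping = Grouping(words, group_by=lambda w: w[0])

# keep the largest group; everything else is merged into 'others',
# which itself counts toward the limit of 2 groups
grouping.sort_by_size(group_limit=2)

for key in grouping:
    print(key, grouping[key])
# a ['apple', 'avocado']
# others ['cherry', 'citrus', 'banana']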
def group(self):
    """ (Re-)group all loglines by the given group. """
    if hasattr(self, 'group_by'):
        group_by = self.group_by
    else:
        group_by = self.default_group_by
        if self.args['group'] is not None:
            group_by = self.args['group']

    groups = OrderedDict()

    for logline in self.loglines:
        if self.args['optime_start']:
            self.xlabel = 'time (start of ops)'
        else:
            self.xlabel = 'time (end of ops)'

        # if group_by is a function, call on logline
        if hasattr(group_by, '__call__'):
            key = group_by(logline)
        # if the logline has an attribute of group_by, use that as key
        elif group_by and hasattr(logline, group_by):
            key = getattr(logline, group_by)
        # if the PlotType has a method with the name of group_by,
        # call that on logline
        elif group_by and hasattr(self, group_by):
            f = getattr(self, group_by)
            key = f(logline)
        # if a --label was given, use that as key
        # elif self.args and self.args['label']:
        #     key = self.args['label']
        # else key is None
        else:
            key = None

        # try to match as regular expression
        if type(group_by) == types.StringType:
            match = re.search(group_by, logline.line_str)
            if match:
                if len(match.groups()) > 0:
                    key = match.group(1)
                else:
                    key = match.group()

        # special case: group together all connections
        # if group_by == "thread" and key and key.startswith("conn"):
        #     key = "conn####"

        groups.setdefault(key, list()).append(logline)

    # sort groups by number of data points
    groups = OrderedDict(
        sorted(groups.iteritems(), key=lambda x: len(x[1]), reverse=True))

    # if --group-limit is provided, combine remaining groups
    if self.args['group_limit']:
        group_label = 'all others combined'

        # now group together all groups that did not make the limit
        groups[group_label] = []

        # only go to second last (-1), since the 'other' group is now last
        for other_group in groups.keys()[self.args['group_limit']:-1]:
            groups[group_label].extend(groups[other_group])
            del groups[other_group]

        # remove if empty
        if len(groups[group_label]) == 0:
            del groups[group_label]

    self.groups = groups
class DateTimeFilter(BaseFilter):
    """ This filter has two parser arguments: --from and --to, both are
        optional. All possible values for --from and --to can be described
        as:

        [DATE] [TIME] [OFFSET] in that order, separated by a space.

        [DATE] can be any of
            - a 3-letter weekday (Mon, Tue, Wed, ...)
            - a date as 3-letter month, 1-2 digits day (Sep 5, Jan 31, Aug 08)
            - the words: today, now, start, end

        [TIME] can be any of
            - hours and minutes (20:15, 04:00, 3:00)
            - hours, minutes and seconds (13:30:01, 4:55:55)

        [OFFSET] consists of [OPERATOR][VALUE][UNIT] (no spaces in between)

        [OPERATOR] can be + or - (note that - can only be used if the whole
            "[DATE] [TIME] [OFFSET]" is in quotation marks, otherwise it
            would be confused with a separate parameter)

        [VALUE] can be any number

        [UNIT] can be any of s, sec, m, min, h, hours, d, days, w, weeks,
            mo, months, y, years

        The [OFFSET] is added/subtracted to/from the specified [DATE] [TIME].

        For the --from parameter, the default is the same as 'start'
        (0001-01-01 00:00:00). If _only_ an [OFFSET] is given, it is added
        to 'start' (which is not very useful).

        For the --to parameter, the default is the same as 'end'
        (9999-12-31 23:59:59). If _only_ an [OFFSET] is given, however, it
        is added to [FROM].

        Examples:
            --from Sun 10:00
                goes from last Sunday 10:00:00am to the end of the file

            --from Sep 29
                goes from Sep 29 00:00:00 to the end of the file

            --to today 15:00
                goes from the beginning of the file to today at 15:00:00

            --from today --to +1h
                goes from today's date 00:00:00 to today's date 01:00:00

            --from 20:15 --to +3m
                goes from today's date at 20:15:00 to today's date at
                20:18:00
    """
    filterArgs = [
        ('--from', {'action': 'store', 'type': custom_parse_dt,
                    'nargs': '*', 'default': 'start',
                    'help': 'output starting at FROM', 'dest': 'from'}),
        ('--to', {'action': 'store', 'type': custom_parse_dt,
                  'nargs': '*', 'default': 'end',
                  'help': 'output up to TO', 'dest': 'to'})
    ]

    timeunits = ['s', 'sec', 'm', 'min', 'h', 'hours', 'd', 'days', 'w',
                 'weeks', 'mo', 'months', 'y', 'years']
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
              'Oct', 'Nov', 'Dec']

    dtRegexes = OrderedDict([
        # weekdays: see above
        ('weekday', r'|'.join(weekdays)),
        # month + day: Jan 5, Oct 13, Sep 03, ...
        ('date', '(' + '|'.join(months) + ')' + r'\s+\d{1,2}'),
        ('word', r'now|start|end|today'),
        # 11:59, 1:13, 00:00, ...
        ('time2', r'\d{1,2}:\d{2,2}'),
        # 11:59:00, 1:13:12, 00:00:59, ...
        ('time3', r'\d{1,2}:\d{2,2}:\d{2,2}'),
        # offsets: +3min, -20s, +7days, ...
        ('offset', r'[\+-]\d+(' + '|'.join(timeunits) + ')'),
    ])

    def __init__(self, mlogfilter):
        BaseFilter.__init__(self, mlogfilter)
        self.fromReached = False
        self.toReached = False

        self.active = (('from' in self.mlogfilter.args and
                        self.mlogfilter.args['from'] != 'start') or
                       ('to' in self.mlogfilter.args and
                        self.mlogfilter.args['to'] != 'end'))

    def setup(self):
        """ Get start and end date of logfile before starting to parse. """
        if self.mlogfilter.is_stdin:
            # assume this year (we have no other info)
            now = datetime.now()
            self.startDateTime = datetime(now.year, 1, 1, tzinfo=tzutc())
            self.endDateTime = datetime(MAXYEAR, 12, 31, tzinfo=tzutc())

        else:
            logfiles = self.mlogfilter.args['logfile']
            self.startDateTime = min(
                [lf.start +
                 timedelta(hours=self.mlogfilter.args['timezone'][i])
                 for i, lf in enumerate(logfiles)])
            self.endDateTime = max(
                [lf.end +
                 timedelta(hours=self.mlogfilter.args['timezone'][i])
                 for i, lf in enumerate(logfiles)])

        # now parse for further changes to from and to datetimes
        dtbound = DateTimeBoundaries(self.startDateTime, self.endDateTime)
        self.fromDateTime, self.toDateTime = dtbound(
            self.mlogfilter.args['from'] or None,
            self.mlogfilter.args['to'] or None)

        # define start_limit for mlogfilter's fast_forward method
        self.start_limit = self.fromDateTime

        # for a single logfile, get the file seek position of the `to`
        # datetime
        if (len(self.mlogfilter.args['logfile']) == 1 and
                not self.mlogfilter.is_stdin):
            if self.mlogfilter.args['to'] != "end":
                # fast forward, get seek value, then reset file
                logfile = self.mlogfilter.args['logfile'][0]
                logfile.fast_forward(self.toDateTime)
                self.seek_to = logfile.filehandle.tell()
                logfile.filehandle.seek(0)
            else:
                self.seek_to = -1
        else:
            self.seek_to = False

    def accept(self, logevent):
        if self.fromReached and self.seek_to:
            if self.seek_to != -1:
                self.toReached = (self.mlogfilter.args['logfile'][0]
                                  .filehandle.tell() >= self.seek_to)
            return True
        else:
            # slow version has to check each datetime
            dt = logevent.datetime

            # if logevent has no datetime, accept if between --from and --to
            if dt is None:
                return self.fromReached

            if self.fromDateTime <= dt <= self.toDateTime:
                self.toReached = False
                self.fromReached = True
                return True
            elif dt > self.toDateTime:
                self.toReached = True
                return False
            else:
                return False

    def skipRemaining(self):
        return self.toReached
class DateTimeBoundaries(object):

    timeunits = ['secs', 'sec', 's', 'mins', 'min', 'm', 'months', 'month',
                 'mo', 'hours', 'hour', 'h', 'days', 'day', 'd', 'weeks',
                 'week', 'w', 'years', 'year', 'y']
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    dtRegexes = OrderedDict([
        # special constants
        ('constant', re.compile(r'(now|start|end|today|yesterday)($|\s+)')),

        # weekday: Mon, Wed, Sat
        ('weekday', re.compile('(' + '|'.join(weekdays) + r')($|\s+)')),

        # 11:59:00.123, 1:13:12.004 (also match timezone postfix like Z
        # or +0700 or -05:30)
        # ('time', re.compile(r'(?P<hour>\d{1,2}):(?P<minute>\d{2,2})'
        #                     r'(?::(?P<second>\d{2,2})'
        #                     r'(?:.(?P<microsecond>\d{3,3}))?)?'
        #                     r'(?P<timezone>[0-9Z:\+\-]+)?($|\s+)')),

        # offsets: +3min, -20s, +7days (see timeunits above)
        ('offset', re.compile(r'(?P<operator>[\+-])(?P<value>\d+)'
                              r'(?P<unit>' + '|'.join(timeunits) +
                              r')($|\s+)'))
    ])

    def __init__(self, start, end):
        """ Initialize the DateTimeBoundaries object with true start and
            end datetime objects.
        """
        if start > end:
            raise ValueError('Error in DateTimeBoundaries: end cannot be '
                             'before start datetime.')

        # make sure all datetimes are timezone-aware
        self.start = start
        if not self.start.tzinfo:
            self.start = self.start.replace(tzinfo=tzutc())

        self.end = end
        if not self.end.tzinfo:
            self.end = self.end.replace(tzinfo=tzutc())

    def string2dt(self, s, lower_bound=None):
        original_s = s
        result = {}
        dt = None

        # if s is completely empty, return start or end, depending on
        # which parameter is evaluated
        if s == '':
            return self.end if lower_bound else self.start

        # first try to match the defined regexes
        for idx in self.dtRegexes:
            regex = self.dtRegexes[idx]
            mo = regex.search(s)
            # if a match was found, cut it out of the original string and
            # store it in result
            if mo:
                result[idx] = mo
                s = s[:mo.start(0)] + s[mo.end(0):]

        # handle constants
        if 'constant' in result:
            constant = result['constant'].group(0).strip()
            if constant == 'end':
                dt = self.end
            elif constant == 'start':
                dt = self.start
            elif constant == 'today':
                dt = datetime.now().replace(hour=0, minute=0, second=0,
                                            microsecond=0)
            elif constant == 'yesterday':
                dt = (datetime.now().replace(hour=0, minute=0, second=0,
                                             microsecond=0)
                      - timedelta(days=1))
            elif constant == 'now':
                dt = datetime.now()

        elif 'weekday' in result:
            weekday = result['weekday'].group(0).strip()

            # assume the most recently occurred weekday in the logfile
            most_recent_date = self.end.replace(hour=0, minute=0, second=0,
                                                microsecond=0)
            offset = (most_recent_date.weekday()
                      - self.weekdays.index(weekday)) % 7
            dt = most_recent_date - timedelta(days=offset)

        # if anything remains unmatched, try parsing it with dateutil's
        # parser
        if s.strip() != '':
            try:
                if dt:
                    dt = parser.parse(s, default=dt, tzinfos=tzutc)
                else:
                    # check if it's only a time; then use the start/end dt
                    # as default, else just use the current year
                    if re.match(r'(?P<hour>\d{1,2}):(?P<minute>\d{2,2})'
                                r'(?::(?P<second>\d{2,2})'
                                r'(?:.(?P<microsecond>\d{3,3}))?)?'
                                r'(?P<timezone>[0-9Z:\+\-]+)?$', s):
                        default = self.end if lower_bound else self.start
                    else:
                        default = datetime(self.end.year, 1, 1, 0, 0, 0)
                    default = default.replace(second=0, microsecond=0)

                    dt = parser.parse(s, default=default)
            except ValueError:
                raise ValueError("Error in DateTimeBoundaries: can't parse "
                                 "datetime from %s" % s)

        if not dt:
            dt = lower_bound or self.end

        # if no timezone is specified, use the one from the logfile
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=self.start.tzinfo)

        # time is applied separately (not through the parser) so that
        # strings containing only a time don't use today as the default
        # date (parser behavior)
        # if 'time' in result:
        #     dct = dict((k, int(v)) for k, v
        #                in result['time'].groupdict(0).iteritems())
        #     dct['microsecond'] *= 1000
        #     dt = dt.replace(**dct)

        # apply offset
        if 'offset' in result:
            # separate into operator, value, unit
            dct = result['offset'].groupdict()

            mult = 1
            if dct['unit'] in ['s', 'sec', 'secs']:
                dct['unit'] = 'seconds'
            elif dct['unit'] in ['m', 'min', 'mins']:
                dct['unit'] = 'minutes'
            elif dct['unit'] in ['h', 'hour', 'hours']:
                dct['unit'] = 'hours'
            elif dct['unit'] in ['d', 'day', 'days']:
                dct['unit'] = 'days'
            elif dct['unit'] in ['w', 'week', 'weeks']:
                dct['unit'] = 'days'
                mult = 7
            elif dct['unit'] in ['mo', 'month', 'months']:
                dct['unit'] = 'days'
                mult = 30.43
            elif dct['unit'] in ['y', 'year', 'years']:
                dct['unit'] = 'days'
                mult = 365.24

            if dct['operator'] == '-':
                mult *= -1

            dt = dt + timedelta(**{dct['unit']:
                                   int(mult * int(dct['value']))})

        # if the parsed datetime is out of bounds and no year was
        # specified, try to adjust the year
        year_present = re.search(r'\d{4,4}', original_s)
        if not year_present and 'constant' not in result:
            if (dt < self.start and
                    self.start <= dt.replace(year=dt.year + 1) <= self.end):
                dt = dt.replace(year=dt.year + 1)
            elif (dt > self.end and
                    self.start <= dt.replace(year=dt.year - 1) <= self.end):
                dt = dt.replace(year=dt.year - 1)

        return dt

    def __call__(self, from_str=None, to_str=None):
        """ Set the boundaries based on `from` and `to` strings. """
        from_dt = self.string2dt(from_str, lower_bound=None)
        to_dt = self.string2dt(to_str, lower_bound=from_dt)

        if to_dt < from_dt:
            raise ValueError('Error in DateTimeBoundaries: lower bound is '
                             'greater than upper bound.')

        # limit from and to at the real boundaries
        if to_dt > self.end:
            to_dt = self.end

        if from_dt < self.start:
            from_dt = self.start

        return from_dt, to_dt
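# A short usage sketch for the class above (dates are made up; assumes
# the DateTimeBoundaries class as defined here is in scope). It shows the
# offset-only case: a bare '+1h' upper bound is applied relative to the
# parsed lower bound, mirroring the --from/--to behavior in the filter.

from datetime import datetime
from dateutil.tz import tzutc

# a log file covering Aug 5-8, 2014
start = datetime(2014, 8, 5, 20, 45, tzinfo=tzutc())
end = datetime(2014, 8, 8, 23, 2, tzinfo=tzutc())

bounds = DateTimeBoundaries(start, end)

from_dt, to_dt = bounds('Aug 6 09:00', '+1h')
print(from_dt)   # 2014-08-06 09:00:00+00:00
print(to_dt)     # 2014-08-06 10:00:00+00:00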
def run(self):
    """Run this section and print out information."""
    grouping = Grouping(
        group_by=lambda x: (x.namespace, x.operation, x.pattern))
    logfile = self.mloginfo.logfile

    if logfile.start and logfile.end:
        progress_start = self.mloginfo._datetime_to_epoch(logfile.start)
        progress_total = (self.mloginfo._datetime_to_epoch(logfile.end)
                          - progress_start)
    else:
        self.mloginfo.progress_bar_enabled = False

    for i, le in enumerate(logfile):
        # update progress bar every 1000 lines
        if self.mloginfo.progress_bar_enabled and (i % 1000 == 0):
            if le.datetime:
                progress_curr = self.mloginfo._datetime_to_epoch(le.datetime)
                if progress_total:
                    (self.mloginfo
                     .update_progress(float(progress_curr - progress_start)
                                      / progress_total))

        if (le.operation in ['query', 'getmore', 'update', 'remove'] or
                le.command in ['count', 'findandmodify', 'geonear', 'find']):
            lt = LogTuple(namespace=le.namespace, operation=op_or_cmd(le),
                          pattern=le.pattern, duration=le.duration)
            grouping.add(lt)

    grouping.sort_by_size()

    # clear progress bar again
    if self.mloginfo.progress_bar_enabled:
        self.mloginfo.update_progress(1.0)

    # no queries in the log file
    if len(grouping) < 1:
        print('no queries found.')
        return

    titles = ['namespace', 'operation', 'pattern', 'count', 'min (ms)',
              'max (ms)', 'mean (ms)', '95%-ile (ms)', 'sum (ms)']
    table_rows = []

    for g in grouping:
        # calculate statistics for this group
        namespace, op, pattern = g
        group_events = [le.duration for le in grouping[g]
                        if le.duration is not None]

        stats = OrderedDict()
        stats['namespace'] = namespace
        stats['operation'] = op
        stats['pattern'] = pattern
        stats['count'] = len(group_events)
        stats['min'] = min(group_events) if group_events else '-'
        stats['max'] = max(group_events) if group_events else '-'
        stats['mean'] = 0
        if np:
            stats['95%'] = (np.percentile(group_events, 95)
                            if group_events else '-')
        else:
            stats['95%'] = 'n/a'
        stats['sum'] = sum(group_events) if group_events else '-'
        stats['mean'] = (stats['sum'] / stats['count']
                         if group_events else '-')

        if self.mloginfo.args['verbose']:
            stats['example'] = grouping[g][0]
            titles.append('example')

        table_rows.append(stats)

    # sort order depending on field names
    reverse = True
    if self.mloginfo.args['sort'] in ['namespace', 'pattern']:
        reverse = False

    table_rows = sorted(table_rows,
                        key=itemgetter(self.mloginfo.args['sort']),
                        reverse=reverse)
    print_table(table_rows, titles, uppercase_headers=False)
    print('')