def handle(self, fn, retry, **options):
    """Parse the named log file under every configured parser and save the results."""
    # '%(key)s' is filled in per-parser by parse_logs.
    template = os.path.join(get_setting('ROOT'), '%(key)s', fn)
    results = parse_logs(template, progress=progress_bar, retry=retry)
    process_results(results, progress=progress_bar)
def parse_file(key, log, result=None, protect=True, retry=False, **kwargs):
    """
    Parse a log file with the given parser (key) and return a dictionary of
    paths, by a dictionary of dates, with a dictionary of metrics that may be
    a list of all values parsed.

    Raises ValueError when the parser key is unknown and IOError when the
    file is missing or (unless retry=True) was already processed.
    """
    logs = get_setting('LOGS')
    if key not in logs:
        raise ValueError("Log parser '%s' not found in LOGBOOK_PARSERS" % key)
    logs = logs[key]
    if result is None:
        # path -> date -> metric -> list of parsed values
        result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    if not os.path.isfile(log):
        raise IOError("No %s log to process: %s" % (key, log))
    # Use the named attribute instead of the magic stat-tuple index [1];
    # the inode identifies a file across renames so re-runs are detected.
    inode = os.stat(log).st_ino
    # exists() is the idiomatic (and cheaper) form of count() > 0.
    if not retry and LogFile.objects.filter(inode=inode).exists():
        raise IOError("Log file already processed: %s" % log)
    # Use a distinct loop name so the incoming **kwargs is not shadowed.
    for match in matches_in(key, log, logs['rex'], **kwargs):
        add_result(result, **run(match, *logs.get('ignore', ())))
    if protect:
        # Record the inode so future runs skip this file (see check above).
        LogFile.objects.get_or_create(
            inode=inode, defaults={'filename': os.path.basename(log)})
    return result
def url(path):
    """Separate out the language prefix and standardise the url.

    Returns a (local, lang, path, querystring) tuple.
    """
    query = ''
    local = False
    path = urllib.unquote(path)
    if '?' in path:
        path, query = path.split('?', 1)
    if '://' in path:
        # For referrers we want to shorten urls on our own hosts down
        # to local paths.
        host = path.split('://')[-1].split('/')[0]
        if host in settings.ALLOWED_HOSTS:
            path = '/' + path.split('/', 3)[-1]
            local = True
    path = path.lstrip('/')
    if '/' in path:
        lang, remainder = path.split('/', 1)
        lang = lang.replace('zh-tw', 'zh-hant')
        if lang in get_setting('LANGUAGES'):
            return (True, lang, remainder.strip('/'), query)
    return (local, None, path.strip('/'), query)
def get_search(url):
    """Match the url against the configured search engines.

    Returns ("search://<site>", <query term>) on a match, otherwise
    (url stripped of its query string, None).
    """
    for pattern in get_setting('SEARCHES'):
        match = re.match(pattern, url)
        if not match:
            continue
        groups = match.groupdict()
        term = groups.get('q', 'unknown').lower()
        return ("search://" + groups['site'], term)
    return (url.split('?', 1)[0], None)
def process_results(result, progress=None):
    """Each result is either an average or a count"""
    LogMetric.objects.clear_metrics()
    count = 0
    done = 0
    total = float(len(result))
    for count, path in enumerate(result):
        if progress:
            progress("save ", count / total, count, done)
        try:
            request, _ = LogRequest.objects.get_or_create(path=path)
        except (utils.IntegrityError, utils.OperationalError):
            # Skip paths the database refuses to store.
            continue
        for day, metrics in result[path].items():
            period, _ = LogPeriod.objects.get_or_create(
                period=0, date=day, request_id=request.pk)
            for name, value in metrics.items():
                unit = get_setting('UNITS').get(name, None)
                metric = LogMetric.objects.get_metric(name, unit)
                period.values.create_or_update(metric, value)
        done += 1
    if progress:
        progress("save ", 1.0, count, done)
def parse_logs(location, **kwargs):
    """Loop through all defined log types and parse results.

    The location is a template containing '%(key)s', filled in with each
    parser key; all files accumulate into one shared result structure.
    """
    result = None
    for key in get_setting('LOGS'):
        result = parse_file(key, location % {'key': key}, result, **kwargs)
    return result
def add_result(result, path, **data):
    """Add a single result to the results matrix.

    Records every non-ignored column site-wide (under the None path) and,
    for configured path fields, also under the request path. Silently does
    nothing when the path or date is missing.
    """
    # Idiomatic membership test ('x not in y', not 'not x in y').
    if path is None or 'date' not in data:
        return
    # Pop the date once instead of a get() followed by a second pop().
    date = data.pop('date')
    # Index by request path and date; None collects site-wide totals.
    request = result[path][date]
    sitewide = result[None][date]
    ignore = get_setting('IGNORE')
    pathfields = get_setting('PATH_FIELDS')
    # Record all columns regardless
    for key, value in data.items():
        if key in ignore:
            continue
        if key in pathfields:
            request[key].append(value)
        sitewide[key].append(value)
def agent_filter(items):
    """Filter user agent items to be more useful"""
    filters = get_setting('FILTERS', {})
    for kind, family, version in items:
        name = "%s__%s" % (family, agent_version(version))
        for rex, replace in filters.get(kind, []):
            name = re.sub(rex, replace, name, flags=re.IGNORECASE)
            if name == '':
                # An empty result means this agent is filtered out entirely.
                break
            elif '__' not in name:
                raise KeyError("Filter %s not family__version pair." % str(rex))
        if name != '':
            yield (kind, tuple(name.split('__', 1)))
def run(data, *junk):
    """Attempt to format various keys and remove the junk.

    Normalises the raw match dictionary in place (date, paths, referrer,
    country, user agent), drops junk keys, and truncates over-long values.
    Returns {'path': None} for excluded paths so the caller discards them.
    """
    months = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    data['M'] = months.index(data['M'])
    dtm = datetime(*[int(data.pop(k)) for k in ['Y', 'M', 'D', 'h', 'm', 's']])
    data['date'] = dtm.date()
    (local, data['lang'], data['path'], _) = url(data['path'])
    for m in get_setting('EXCLUSIONS'):
        if re.match(m, data['path']):
            return {'path': None}
    (local, _, data['refer'], query) = url(data.get('refer', '-'))
    if local:
        data['link'] = data.pop('refer')
    else:
        (data['refer'], data['search']) = get_search(data['refer'] + '?' + query)
    data['count'] = data['path']
    data['country'] = country(data.pop('ip'))
    data.update(dict(get_agent(data.pop('agent', None))))
    if data['status'] != '200':
        # Errors should not count towards visitor metrics.
        junk += ('count', 'country', 'agent')
    for key in junk:
        data.pop(key, None)
    # Iterate a snapshot: we pop keys inside the loop, which raises
    # RuntimeError on a live dict view in Python 3.
    for key, value in list(data.items()):
        if value is None or value == '-':
            data.pop(key)
        # Usually the refer can be long
        if isinstance(value, str) and len(value) > 255:
            data[key] = value[:255]
        if isinstance(value, tuple):
            if len(value) != 2:
                raise ValueError("Family, name pair error: %s" % str(value))
            data[key] = (value[0][:128], value[1][:255])
    return data
def test_override(self):
    """An overridden setting returns the overridden value ('Foo')."""
    # NOTE(review): assumes the test settings override 'TEST' — confirm fixture.
    self.assertEqual(get_setting('TEST'), 'Foo')
def test_regex(self):
    """The nginx log parser configuration provides a 'rex' pattern."""
    self.assertIn('rex', get_setting('LOGS')['nginx'])
def test_default_setting(self):
    """The default UNITS setting includes a 'size' unit."""
    self.assertIn('size', get_setting('UNITS'))