def parse_log_access_entries(file):
    """Parse an Apache-format access log and return a list of resources.

    Also computes the delay with which each resource should be accessed,
    relative to the first entry in the log.

    :param file: iterable of log lines (e.g. an open file object)
    :return: list of dicts with ``path`` (URL base stripped), ``method``,
        ``delay`` (seconds since the first entry) and ``entry`` (raw line)
    """
    # https://httpd.apache.org/docs/1.3/mod/mod_log_config.html#formats
    parser = LogParser(
        '%h %l %u %t "%m %U %H" %>s %b "%{Referer}i" "%{User-Agent}i"')
    requests = []
    entries = list(parser.parse_lines(file))
    # list() never returns None, so a plain truthiness check is enough.
    if entries:
        # All delays are measured from the first entry's timestamp.
        start_time = entries[0].request_time
        for entry in entries:
            requests.append({
                "path": URL_BASE_REGEX.sub("/", entry.request_uri),
                "method": entry.request_method,
                "delay": (entry.request_time - start_time).total_seconds(),
                "entry": entry.entry,
            })
    return requests
def test_parse_general(end):
    """Parse a full combined-format entry and verify every exposed field."""
    ENTRY = '209.126.136.4 - - [01/Nov/2017:07:28:29 +0000] "GET / HTTP/1.1" 301 521 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"'
    ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
    )
    ts = datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc)
    parser = LogParser(COMBINED, encoding="utf-8")
    assert parser.format == COMBINED
    parsed = parser.parse(ENTRY + end)
    assert parsed.remote_host == "209.126.136.4"
    assert parsed.remote_logname is None
    assert parsed.remote_user is None
    assert parsed.request_time == ts
    assert parsed.request_line == "GET / HTTP/1.1"
    assert parsed.final_status == 301
    assert parsed.bytes_sent == 521
    assert parsed.headers_in == {"Referer": None, "User-Agent": ua}
    # Header lookup is case-insensitive.
    assert parsed.headers_in["User-Agent"] == ua
    assert parsed.headers_in["USER-AGENT"] == ua
    assert parsed.headers_in["user-agent"] == ua
    assert parsed.entry == ENTRY
    assert parsed.format == COMBINED
    assert parsed.request_time_fields == {"timestamp": ts}
    assert parsed.directives == {
        "%h": "209.126.136.4",
        "%l": None,
        "%u": None,
        "%t": ts,
        "%r": "GET / HTTP/1.1",
        "%>s": 301,
        "%b": 521,
        "%{Referer}i": None,
        "%{User-Agent}i": ua,
    }
def test_malformed_time_directive(fmt):
    """A bad strftime pattern inside %{...}t raises InvalidDirectiveError."""
    with pytest.raises(InvalidDirectiveError) as excinfo:
        LogParser('%{' + fmt + '}t')
    expected = 'Invalid log format directive at index 0 of {!r}'.format(fmt)
    assert str(excinfo.value) == expected
    assert excinfo.value.pos == 0
    assert excinfo.value.format == fmt
def test_parse_latin1():
    """Default (Latin-1) decoding maps the raw 0xAD byte to U+00AD."""
    entry = LogParser(COMBINED).parse(ENTRY)
    for field, expected in NON_STR_FIELDS.items():
        assert getattr(entry, field) == expected
    assert entry.request_line == entry.directives["%r"] == "Gh0st\xAD"
    assert entry.remote_host == entry.directives["%h"] == "66.240.205.34"
def test_malformed_time_directive(fmt):
    """An invalid strftime directive in %{...}t fails at parser construction."""
    with pytest.raises(InvalidDirectiveError) as excinfo:
        LogParser(f"%{{{fmt}}}t")
    err = excinfo.value
    assert str(err) == f"Invalid log format directive at index 0 of {fmt!r}"
    assert err.pos == 0
    assert err.format == fmt
def test_parse_utf8_surrogateescape():
    """UTF-8 with surrogateescape maps the invalid 0xAD byte to U+DCAD."""
    parser = LogParser(COMBINED, encoding='utf-8', errors='surrogateescape')
    entry = parser.parse(ENTRY)
    for field, expected in NON_STR_FIELDS.items():
        assert getattr(entry, field) == expected
    assert entry.request_line == entry.directives["%r"] == "Gh0st\uDCAD"
    assert entry.remote_host == entry.directives["%h"] == "66.240.205.34"
def test_parse_custom_german_time(fmt, entry, fields):
    """Month-name parsing honours the active locale (German here).

    The previous locale is always restored, even when the test is skipped.
    """
    saved_locale = locale.setlocale(locale.LC_ALL)
    try:
        try:
            locale.setlocale(locale.LC_ALL, "de_DE.utf8")
        except locale.Error:
            pytest.skip("Locale not supported")
        else:
            parsed = LogParser(fmt).parse(entry)
            for name, expected in fields.items():
                assert getattr(parsed, name) == expected
    finally:
        locale.setlocale(locale.LC_ALL, saved_locale)
class ApacheLogParser:
    """Parses Apache access-log entries and bulk-saves them to the database."""

    def __init__(self):
        # Combined log format plus a custom "%{Data}i" request header.
        self.parser = LogParser(
            '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{Data}i\"'
        )

    def parse_all(self, file_path: str):
        """Parse every entry of ``file_path`` and bulk-insert Log rows."""
        t1 = time.time()
        logger.info('Starting saving file to db')
        with open(file_path, 'r') as f:
            next(f)  # the first line is empty, skip it
            bulk_mng = BulkCreateManager()
            for entry in self.parser.parse_lines(f, ignore_invalid=True):
                # Request line looks like "GET /path HTTP/1.1".
                http_method, request_path, http_protocol = entry.request_line.split()
                bulk_mng.add(Log(
                    ip_address=entry.remote_host,
                    timestamp=entry.request_time,
                    response_status_code=entry.final_status,
                    http_method=http_method,
                    request_path=request_path,
                    http_protocol=http_protocol,
                    content_length=entry.bytes_sent,
                    user_agent=entry.headers_in['User-Agent'],
                    referer=entry.headers_in['Referer'],
                ))
            # Flush any rows still buffered in the manager.
            bulk_mng.done()
        logger.info(f'It took {time.time() - t1} seconds')
def __init__(self):
    # Pattern for parsing Apache log entries; %t carries an explicit
    # strftime layout wrapped in literal square brackets.
    log_format = (
        '%h %l %u %{[%d/%b/%Y:%H:%M:%S %z]}t "%r" %>s %b '
        '"%{Referer}i" "%{User-agent}i"'
    )
    self.parser = LogParser(log_format)
class LogProcessor:
    """Downloads Apache access logs over HTTP, parses them and stores them."""

    def __init__(self):
        # Pattern for parsing Apache log entries
        self.parser = LogParser(
            '%h %l %u %{[%d/%b/%Y:%H:%M:%S %z]}t "%r" %>s %b "%{Referer}i" "%{User-agent}i"'
        )

    def populate_logs(self, link: str = None):
        """Download Apache logs using the supplied link, then parse them and
        load into the database.

        Does nothing when ``link`` is falsy; returns ``None`` when the
        download does not answer with HTTP 200.
        """
        if link:
            response = requests.get(link)
            logger.info(f"Status code is {response.status_code}")
            if response.status_code != 200:
                return None
            lines = response.text.splitlines()
            with tqdm(total=len(lines), leave=False) as progress_bar:
                progress_bar.set_description(f"Processing Apache log entries")
                batch_size = 64
                objs = []
                # Parse log lines and insert batches to the database.
                for line in lines:
                    if line != "\n" and line != "":
                        log_item = self.parse_log_line(line)
                        objs.append(ApacheLogs(**log_item._asdict()))
                        if len(objs) == batch_size:
                            ApacheLogs.objects.bulk_create(objs, batch_size)
                            progress_bar.update(n=batch_size)
                            objs.clear()
                # Flush the final partial batch (may be empty).
                ApacheLogs.objects.bulk_create(objs, len(objs))
                progress_bar.update(n=len(objs))

    def parse_log_line(self, line: str) -> LogItem:
        """Parse the log line."""
        # Removing '-' from the end of the line before parsing
        # NOTE(review): this drops the last 4 characters — presumably a
        # trailing ' "-"' suffix in this data set; confirm against the feed.
        line = line[:-4]
        entry = self.parser.parse(line)
        parsed_log_line = LogProcessor.convert_to_logitem(entry)
        logger.debug(f"parsed log line: {parsed_log_line}")
        return parsed_log_line

    @staticmethod
    def convert_to_logitem(entry) -> LogItem:
        """Convert LogParser entry into LogItem."""

        def sub(x: Optional):
            # Missing (None) values are rendered as a literal dash.
            return "-" if x is None else x

        http_method = sub(entry.request_line.split()[0])
        uri = sub(entry.request_line.split()[1])
        response_code = sub(entry.final_status)
        response_size = sub(entry.bytes_sent)
        referer = sub(entry.headers_in["Referer"])
        user_agent = sub(entry.headers_in["User-Agent"])
        # Rebuild the timestamp from the individual %{...}t capture fields.
        dt = entry.request_time_fields
        log_date = datetime(
            year=int(dt["year"]),
            month=int(datetime.strptime(dt["abbrev_mon"], "%b").month),
            day=int(dt["mday"]),
            hour=int(dt["hour"]),
            minute=int(dt["min"]),
            second=int(dt["sec"]),
            tzinfo=dt["timezone"],
        )
        log_line = LogItem(
            ip_address=entry.remote_host,
            log_date=log_date,
            HTTP_method=http_method,
            URI=uri,
            response_code=response_code,
            response_size=response_size,
            referer=referer,
            user_agent=user_agent,
        )
        return log_line
def test_parse_custom_time(fmt, entry, fields):
    """Custom %{...}t formats parse into the expected entry fields."""
    parsed = LogParser(fmt, encoding="utf-8").parse(entry)
    for name, expected in fields.items():
        assert getattr(parsed, name) == expected
def test_parse(fmt, entry, fields):
    """Parsed entry keeps the raw line (sans trailing newline) and format."""
    parsed = LogParser(fmt).parse(entry)
    assert parsed.entry == entry.rstrip("\r\n")
    assert parsed.format == fmt
    for name, expected in fields.items():
        assert getattr(parsed, name) == expected
from sys import getsizeof import requests from apachelogs import LogEntry, LogParser from django.core.management.base import BaseCommand, CommandError from tqdm import tqdm from logs.models import Log # Init a parser with log format string: http://httpd.apache.org/docs/current/mod/mod_log_config.html log_parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%l\"") class Command(BaseCommand): """ Custom management command: python manage.py loadlog <url:str> (-s, --size <int>) """ help = 'Download Apache log file and insert its events to database.\n' \ 'Pass a -s or --size argument with value in Megabytes to set size limit for downloading a file' def add_arguments(self, parser): # positional arguments parser.add_argument( 'log_url', nargs='?', type=str, ) # named (optional) arguments parser.add_argument(
# Count HTTP status codes in an Apache access log.
# Requires: pip install apachelogs
from apachelogs import LogParser

status_code_dict = {}
# Combined log format (see Apache mod_log_config documentation)
parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"")

# File location where the access_log file is present
with open("C://Users//H P//Downloads//access_log") as f:
    for line in f:
        entry = parser.parse(line)
        status_code = entry.final_status
        # dict.get replaces the explicit membership check + two branches
        status_code_dict[status_code] = status_code_dict.get(status_code, 0) + 1

# Iterate items() instead of re-indexing the dict per key.
for status, count in status_code_dict.items():
    print(status, ":", count)
def process_files(self, user_arguments):
    """Parse the selected Apache log files into tabular rows.

    Applies optional status-code, country and date filtering, optionally
    resolves geolocation per remote host, and returns
    ``[log_entries, files, i, stri, field_names, invalid_lines]`` where
    ``stri`` is a tab-separated format string for the included fields.
    """
    prev_host = ""
    log_entries = []
    codes = []
    countries = []
    # Log format as defined in Apache/HTTPD configuration file (LogFormat directive) or manually by user
    if user_arguments.log_format:
        log_format = user_arguments.log_format
    else:
        log_format = self.get_httpd_logformat_directive(user_arguments.httpd_conf_file, user_arguments.httpd_log_nickname)
    parser = LogParser(log_format)
    if user_arguments.codes:
        codes = self.get_input_status_codes(self.populate_status_codes(), user_arguments.codes)
    if user_arguments.countries:
        countries = user_arguments.countries
    # Optional date window, supplied as day-month-year strings.
    date_lower = user_arguments.date_lower
    date_upper = user_arguments.date_upper
    day_format = "%d-%m-%Y"
    if date_lower is not None:
        date_lower = datetime.strptime(date_lower, day_format)
    if date_upper is not None:
        date_upper = datetime.strptime(date_upper, day_format)
    files = self.get_files(user_arguments.files_regex, user_arguments.files_list)
    show_progress = user_arguments.show_progress
    use_geolocation = user_arguments.use_geolocation
    geotool_exec = user_arguments.geotool_exec
    geo_database_location = user_arguments.geo_database_location
    incl_fields = user_arguments.incl_fields
    # Accept a comma-separated string as well as a ready-made list.
    if isinstance(user_arguments.incl_fields, str):
        incl_fields = user_arguments.incl_fields.replace(' ','').split(',')
    fields = self.get_included_fields(
        self.get_out_fields(),
        incl_fields,
        user_arguments.excl_fields
    )
    # Geolocation and the country/city output fields imply each other.
    if use_geolocation:
        fields['country']['included'] = True
        fields['city']['included'] = True
    if fields['country']['included'] or fields['city']['included']:
        use_geolocation = True
    invalid_lines = []
    field_names = []
    i = 0
    country_seen = False
    geo_data = None
    skip_line_by_status = False
    skip_line_by_country = False
    # Total line count across all files, used for the progress percentage.
    lines_total = sum([i['lines'] for i in self.get_file_line_count(files)])
    if show_progress:
        print(
            "File count: {}\nLines in total: {}".format(
                str(len(files)), str(lines_total)
            ))
    for lfile in files:
        if show_progress:
            print("Processing file: {} (lines: {})".format(
                lfile,
                str(self.get_file_line_count([lfile])[0]['lines'])
            ))
        with open(lfile, 'r') as f:
            for line in f:
                if show_progress:
                    print("Processing log entry: {} ({}%)".format(
                        str(i), round(100 * (i/lines_total), 2)
                    ), end = "\r")
                # Carry the previous accepted entry's host/time forward so a
                # per-host time delta can be computed below.
                if i != 0 and not (skip_line_by_status or skip_line_by_country) and entry_data:
                    prev_host = entry_data['remote_host']
                    prev_host_time = entry_data['time']
                try:
                    entry = parser.parse(line)
                except InvalidEntryError:
                    # Record (file, line number) of unparseable lines.
                    invalid_lines.append((lfile, i + 1))
                    continue
                entry_data = {
                    'time': entry.request_time.replace(tzinfo = None),
                    'user_agent': entry.headers_in["User-Agent"],
                    'http_request': str(entry.request_line).encode('unicode_escape').decode(),
                    'remote_host': entry.remote_host,
                    'status': entry.final_status
                }
                if not self.date_checker(date_lower, date_upper, entry_data['time']):
                    i += 1
                    continue
                if len(codes) > 0:
                    skip_line_by_status = self.filter_status_code(codes, entry_data['status'])
                if use_geolocation:
                    # Only invoke the external geo tool when the host changes.
                    if prev_host == entry_data['remote_host']:
                        country_seen = True
                    else:
                        country_seen = False
                    if not country_seen:
                        geo_data = self.geotool_get_data(geotool_exec, geo_database_location, entry_data['remote_host'])
                    if len(countries) > 0 and geo_data is not None:
                        skip_line_by_country = self.filter_country(countries, geo_data['host_country'])
                    else:
                        skip_line_by_country = False
                if skip_line_by_status or skip_line_by_country:
                    i += 1
                    continue
                # Seconds since the previous request from the same host;
                # 'NEW_CONN' marks the first request seen from a host.
                time_diff = str('NEW_CONN')
                if prev_host == entry_data['remote_host']:
                    time_diff = (entry_data['time'] - prev_host_time).total_seconds()
                if isinstance(time_diff, float):
                    time_diff = int(time_diff)
                    if time_diff > 0:
                        time_diff = "+" + str(time_diff)
                if i == 0:
                    time_diff = int(0)
                # Copy the selected values into the output field table.
                if fields['log_file_name']['included']:
                    fields['log_file_name']['data'] = lfile
                if fields['http_status']['included']:
                    fields['http_status']['data'] = entry_data['status']
                if fields['remote_host']['included']:
                    fields['remote_host']['data'] = entry_data['remote_host']
                if geo_data is not None:
                    if fields['country']['included']:
                        fields['country']['data'] = geo_data['host_country']
                    if fields['city']['included']:
                        fields['city']['data'] = geo_data['host_city']
                if fields['time']['included']:
                    fields['time']['data'] = entry_data['time']
                if fields['time_diff']['included']:
                    fields['time_diff']['data'] = time_diff
                if fields['user_agent']['included']:
                    fields['user_agent']['data'] = entry_data['user_agent']
                if fields['http_request']['included']:
                    fields['http_request']['data'] = entry_data['http_request']
                # Assemble the tab-separated format string and the row values.
                stri = ""
                printargs = []
                for key, value in fields.items():
                    if not use_geolocation and (key == 'country' or key == 'city'):
                        continue
                    if value['included']:
                        stri += "\t" + value['format']
                        printargs.append(value['data'])
                        # 'i' inside this genexp is scoped to the genexp and
                        # does not clobber the outer line counter.
                        if not any(key in i for i in field_names):
                            field_names.append((key, value['human_name']))
                log_entries.append(printargs)
                i += 1
    return [log_entries, files, i, stri, field_names, invalid_lines]
def __init__(self):
    # Combined log format extended with a custom "%{Data}i" header.
    fmt = (
        '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" '
        '\"%{User-Agent}i\" \"%{Data}i\"'
    )
    self.parser = LogParser(fmt)
def test_parse_ip_address(encoding):
    """%a yields the remote address regardless of the parser encoding."""
    parsed = LogParser('%a', encoding=encoding).parse('127.0.0.1')
    assert parsed.remote_address == "127.0.0.1"
def test_parse_bad_utf8():
    """Strict UTF-8 decoding rejects the invalid byte in ENTRY."""
    with pytest.raises(UnicodeDecodeError):
        LogParser(COMBINED, encoding="utf-8").parse(ENTRY)
def test_unknown_directive(fmt):
    """Unrecognised format directives raise UnknownDirectiveError."""
    with pytest.raises(UnknownDirectiveError) as excinfo:
        LogParser(fmt)
    err = excinfo.value
    assert str(err) == 'Unknown log format directive: {!r}'.format(fmt)
    assert err.directive == fmt
def test_bytes_parse():
    """encoding="bytes" leaves string-valued fields as raw bytes."""
    entry = LogParser(COMBINED, encoding="bytes").parse(ENTRY)
    for field, expected in NON_STR_FIELDS.items():
        assert getattr(entry, field) == expected
    assert entry.request_line == entry.directives["%r"] == b"Gh0st\xAD"
    assert entry.remote_host == entry.directives["%h"] == b"66.240.205.34"
def test_parse_ip_address(encoding):
    """%a captures the client IP under any encoding."""
    entry = LogParser("%a", encoding=encoding).parse("127.0.0.1")
    assert entry.remote_address == "127.0.0.1"