def parse_log_access_entries(file):
    """Parse an Apache-format access log and return a list of request dicts.

    Each dict contains:
      - "path":   request URI with the base URL replaced by "/"
      - "method": HTTP method of the request
      - "delay":  seconds elapsed since the first entry in the log, so the
                  requests can be replayed with their original timing
      - "entry":  the raw log line

    Returns an empty list when the file contains no parseable entries.
    """
    # Format reference:
    # https://httpd.apache.org/docs/1.3/mod/mod_log_config.html#formats
    parser = LogParser(
        '%h %l %u %t "%m %U %H" %>s %b "%{Referer}i" "%{User-Agent}i"')

    entries = list(parser.parse_lines(file))
    # parse_lines yields a generator; list() never returns None, so a simple
    # truthiness check covers the empty-log case.
    start_time = entries[0].request_time if entries else None

    return [
        {
            "path": URL_BASE_REGEX.sub("/", entry.request_uri),
            "method": entry.request_method,
            "delay": (entry.request_time - start_time).total_seconds(),
            "entry": entry.entry,
        }
        for entry in entries
    ]
Example no. 2
0
def test_parse_general(end):
    """Parse one COMBINED-format entry and verify every exposed attribute."""
    ENTRY = '209.126.136.4 - - [01/Nov/2017:07:28:29 +0000] "GET / HTTP/1.1" 301 521 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"'
    UA = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
    )
    ts = datetime(2017, 11, 1, 7, 28, 29, tzinfo=timezone.utc)

    parser = LogParser(COMBINED, encoding="utf-8")
    assert parser.format == COMBINED

    entry = parser.parse(ENTRY + end)
    assert entry.remote_host == "209.126.136.4"
    assert entry.remote_logname is None
    assert entry.remote_user is None
    assert entry.request_time == ts
    assert entry.request_line == "GET / HTTP/1.1"
    assert entry.final_status == 301
    assert entry.bytes_sent == 521
    assert entry.headers_in == {"Referer": None, "User-Agent": UA}
    # Header lookup is case-insensitive.
    assert entry.headers_in["User-Agent"] == UA
    assert entry.headers_in["USER-AGENT"] == UA
    assert entry.headers_in["user-agent"] == UA
    assert entry.entry == ENTRY
    assert entry.format == COMBINED
    assert entry.request_time_fields == {"timestamp": ts}
    assert entry.directives == {
        "%h": "209.126.136.4",
        "%l": None,
        "%u": None,
        "%t": ts,
        "%r": "GET / HTTP/1.1",
        "%>s": 301,
        "%b": 521,
        "%{Referer}i": None,
        "%{User-Agent}i": UA,
    }
def test_malformed_time_directive(fmt):
    """A bad strftime spec inside %{...}t raises InvalidDirectiveError at index 0."""
    with pytest.raises(InvalidDirectiveError) as excinfo:
        LogParser("%{" + fmt + "}t")
    err = excinfo.value
    assert str(err) == f"Invalid log format directive at index 0 of {fmt!r}"
    assert err.pos == 0
    assert err.format == fmt
Example no. 4
0
def test_parse_latin1():
    """Default (latin-1) decoding maps the raw 0xAD byte straight to U+00AD."""
    entry = LogParser(COMBINED).parse(ENTRY)
    for field, expected in NON_STR_FIELDS.items():
        assert getattr(entry, field) == expected
    assert entry.request_line == "Gh0st\xAD"
    assert entry.directives["%r"] == "Gh0st\xAD"
    assert entry.remote_host == "66.240.205.34"
    assert entry.directives["%h"] == "66.240.205.34"
Example no. 5
0
def test_malformed_time_directive(fmt):
    """An invalid strftime spec inside %{...}t fails with position and format info."""
    with pytest.raises(InvalidDirectiveError) as excinfo:
        LogParser('%{' + fmt + '}t')
    assert str(excinfo.value) == (
        "Invalid log format directive at index 0 of {!r}".format(fmt)
    )
    assert excinfo.value.pos == 0
    assert excinfo.value.format == fmt
Example no. 6
0
def test_parse_utf8_surrogateescape():
    """With errors='surrogateescape', the undecodable 0xAD byte becomes U+DCAD."""
    parser = LogParser(COMBINED, encoding='utf-8', errors='surrogateescape')
    entry = parser.parse(ENTRY)
    for field, expected in NON_STR_FIELDS.items():
        assert getattr(entry, field) == expected
    assert entry.request_line == "Gh0st\uDCAD"
    assert entry.directives["%r"] == "Gh0st\uDCAD"
    assert entry.remote_host == "66.240.205.34"
    assert entry.directives["%h"] == "66.240.205.34"
Example no. 7
0
def test_parse_custom_german_time(fmt, entry, fields):
    """German month names parse when LC_ALL is de_DE.utf8; skip if unavailable.

    The original locale is always restored, even when the test skips or fails.
    """
    saved_locale = locale.setlocale(locale.LC_ALL)
    try:
        try:
            locale.setlocale(locale.LC_ALL, "de_DE.utf8")
        except locale.Error:
            pytest.skip("Locale not supported")
        parsed = LogParser(fmt).parse(entry)
        for attr_name, expected in fields.items():
            assert getattr(parsed, attr_name) == expected
    finally:
        locale.setlocale(locale.LC_ALL, saved_locale)
Example no. 8
0
class ApacheLogParser:
    """
    Parses Apache access-log data and saves it to the database.
    """
    def __init__(self):
        # Combined log format with an extra quoted %{Data}i request header.
        self.parser = LogParser(
            '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{Data}i\"'
        )

    def parse_all(self, file_path: str):
        """Parse every line of the log file at *file_path* and bulk-insert Log rows."""
        t1 = time.time()
        logger.info('Starting saving file to db')
        with open(file_path, 'r') as f:
            next(f)  # the first line is empty
            bulk_mng = BulkCreateManager()
            # ignore_invalid=True silently drops lines that do not match the format.
            for entry in self.parser.parse_lines(f, ignore_invalid=True):
                ip_address = entry.remote_host
                timestamp = entry.request_time
                response_status_code = entry.final_status
                # NOTE(review): assumes the request line is exactly
                # "<method> <path> <protocol>" — other shapes raise ValueError.
                http_method, request_path, http_protocol = entry.request_line.split(
                )
                content_length = entry.bytes_sent
                user_agent = entry.headers_in['User-Agent']
                referer = entry.headers_in['Referer']
                bulk_mng.add(
                    Log(ip_address=ip_address,
                        timestamp=timestamp,
                        response_status_code=response_status_code,
                        http_method=http_method,
                        request_path=request_path,
                        http_protocol=http_protocol,
                        content_length=content_length,
                        user_agent=user_agent,
                        referer=referer))
            bulk_mng.done()  # make sure all the data reached the database
            logger.info(f'It took {time.time() - t1} seconds')
Example no. 9
0
 def __init__(self):
     """Create the combined-format Apache log parser with an explicit %t layout."""
     # Pattern for parsing Apache log entries
     self.parser = LogParser(
         '%h %l %u %{[%d/%b/%Y:%H:%M:%S %z]}t "%r" %>s %b "%{Referer}i" "%{User-agent}i"'
     )
Example no. 10
0
class LogProcessor:
    """Downloads Apache access logs, parses them, and bulk-loads them into the DB."""

    def __init__(self):
        # Pattern for parsing Apache log entries
        self.parser = LogParser(
            '%h %l %u %{[%d/%b/%Y:%H:%M:%S %z]}t "%r" %>s %b "%{Referer}i" "%{User-agent}i"'
        )

    def populate_logs(self, link: str = None):
        """Download Apache logs using the supplied link, then parse them and load into the database.

        Returns None on a missing link or a non-200 response.
        """
        if not link:
            return None

        response = requests.get(link)
        logger.info(f"Status code is {response.status_code}")
        if response.status_code != 200:
            return None

        lines = response.text.splitlines()

        with tqdm(total=len(lines), leave=False) as progress_bar:
            progress_bar.set_description("Processing Apache log entries")

            batch_size = 64
            objs = []

            # Parse log lines and insert batches to the database.
            for line in lines:
                if line != "\n" and line != "":
                    log_item = self.parse_log_line(line)
                    objs.append(ApacheLogs(**log_item._asdict()))

                    if len(objs) == batch_size:
                        ApacheLogs.objects.bulk_create(objs, batch_size)
                        progress_bar.update(n=batch_size)
                        objs.clear()

            # Flush the final partial batch. bulk_create requires batch_size
            # to be None or positive, so skip the call entirely when the line
            # count was an exact multiple of batch_size (objs is empty).
            if objs:
                ApacheLogs.objects.bulk_create(objs, len(objs))
                progress_bar.update(n=len(objs))

    def parse_log_line(self, line: str) -> LogItem:
        """Parse the log line into a LogItem."""
        # Removing '-' (and its surrounding quotes/space) from the end of the
        # line before parsing, since the format string does not cover it.
        line = line[:-4]

        entry = self.parser.parse(line)
        parsed_log_line = LogProcessor.convert_to_logitem(entry)
        logger.debug(f"parsed log line: {parsed_log_line}")

        return parsed_log_line

    @staticmethod
    def convert_to_logitem(entry) -> LogItem:
        """Convert LogParser entry into LogItem, substituting "-" for missing values."""
        def sub(x):
            # apachelogs reports absent fields as None; logs denote them as "-".
            return "-" if x is None else x

        # Split the request line ("<method> <uri> <protocol>") exactly once.
        request_parts = entry.request_line.split()
        http_method = sub(request_parts[0])
        uri = sub(request_parts[1])
        response_code = sub(entry.final_status)
        response_size = sub(entry.bytes_sent)
        referer = sub(entry.headers_in["Referer"])
        user_agent = sub(entry.headers_in["User-Agent"])

        # request_time_fields holds the individual %{...}t components captured
        # by the custom time format in __init__.
        dt = entry.request_time_fields
        log_date = datetime(
            year=int(dt["year"]),
            month=int(datetime.strptime(dt["abbrev_mon"], "%b").month),
            day=int(dt["mday"]),
            hour=int(dt["hour"]),
            minute=int(dt["min"]),
            second=int(dt["sec"]),
            tzinfo=dt["timezone"],
        )

        log_line = LogItem(
            ip_address=entry.remote_host,
            log_date=log_date,
            HTTP_method=http_method,
            URI=uri,
            response_code=response_code,
            response_size=response_size,
            referer=referer,
            user_agent=user_agent,
        )
        return log_line
Example no. 11
0
def test_parse_custom_time(fmt, entry, fields):
    """Entries using a custom time format parse into the expected field values."""
    parsed = LogParser(fmt, encoding="utf-8").parse(entry)
    for attr_name, expected in fields.items():
        assert getattr(parsed, attr_name) == expected
Example no. 12
0
def test_parse(fmt, entry, fields):
    """Parsing keeps the stripped entry text, the format, and all field values."""
    parsed = LogParser(fmt).parse(entry)
    assert parsed.entry == entry.rstrip("\r\n")
    assert parsed.format == fmt
    for attr_name, expected in fields.items():
        assert getattr(parsed, attr_name) == expected
Example no. 13
0
from sys import getsizeof

import requests
from apachelogs import LogEntry, LogParser
from django.core.management.base import BaseCommand, CommandError
from tqdm import tqdm

from logs.models import Log

#  Init a parser with log format string: http://httpd.apache.org/docs/current/mod/mod_log_config.html
#  (Apache "combined" format plus a trailing quoted %l directive.)
log_parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%l\"")


class Command(BaseCommand):
    """
    Custom management command:
    python manage.py loadlog <url:str> (-s, --size <int>)
    """
    help = 'Download Apache log file and insert its events to database.\n' \
           'Pass a -s or --size argument with value in Megabytes to set size limit for downloading a file'

    def add_arguments(self, parser):
        # positional arguments
        parser.add_argument(
            'log_url',
            nargs='?',
            type=str,
        )

        # named (optional) arguments
        parser.add_argument(
Example no. 14
0
from apachelogs import LogParser  # First install: pip install apachelogs

# Tally of how many log entries were seen per HTTP status code.
status_code_dict = {}

# Apache "combined" log format.
parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"")


# File location where the access_log file is present.
with open("C://Users//H P//Downloads//access_log") as f:
    for line in f:
        entry = parser.parse(line)
        status_code = entry.final_status
        # dict.get replaces the explicit membership test with one lookup.
        status_code_dict[status_code] = status_code_dict.get(status_code, 0) + 1

for status_code, count in status_code_dict.items():
    print(status_code, ":", count)
Example no. 15
0
  def process_files(self, user_arguments):
    """Parse all matched log files, applying status/country/date filters.

    Returns a list of six items:
    [log_entries, files, line_count, format_string, field_names, invalid_lines]
    where log_entries holds one row (list of field values) per accepted entry.
    """

    prev_host    = ""
    log_entries  = []
    codes        = []
    countries    = []

    # Log format as defined in Apache/HTTPD configuration file (LogFormat directive) or manually by user
    if user_arguments.log_format:
      log_format = user_arguments.log_format
    else:
      log_format = self.get_httpd_logformat_directive(user_arguments.httpd_conf_file, user_arguments.httpd_log_nickname)

    parser = LogParser(log_format)

    if user_arguments.codes:
      codes = self.get_input_status_codes(self.populate_status_codes(), user_arguments.codes)

    if user_arguments.countries:
      countries = user_arguments.countries

    # Optional inclusive date-range filter, given as DD-MM-YYYY strings.
    date_lower = user_arguments.date_lower
    date_upper = user_arguments.date_upper
    day_format = "%d-%m-%Y"

    if date_lower is not None:
      date_lower = datetime.strptime(date_lower, day_format)
    if date_upper is not None:
      date_upper = datetime.strptime(date_upper, day_format)

    files = self.get_files(user_arguments.files_regex, user_arguments.files_list)

    show_progress   = user_arguments.show_progress
    use_geolocation = user_arguments.use_geolocation

    geotool_exec          = user_arguments.geotool_exec
    geo_database_location = user_arguments.geo_database_location

    # incl_fields may arrive as a comma-separated string; normalize to a list.
    incl_fields = user_arguments.incl_fields
    if isinstance(user_arguments.incl_fields, str):
      incl_fields = user_arguments.incl_fields.replace(' ','').split(',')

    fields = self.get_included_fields(
      self.get_out_fields(),
      incl_fields,
      user_arguments.excl_fields
    )

    # Geolocation and country/city output fields imply each other.
    if use_geolocation:
      fields['country']['included'] = True
      fields['city']['included']    = True

    if fields['country']['included'] or fields['city']['included']:
      use_geolocation = True

    invalid_lines        = []
    field_names          = []
    i                    = 0
    country_seen         = False
    geo_data             = None
    skip_line_by_status  = False
    skip_line_by_country = False

    # NOTE: the comprehension variable shadows the entry counter `i`;
    # the counter is (re)bound to 0 on the next line, so this is harmless.
    lines_total = sum([i['lines'] for i in self.get_file_line_count(files)])

    if show_progress:
      print(
        "File count: {}\nLines in total: {}".format(
          str(len(files)),
          str(lines_total)
        ))

    for lfile in files:

      if show_progress:
        print("Processing file: {} (lines: {})".format(
          lfile,
          str(self.get_file_line_count([lfile])[0]['lines'])
        ))

      with open(lfile, 'r') as f:

        for line in f:

          if show_progress:
            print("Processing log entry: {} ({}%)".format(
              str(i),
              round(100 * (i/lines_total), 2)
            ), end = "\r")

          # Remember the previous accepted entry's host and time; used for
          # the per-host time delta and to avoid repeated geolocation
          # lookups. entry_data here still refers to the previous iteration
          # (it may even carry over from the previous file).
          if i != 0 and not (skip_line_by_status or skip_line_by_country) and entry_data:
            prev_host      = entry_data['remote_host']
            prev_host_time = entry_data['time']

          try:
            entry = parser.parse(line)
          except InvalidEntryError:
            # Record the file and 1-based position of unparseable lines.
            invalid_lines.append((lfile, i + 1))
            continue

          entry_data = {
            'time':         entry.request_time.replace(tzinfo = None),
            'user_agent':   entry.headers_in["User-Agent"],
            'http_request': str(entry.request_line).encode('unicode_escape').decode(),
            'remote_host':  entry.remote_host,
            'status':       entry.final_status
          }

          if not self.date_checker(date_lower, date_upper, entry_data['time']):
            i += 1
            continue

          if len(codes) > 0:
             skip_line_by_status = self.filter_status_code(codes, entry_data['status'])

          if use_geolocation:
            # Re-use the last lookup while consecutive entries share a host.
            if prev_host == entry_data['remote_host']:
                country_seen = True
            else:
              country_seen = False

            if not country_seen:
              geo_data = self.geotool_get_data(geotool_exec, geo_database_location, entry_data['remote_host'])

            if len(countries) > 0 and geo_data is not None:
              skip_line_by_country = self.filter_country(countries, geo_data['host_country'])

          else:
            skip_line_by_country = False

          if skip_line_by_status or skip_line_by_country:
            i += 1
            continue

          # Seconds since the previous entry from the same host; "NEW_CONN"
          # marks a host change, and the first entry overall gets 0.
          time_diff = str('NEW_CONN')
          if prev_host == entry_data['remote_host']:
            time_diff = (entry_data['time'] - prev_host_time).total_seconds()
            if isinstance(time_diff, float):
              time_diff = int(time_diff)
            if time_diff > 0:
              time_diff = "+" + str(time_diff)
          if i == 0:
            time_diff = int(0)

          # Populate the 'data' slot of every included output field.
          if fields['log_file_name']['included']:
            fields['log_file_name']['data'] = lfile
          if fields['http_status']['included']:
            fields['http_status']['data'] = entry_data['status']
          if fields['remote_host']['included']:
            fields['remote_host']['data'] = entry_data['remote_host']

          if geo_data is not None:
            if fields['country']['included']:
              fields['country']['data'] = geo_data['host_country']
            if fields['city']['included']:
              fields['city']['data'] = geo_data['host_city']

          if fields['time']['included']:
            fields['time']['data'] = entry_data['time']
          if fields['time_diff']['included']:
            fields['time_diff']['data'] = time_diff
          if fields['user_agent']['included']:
            fields['user_agent']['data'] = entry_data['user_agent']
          if fields['http_request']['included']:
            fields['http_request']['data'] = entry_data['http_request']

          # Build the row and (once per field) the printf-style format
          # string and the human-readable column names.
          stri = ""
          printargs = []

          for key, value in fields.items():
            if not use_geolocation and (key == 'country' or key == 'city'):
              continue
            if value['included']:
              stri += "\t" + value['format']
              printargs.append(value['data'])

              if not any(key in i for i in field_names):
                field_names.append((key, value['human_name']))

          log_entries.append(printargs)
          i += 1

    return [log_entries, files, i, stri, field_names, invalid_lines]
Example no. 16
0
 def __init__(self):
     """Create the parser: combined format plus an extra quoted %{Data}i header."""
     self.parser = LogParser(
         '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{Data}i\"'
     )
Example no. 17
0
def test_parse_ip_address(encoding):
    """%a captures the client IP address regardless of the configured encoding."""
    parsed = LogParser('%a', encoding=encoding).parse('127.0.0.1')
    assert parsed.remote_address == "127.0.0.1"
Example no. 18
0
def test_parse_bad_utf8():
    """Strict UTF-8 decoding of a latin-1 byte sequence fails loudly."""
    parser = LogParser(COMBINED, encoding='utf-8')
    with pytest.raises(UnicodeDecodeError):
        parser.parse(ENTRY)
def test_unknown_directive(fmt):
    """An unrecognized %-directive raises UnknownDirectiveError naming the directive."""
    with pytest.raises(UnknownDirectiveError) as excinfo:
        LogParser(fmt)
    err = excinfo.value
    assert str(err) == "Unknown log format directive: {!r}".format(fmt)
    assert err.directive == fmt
Example no. 20
0
def test_bytes_parse():
    """encoding="bytes" leaves the string-valued fields as raw bytes."""
    entry = LogParser(COMBINED, encoding="bytes").parse(ENTRY)
    for field, expected in NON_STR_FIELDS.items():
        assert getattr(entry, field) == expected
    assert entry.request_line == b"Gh0st\xAD"
    assert entry.directives["%r"] == b"Gh0st\xAD"
    assert entry.remote_host == b"66.240.205.34"
    assert entry.directives["%h"] == b"66.240.205.34"
Example no. 21
0
def test_parse_ip_address(encoding):
    """A lone %a format yields the remote address under any encoding."""
    result = LogParser("%a", encoding=encoding).parse("127.0.0.1")
    assert result.remote_address == "127.0.0.1"