Example #1
def parse_apachelog(filename):
    print "Parsing file {}".format(filename)
    headers = r'\"%{X-Forwarded-For}i\" %h \"%{bg_node}e\" %{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t %P \"%r\" %>s %a %b %I %D %f %O \"%{Referer}i\" \"%{User-Agent}i\" %{bg_xact_id}n %{bg_user_id}n %{bg_app_id}n \"%{bg_device_type}n\" %{bg_usage_type}n %{bg_severity}n \"%{bg_email_to}n\" \"%{bg_email_from}n\" \"%{bg_email_cc}n\" \"%{bg_email_subject}n\" \"%{bg_email_date}n\" %{bg_file_name}n %{bg_file_ext}n %{bg_keywords}n %{bg_latitude}n %{bg_longitude}n %{bg_accuracy}n \"%{bg_mime_type}n\" \"%{bg_page_title}n\" \"%{bg_field1}n\" \"%{bg_field2}n\" \"%{bg_field3}n\" \"%{bg_field4}n\" \"%{bg_field5}n\" \"%{bg_field6}n\" \"%{bg_field7}n\" \"%{bg_field8}n\" \"%{bg_field9}n\" \"%{bg_field10}n\"'
    p = apachelog.parser(headers)
    log_list = []
    with open(filename) as f:
        for line in f.readlines():
            line = line.replace('    ', ' - - - ')
            # line = line.replace(' - -', '')
            timestamp = re.search(
                r'([0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}T[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}.[0-9]{1,6}\+[0-9]{1,4})',
                line)
            try:
                new_timestamp = '[' + timestamp.group(0) + ']'
            except AttributeError:
                print(
                    "Line '{0}' doesn't contain a timestamp. Skipping the line"
                    .format(line))
                continue
            line = line.replace(timestamp.group(0), new_timestamp)
            # print ("Parsing line {0}".format(line))
            try:
                data = p.parse(line)
            except apachelog.ApacheLogParserError:
                sys.stderr.write("Unable to parse: %s" % line)
                continue
            data['%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'] = data[
                '%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'][1:11] + ' ' + data[
                    '%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'][12:27]  # + \
            #                                                     ' ' + data['%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'][
            #                                                           27:32]
            log_list.append(data)

    print "Length of currnet parsed list '{}' is: {}".format(
        filename, len(log_list))
    return log_list
def run(inLog, dbFile, website, havijParser, compareToGood, knownGood, cookie, logFormat=""):
    apacheParser = apachelog.parser(logFormat)
    inLogFd = open(inLog)
    lineCounter = 1

    con = sqlite3.connect(dbFile)
    con.row_factory = sqlite3.Row
    con.text_factory = str
    cur = con.cursor()
    cur.execute("select * from sqlite_master")
    dropTable = False
    for row in cur.fetchall():
        if row["name"] == "sqlInjectedReturns":
            dropTable = True
    if dropTable:
        cur.execute("DROP TABLE sqlInjectedReturns")
    cur.execute(
        "CREATE TABLE IF NOT EXISTS sqlInjectedReturns(id INTEGER PRIMARY KEY AUTOINCREMENT,\
                                                               request TEXT,\
                                                               returnVal TEXT)"
    )

    for line in inLogFd:
        try:
            lineData = apacheParser.parse(line)
        except:
            print "Could not parse data on line %s, error: %s" % (line, sys.exc_info())
            lineCounter += 1
            continue
        urlToGet = website + lineData["%r"].split(" ")[1]
        urlHeaders = {}
        if "%{User-Agent}i" in lineData:
            urlHeaders["User-Agent"] = lineData["%{User-Agent}i"]

        if cookie:
            urlHeaders["Cookie"] = cookie

        try:
            urlRequest = urllib2.Request(urlToGet, None, urlHeaders)
            urlGetter = urllib2.urlopen(urlRequest)
            urlData = urlGetter.read()
            cur.execute("INSERT INTO sqlInjectedReturns(request, returnVal) Values (?,?)", [lineData["%r"], urlData])
        except:
            print "Could not get data for url %s" % (urlToGet)
        lineCounter += 1
        if lineCounter % 100 == 0:
            print "Parsed %s lines" % (lineCounter)

    con.commit()
    inLogFd.close()

    if havijParser:
        print "Parsing Havij attack"
        havijParse(cur, con)

    if knownGood:
        print "Comparing to known good"
        compareSqliToGood(cur, con, knownGood)

    con.close()
def readAndParse(filenames):
    p = apachelog.parser(apachelog.formats['extended'])
    for filename in filenames:

        if os.path.splitext(filename)[1] == '.gz':
            fp = gzip.open(filename, 'rb')
        else:
            fp = open(filename)


        for line in fp:
            if not bitstreamRE.search(line):
                # if no bitstream ID, don't go any further
                continue
            try:
                data = p.parse(line)
            except apachelog.ApacheLogParserError:
                print >> sys.stderr, "failed to parse", line,
                continue

            yield dict(requestor=data['%h'],
                       bytes=data['%b'],
                       request=data['%r'],
                       time=data['%t'].strip('[]'),
                       user=data['%u'],
                       result=data['%>s'],
                       useragent=data['%{User-agent}i'],
                       referer=data['%{Referer}i'])
Example #4
    def _parse_file(self, storage_key, file_path, read_from_start=False, read_to_time=None):
        """
        Read recent part of the log file, update statistics storages and adjust seek.
        If only the file parameter is supplied, read the file from self.seek to the end.
        If the file is not found or cannot be read, log an error and return.

        @param str storage_key: a key identifying the statistics storage
        @param str file_path: path to the file to parse
        @param bool read_from_start: if True, read from the beginning of the file, otherwise from `self.seek`
        @param datetime read_to_time: if set, records are parsed until their time is greater than or equal to this value;
        otherwise the file is read to the end.
        """

        with open(file_path, 'r') as f:
            if read_from_start:
                logger.debug('Reading file %s from the beginning to %s'
                             % (file_path, read_to_time))
            else:
                logger.debug('Reading file %s from position %d to %s'
                             % (file_path, self.seek[file_path], read_to_time or 'the end'))

            if not read_from_start:
                f.seek(self.seek[file_path])
                logger.debug('Setting seek for file %s to %d based on a value from the storage'
                             % (f.name, self.seek[file_path]))

            log_parser = apachelog.parser(getattr(settings, 'ELF_FORMAT', ''))

            while True:
                current_seek = f.tell()
                line = f.readline()

                if not line:
                    #Reached end of file, record seek and stop
                    self.seek[file_path] = current_seek
                    logger.debug('Reached end of file %s, set seek in storage to %d' % (f.name, current_seek))
                    break

                record = utils.parse_line(line, log_parser, getattr(settings, 'LATENCY_IN_MILLISECONDS', False))

                if not record:
                    self._count_record(storage_key, 'error')
                    continue

                record_time = record.get_time()
                if record_time is None:
                    logger.error('Could not process time string: ' + record.time)
                    logger.error('Line: ' + record.line)
                    self._count_record(storage_key, 'error')
                    continue

                if read_to_time and record_time >= read_to_time:
                    #Reached a record with timestamp higher than end of current analysis period
                    #Stop here and leave it for the next invocation.
                    self.seek[file_path] = current_seek
                    logger.debug('Reached end of period, set seek for %s in storage to %d' % (f.name, current_seek))
                    break

                status = self._process_record(storage_key, record)
                self._count_record(storage_key, status)
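
The seek handling above boils down to: resume reading at the offset stored for the file and, when stopping, record where to resume next time. A stripped-down sketch of that pattern (an illustration only, not this project's code; `seek_store` is a plain dict standing in for `self.seek`):

def tail_since_last_read(path, seek_store):
    # Resume where the previous call stopped; 0 if the file is new to us.
    with open(path, 'r') as f:
        f.seek(seek_store.get(path, 0))
        while True:
            pos = f.tell()
            line = f.readline()
            if not line:                  # end of file: remember the offset
                seek_store[path] = pos
                break
            yield line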
Example #5
    def test_empty_response_code(self, monkeypatch):
        utils_setup(monkeypatch)

        line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "POST /content/csl/contentupdate/xxx HTTP/1.1" "" 8563 ' \
               u'"-" "Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 53047'
        parser = apachelog.parser(settings.ELF_FORMAT)
        record = parse_line(line, parser)
        assert record is None
Example #6
    def test_empty(self, monkeypatch):
        utils_setup(monkeypatch)

        line = u''
        parser = apachelog.parser(settings.ELF_FORMAT)
        record = parse_line(line, parser)

        assert record is None
Example #7
    def test_wrong_request(self, monkeypatch):
        utils_setup(monkeypatch)

        line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "\x80w\x01\x03\x01" 200 8563 "-" ' \
               u'"Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 53047'
        parser = apachelog.parser(settings.ELF_FORMAT)
        record = parse_line(line, parser)

        assert record is None
Example #8
    def test_with_latency_in_microseconds(self, monkeypatch):
        utils_setup(monkeypatch)

        line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "POST /content/csl/contentupdate/xxx HTTP/1.1" 200 8563 '\
               u'"-" "Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 1253'
        parser = apachelog.parser(settings.ELF_FORMAT)
        record = parse_line(line, parser, False)

        assert record.latency == 1
Example #9
    def __init__(self, collection=None, counter_compliant=False):
        self._parser = apachelog.parser(APACHE_LOG_FORMAT)
        allowed_collections = self._allowed_collections()

        if collection not in allowed_collections:
            raise ValueError('Invalid collection id ({0}), you must select one of these {1}'.format(collection, str(allowed_collections)))

        self.collection = collection
        self.acronym_to_issn_dict = self._acronym_to_issn_dict()
        self.allowed_issns = self._allowed_issns(self.acronym_to_issn_dict)
Example #10
    def analyze(self):
        """
        options include '--totals-only'
        """

        requests_number = 0  # number of requests for a current second
        previous_timestamp = ''  # timestamp from a previous line
        #TODO: add logic to deal with equal timestamps that are not contiguous

        for line in self.logfile:
            try:
                p = apachelog.parser(apachelog.formats[self.format])
                data = p.parse(line)
            except:
                sys.stderr.write("Unable to parse line: %s" % line)
                raise  #TODO: extend options (--ignore-bad-lines)

            # getting the timestamp
            try:
                timestamp = data.get('%t')
            except:
                sys.stderr.write("Unable to get data: %s" % data)
                raise  #TODO: extend options (--ignore-parse-errors)

            if timestamp == previous_timestamp:
                requests_number += 1
            else:
                if (self.options != '--totals-only') and (previous_timestamp !=
                                                          ''):
                    # print summary for the previous second
                    s = "%s = %s" % (previous_timestamp, requests_number)
                    print s

                self.secondsSum += 1
                self.requestsSum += requests_number

                if requests_number > self.maxRequestsPerSec:
                    self.maxRequestsPerSec = requests_number

                if requests_number < self.minRequestsPerSec:
                    self.minRequestsPerSec = requests_number

                if self.minRequestsPerSec == 0:
                    self.minRequestsPerSec = requests_number

                requests_number = 1
                previous_timestamp = timestamp

        self.logfile.close()

        self.avgRequestsPerSec = float(self.requestsSum) / self.secondsSum

        return
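
The loop above tracks requests per second by comparing each record's %t value with the previous one. A compact alternative sketch (not the original code; it assumes already-parsed dicts whose %t values have one-second resolution):

from collections import Counter

def requests_per_second(parsed_lines):
    # parsed_lines: iterable of dicts as returned by apachelog's parse()
    counts = Counter(data.get('%t') for data in parsed_lines)
    return counts  # e.g. max(counts.values()) gives the peak requests per second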
Example #11
    def test_valid(self, monkeypatch):
        utils_setup(monkeypatch)

        line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "POST /data/csl/contentupdate/xxx HTTP/1.1" 200 8563 '\
               u'"-" "Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 53047'
        parser = apachelog.parser(settings.ELF_FORMAT)
        record = parse_line(line, parser)

        assert record.raw_request == '/data/csl/contentupdate/xxx'
        assert record.get_time() == datetime.datetime.strptime('20130808105959', log_record.APACHELOG_DATETIME_FORMAT)
        assert record.response_code == 200
        assert record.latency == 53
        assert record.get_method_id() == 'csl_contentupdate'
Example #12
def apache_2_sql(logfile):
	"Convert apache logfile to mysql table"

	p = apachelog.parser(apachelog.formats['extended'])
	for line in open(logfile):
		try:
			data = p.parse(line)
		except:
			sys.stderr.write("Unable to parse %s" % line)
			continue
		converted_date = apachelog.parse_date(data['%t'])
		converted_request = data['%r'].lower().strip()

		print data
	return
Example #13
def apache_2_sql(logfile):
    "Convert apache logfile to mysql table"

    p = apachelog.parser(apachelog.formats['extended'])
    for line in open(logfile):
        try:
            data = p.parse(line)
        except:
            sys.stderr.write("Unable to parse %s" % line)
            continue
        converted_date = apachelog.parse_date(data['%t'])
        converted_request = data['%r'].lower().strip()

        print data
    return
Example #14
    def __init__(self, log_file_location, log_format, period, alert_period, alert_threshold, max_frequent_sections):
        '''
        Constructor
        '''
        self._log_file_location = log_file_location
        self._parser = apachelog.parser(apachelog.formats[log_format])
        self._period = period
        self._alert_period = alert_period
        self._alert_threshold = alert_threshold
        self._max_frequent_sections = max_frequent_sections

        self._log_file_position = 0
        self._log_cache = []
        self._alert_state = False
        self._alerts = []
Example #15
    def __init__(self, log_file_location, log_format, period, alert_period,
                 alert_threshold, max_frequent_sections):
        '''
        Constructor
        '''
        self._log_file_location = log_file_location
        self._parser = apachelog.parser(apachelog.formats[log_format])
        self._period = period
        self._alert_period = alert_period
        self._alert_threshold = alert_threshold
        self._max_frequent_sections = max_frequent_sections

        self._log_file_position = 0
        self._log_cache = []
        self._alert_state = False
        self._alerts = []
    def __init__(self,
                 collection=None,
                 counter_compliant=False,
                 allowed_collections=_allowed_collections,
                 acronym_to_issn_dict=_acronym_to_issn_dict):
        self._parser = apachelog.parser(APACHE_LOG_FORMAT)
        allowed_collections = allowed_collections()

        if collection not in allowed_collections:
            raise ValueError(
                'Invalid collection id ({0}), you must select one of these {1}'
                .format(collection, str(allowed_collections)))

        self.collection = collection
        self.acronym_to_issn_dict = acronym_to_issn_dict(self.collection)
        self.allowed_issns = self._allowed_issns(self.acronym_to_issn_dict)
Example #17
def create_urls(logfile, outfile, logformat, grep=None):
    parser = apachelog.parser(logformat)

    with open(logfile) as f, open(outfile, 'w') as o:
        writer = csv.writer(o)

        # Status spinner
        spinner = "|/-\\"
        pos = 0

        for i, line in enumerate(f):
            # Spin the spinner
            if i % 10000 == 0:
                sys.stdout.write("\r" + spinner[pos])
                sys.stdout.flush()
                pos += 1
                pos %= len(spinner)

            # If a filter was specified, filter by it
            if grep and not grep in line:
                continue

            try:
                data = parser.parse(line)
            except apachelog.ApacheLogParserError as e:
                print(e)
                continue

            if data[STATUS_CODE] != '200':
                continue

            method, url, protocol = data[REQUEST].split()

            # Check for GET requests with a status of 200
            if method != 'GET':
                continue

            # Exclude media requests and special urls
            if MEDIA_RE.search(url) or SPECIAL_RE.search(url):
                continue

            # This is a good record that we want to write
            writer.writerow([url, data[USER_AGENT]])

        print(' done!')
Example #18
def get_seek(file_path, period_start):
    """
    Given a file path, find a position in it where the records for a tracked period start.
    @param str file_path: path to log file to seek
    @param datetime period_start: timestamp for the beginning of the tracked period
    @return int seek
    """
    f = open(file_path, 'r')

    log_parser = apachelog.parser(getattr(settings, 'ELF_FORMAT', ''))
    size = os.stat(file_path).st_size
    logger.debug('Running get_seek() for file %s' % f.name)
    approximate_seek = _find_approximate_seek_before_period_by_moving_back(f, size, log_parser, period_start)
    logger.debug('approximate seek for %s is set to %d' % (f.name, approximate_seek))
    exact_seek = _find_exact_seek_before_period_by_moving_forward(f, log_parser, approximate_seek, period_start)
    logger.debug('exact seek for %s is set to %d' % (f.name, exact_seek))
    f.close()
    return exact_seek
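
get_seek() delegates the real work to two helpers that are not shown in this example. A rough illustration of the two-phase idea their names suggest (purely an assumption about the approach; record_time is a hypothetical callable that extracts a datetime from a raw log line):

def find_period_start_offset(f, size, record_time, period_start, chunk=64 * 1024):
    # Phase 1: jump back from the end of the file in fixed-size chunks until a
    # record older than period_start is found (the approximate seek).
    pos = size
    while pos > 0:
        pos = max(0, pos - chunk)
        f.seek(pos)
        if pos:
            f.readline()               # discard a likely partial line
        line = f.readline()
        if line and record_time(line) < period_start:
            break
    # Phase 2: scan forward line by line to the first record inside the period
    # (the exact seek) and return its offset.
    f.seek(pos)
    if pos:
        f.readline()
    while True:
        candidate = f.tell()
        line = f.readline()
        if not line or record_time(line) >= period_start:
            return candidate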
Example #19
def create_urls(logfile, outfile, logformat, grep=None):
    parser = apachelog.parser(logformat)

    with open(logfile) as f, open(outfile, 'wb') as o:
        writer = csv.writer(o)

        # Status spinner
        spinner = "|/-\\"
        pos = 0

        for i, line in enumerate(f):
            # Spin the spinner
            if i % 10000 == 0:
                sys.stdout.write("\r" + spinner[pos])
                sys.stdout.flush()
                pos += 1
                pos %= len(spinner)

            # If a filter was specified, filter by it
            if grep and not grep in line:
                continue

            try:
                data = parser.parse(line)
            except apachelog.ApacheLogParserError:
                continue
            try:
                method, url, protocol = data[REQUEST].split()
            except ValueError:
                #print "Line %d: Unable to split" % i
                continue

            # Check for GET requests with a status of 200
            if method != 'GET' or data[STATUS_CODE] != '200':
                continue

            # Exclude media requests and special urls
            if MEDIA_RE.search(url) or SPECIAL_RE.search(url):
                continue

            # This is a good record that we want to write
            writer.writerow([url, data[USER_AGENT]])

        print ' done!'
Example #20
    def __init__(self):
        self.format = r'%{owner}i %{bucket}i %{datetime}t %{ip}h %{requester}i %{requestid}i %{operation}i %{key}i \"%{requesturi}i\" %{status}s %{error}i %{bytes}b %{objectsize}i %{totaltime}i %{turnaround}i \"%{referer}i\" \"%{useragent}i\" %{versionid}i'
        self.parser = apachelog.parser(self.format)

        self.log_mapper = {
            #'%{owner}i':('owner',None),
            '%{datetime}t':('logged_time',lambda dtstr: datetime.strptime(dtstr[:-6].strip('[]'),'%d/%b/%Y:%H:%M:%S ')),
            '%{ip}h':('ip',None),
            '%{requestid}i':('request_id',None),
            '%{operation}i':('operation',None),
            '%{key}i':('key',None),
            '%{requesturi}i':('uri',None),
            '%{requestid}i':('_id',None),  # note: duplicate key; this overrides the 'request_id' mapping above
            '%{status}s':('status',int),
            '%{error}i':('error',None),
            '%{bytes}b':('bytes',int),
            '%{versionid}i':('version_id',None),
            '%{objectsize}i':('object_size',int)
        }
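
The log_mapper above pairs each format directive with a target field name and an optional converter, but this example does not show the mapping being applied. A minimal sketch of how such a mapping could be used (an assumption, not this project's code):

def map_record(parsed, log_mapper):
    # parsed: dict returned by apachelog's parse(); log_mapper: directive -> (field, converter)
    doc = {}
    for directive, (field, convert) in log_mapper.items():
        value = parsed.get(directive)
        doc[field] = convert(value) if (convert and value not in (None, '-')) else value
    return doc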
Example #21
def processFile(path):
    logparser = alog.parser(alog.formats['common'])

    hosts = list()
    dates = list()
    statuses = list()
    respsizes = list()

    with open(path, "r+b") as log:
        map = None
        try:
            map = mmap.mmap(log.fileno(), 0, access=mmap.ACCESS_READ)
            for line in iter(map.readline, ""):
                try:
                    data = logparser.parse(line)
                    hosts.append(data['%h'])
                    cdate = dateutil.parser.parse(data['%t'].strip('[]'),
                                                  fuzzy=True)
                    dates.append(cdate)
                    statuses.append(data['%>s'])
                    respsizes.append(data['%b'])
                except:
                    print "Unable to parse %s" % line

        finally:
            if map:
                map.close()

    index = pd.to_datetime(dates)

    frame = pd.DataFrame(
        {
            'Host': pd.Series(hosts, index=index, dtype=str),
            'Status': pd.Series(statuses, index=index),
            'ResponseSize': pd.Series(respsizes, index=index)
        },
        index=index)

    frame.to_hdf('processed.hdf', 'stat')

    return frame
Example #22
    def _whatFormat(self):  #TODO: currently it doesn't work
        """
        Verifies if a given file complies with any standard access log file format
        and returns the name of this format or the UnknownFormatError exception.
        """

        try:
            line = self.logfile.readline()
        except:
            raise IOError('Cannot read the first line of the file')

        formats = 'extended', 'common', 'vhcommon'
        for f in formats:
            try:
                p = apachelog.parser(apachelog.formats[f])
                data = p.parse(line)
            except:
                continue
            else:
                return f
        raise UnknownFormatError('Unknown log file format')
Example #23
def processFile(path):
    logparser = alog.parser(alog.formats['common'])

    hosts = list()
    dates = list()
    statuses = list()
    respsizes = list()

    with open(path, "r+b") as log:
        map = None
        try:
            map = mmap.mmap(log.fileno(), 0, access=mmap.ACCESS_READ)
            for line in iter(map.readline, ""):
                try:
                    data = logparser.parse(line)
                    hosts.append(data['%h'])
                    cdate = dateutil.parser.parse(
                        data['%t'].strip('[]'),
                        fuzzy=True)
                    dates.append(cdate)
                    statuses.append(data['%>s'])
                    respsizes.append(data['%b'])
                except:
                    print "Unable to parse %s" % line

        finally:
            if map:
                map.close()

    index = pd.to_datetime(dates)

    frame = pd.DataFrame({
        'Host': pd.Series(hosts, index=index, dtype=str),
        'Status': pd.Series(statuses, index=index),
        'ResponseSize': pd.Series(respsizes, index=index)},
        index=index)

    frame.to_hdf('processed.hdf', 'stat')

    return frame
Example #24
def main():
  if rank == 0:
    from optparse import OptionParser
    
    parser = OptionParser(usage = 'Usage: mpiexec -n 4 python apachelogstats.py --(addr|time) /var/log/apache2/access.log.1')
    
    parser.add_option("-t", "--time",
    action="store_const", dest="mode", const = '%t',
    help="group by time")
    parser.add_option("-a", "--addr",
    action="store_const", dest="mode", const = '%h',
    help="group by address")
    
    (options, args) = parser.parse_args()
    
    # Format copied and pasted from Apache conf - use raw string + single quotes
    format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'

    p = apachelog.parser(format)
    data = []

    for i,line in enumerate(open(args[0])):
      try:
        data.append( p.parse( line ) )
      except:
        sys.stderr.write("Unable to parse %s" % line)
      if i > 6: break
  else:
    data = None
    options = None

  options = comm.bcast(options, root=0)
  
  ranked_data = MR_map(data, options.mode)
  
  if rank == 0:
    dd = MR_reduce(ranked_data)
    
    for key, value in dd.iteritems():
      print( '%s %d' % ( str(key), len(value) ) )
Example #25
"""
log_format tjmain '[$time_local] $status $host '
                      '$upstream_addr $upstream_response_time $request_time '
                      '$remote_addr $remote_user $request '
                      '$bytes_sent "$http_referer" '
                      '"$http_user_agent" "$http_x_forwarded_for"'
[17/Jan/2012:17:00:55 +0800] 304 10.2.76.28 - - 0.000 10.2.76.25 - GET /highcharts/js/modules/exporting.js HTTP/1.1 158 "http://10.2.76.28/highcharts/examples/bar-basic.htm" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7" "-"sendfileon

format = r'%t %>s %h %uh %urt %addr %ru \"%r\" %b \"%{Referer}i\" \"%{User-Agent}i\" \"{Forward}\"'

#default
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"{Forward}\"'
"""

format = r'%t %>s %h %ua %ub %uc %v %vu %m %mu %mv %b \"%{Referer}i\" \"%{User-Agent}i\" \"{Forward}\"'
parser = apachelog.parser(format)

def run(f):
    count_line, count_failed = 0, 0
    for line in f:
        if not line:
            continue
        try:
            count_line += 1
            data = parser.parse(line)
            data = [v for k, v in data.items()]
        except apachelog.ApacheLogParserError:
            #logging.exception('pipeline parse')
            count_failed += 1
            continue
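
The docstring above maps a custom nginx log_format onto apachelog-style directives. As a reminder of the general pattern (a minimal standalone sketch, not part of this example), the parser is built from a raw format string and the parsed result is keyed by the directives themselves:

import apachelog

fmt = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
p = apachelog.parser(fmt)

sample = '127.0.0.1 - - [17/Jan/2012:17:00:55 +0800] "GET /index.html HTTP/1.1" 200 158 "-" "curl/7.29"'
try:
    data = p.parse(sample)
    print data['%h'], data['%>s']    # keys are the format directives
except apachelog.ApacheLogParserError:
    pass                             # line did not match the format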
Example #26
    def __init__(self, sink):
        super(CommonLogFormatParser, self).__init__(sink)
        self.apachelog_parser = apachelog.parser(self.format)
Example #27
        entry = {
            "_id" : ip,
            "ban_time" : datetime.now(),
            "printed" : "no" # "printed" flag is cleared
        }
        banned_ip_collection.update(spec={"_id" : ip}, document=entry, upsert=True)

def is_ip_banned(ip):
    if not banned_ip_collection:
        return False
    return banned_ip_collection.find_one({"_id" : ip}) is not None

window = None # list of slots, shifting every SLOT_INTERVAL milliseconds
millis = None # last window shift, milliseconds from epoch

logline_parser = parser(ACCESS_LOG_RECORD_FORMAT) # see apachelog docs for details 

def logrecord(logline):
    """Parse log line
       Returns values as tuple: (date, time, ip, url, agent, referrer,)
    """

    (date, time, ip, url, agent, referrer, code) = (None, None, None, None, None, None, None)

    try:
        if logline.count('"') == 8: # remove the mysterious last "-"
            logline = logline[:len(logline) - 3]
        parsed = logline_parser.parse(logline)

        code = parsed["%>s"]
        ip = parsed["%h"]
Example #28
"""

import sys
import os
import apachelog
import optparse
import urlparse
import cgi
import logging
import pdb

from datetime import datetime, time, timedelta

# Format copied and pasted from Apache conf - use raw string + single quotes
apache_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
apache_parser = apachelog.parser(apache_format)

logging.basicConfig(level=logging.INFO)

def main():
    usage = """ %prog WAV_FILENAME LOG_FILENAME [options] """
    opt_parser = optparse.OptionParser(usage=usage)
    
    opt_parser.add_option("-d", "--delays", dest="delays_filename", help="An optional argument to"
                          " specify a TSV file which contains the delays for each stage.")
    opt_parser.add_option("-c", "--classes", dest="classes_filename", help="An optional argument to specify a TSV file "
                          "which contains classes for each stage.")
    opt_parser.add_option("-s", "--svm", action='store_true', default=False, dest="svm_output", help="Flag to also generate "
                            "an input file suitable for input into Max's pitch, format and energy analyzer. Requires a classes file.")
    opt_parser.add_option("-r", "--ratings", action='store_true', default=False,
                            dest="use_url_ratings", help="Parses the 'rating' URL parameter"
Example #29
        }
        banned_ip_collection.update(spec={"_id": ip},
                                    document=entry,
                                    upsert=True)


def is_ip_banned(ip):
    if not banned_ip_collection:
        return False
    return banned_ip_collection.find_one({"_id": ip}) is not None


window = None  # list of slots, shifting every SLOT_INTERVAL milliseconds
millis = None  # last window shift, milliseconds from epoch

logline_parser = parser(
    ACCESS_LOG_RECORD_FORMAT)  # see apachelog docs for details


def logrecord(logline):
    """Parse log line
       Returns values as tuple: (date, time, ip, url, agent, referrer,)
    """

    (date, time, ip, url, agent, referrer, code) = (None, None, None, None,
                                                    None, None, None)

    try:
        if logline.count('"') == 8:  # remove the mysterious last "-"
            logline = logline[:len(logline) - 3]
        parsed = logline_parser.parse(logline)
Example #30
#!/usr/bin/python

import sys, os, apachelog, bandwidth, MySQLdb

p = apachelog.parser(
    r"%v %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"")
bwsum = bandwidth.Bandwidth()

for file in sys.argv[1:]:
    print "Processing %s file" % file
    for line in open(file):
        try:
            data = p.parse(line.strip())

            if (data['%O'] == '-'):
                continue

            bwsum.add(data['%v'], data['%O'], data['%t'])
        except:
            sys.stderr.write("Unable to parse %s" % line)

connection = MySQLdb.connect(host='localhost',
                             user='******',
                             passwd='',
                             db='store')
bwsum.persist(connection)
Example #31
# limitations under the License.
#
import apachelog
from log_utils import parse_apache_line
import os
import sys

# %b - Size
# %h - Remote IP or Host
# %l - Remote Log Name
# %r - Request
# %>s - HTTP Status Code
# %t - eventTime
# %u - Remote User
# %{Referer}i - Referer
# %{User-agent}i - UserAgent

if len(sys.argv) == 2:

    log_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    parser = apachelog.parser(log_format)

    for line in open(sys.argv[1]):
        p = parse_apache_line(parser, line.strip())
        print(
            "host: {0}, time: {1}, request: {2}, status: {3}, size: {4}, referer: {5}, agent: {6}"
            .format(p['%h'], p['%t'], p['%r'], p['%>s'], p['%b'],
                    p['%{Referer}i'], p['%{User-Agent}i']))
else:
    sys.stderr.write("usage: {0} <path>".format(os.path.basename(sys.argv[0])))
Example #32
import sys
import json
import apachelog
import hashlib



def writeRecordsToFileName (records, fileName):
	fullFileName = fileName + '.json'
	jsonFile = open(fullFileName, 'w')
	json.dump({'docs':records}, jsonFile)
	jsonFile.close()
	print "wrote file " + fullFileName
	
 
if len(sys.argv) > 1:
	parser = apachelog.parser(apachelog.formats['extended'])
	fileNumber = 1

	for fileName in sys.argv[1:]:
		records = []
		
		file = open(fileName)
		for line in file:
			try:
				data = parser.parse(line)
				record = {}
				record['date'] = data['%t'][1:-1]
				record['iptrunk'] = '.'.join(data['%h'].split('.')[0:2])
				hash = hashlib.md5()
				hash.update('smsmc0' + data['%h'])
				record['iphash'] = hash.hexdigest()
Example #33
def run(inLog,
        dbFile,
        website,
        havijParser,
        compareToGood,
        knownGood,
        cookie,
        logFormat=""):
    apacheParser = apachelog.parser(logFormat)
    inLogFd = open(inLog)
    lineCounter = 1

    con = sqlite3.connect(dbFile)
    con.row_factory = sqlite3.Row
    con.text_factory = str
    cur = con.cursor()
    cur.execute("select * from sqlite_master")
    dropTable = False
    for row in cur.fetchall():
        if row['name'] == 'sqlInjectedReturns':
            dropTable = True
    if dropTable:
        cur.execute("DROP TABLE sqlInjectedReturns")
    cur.execute(
        "CREATE TABLE IF NOT EXISTS sqlInjectedReturns(id INTEGER PRIMARY KEY AUTOINCREMENT,\
                                                               request TEXT,\
                                                               returnVal TEXT)"
    )

    for line in inLogFd:
        try:
            lineData = apacheParser.parse(line)
        except:
            print "Could not parse data on line %s, error: %s" % (
                line, sys.exc_info())
            lineCounter += 1
            continue
        urlToGet = website + lineData['%r'].split(" ")[1]
        urlHeaders = {}
        if '%{User-Agent}i' in lineData:
            urlHeaders['User-Agent'] = lineData['%{User-Agent}i']

        if cookie:
            urlHeaders['Cookie'] = cookie

        try:
            urlRequest = urllib2.Request(urlToGet, None, urlHeaders)
            urlGetter = urllib2.urlopen(urlRequest)
            urlData = urlGetter.read()
            cur.execute(
                "INSERT INTO sqlInjectedReturns(request, returnVal) Values (?,?)",
                [lineData['%r'], urlData])
        except:
            print "Could not get data for url %s" % (urlToGet)
        lineCounter += 1
        if lineCounter % 100 == 0:
            print "Parsed %s lines" % (lineCounter)

    con.commit()
    inLogFd.close()

    if havijParser:
        print "Parsing Havij attack"
        havijParse(cur, con)

    if knownGood:
        print "Comparing to known good"
        compareSqliToGood(cur, con, knownGood)

    con.close()
Example #34
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = OptionParser()
    parser.add_option(
        "-v",
        "--vhost",
        dest="vhostoverride",
        help="if logfile doesn't include vhost column, override it with this",
        metavar="VHOST")
    parser.add_option("-d",
                      "--debug",
                      action="store_true",
                      dest="debug",
                      help="Turn on debugging output",
                      default=False)
    (options, args) = parser.parse_args(argv)

    if options.debug:
        print >> sys.stderr, "Debug mode activated"

    OUTPUTDIR = "/opt/pysk/wwwlogs"

    # Get list of vhosts
    db = psycopg2.connect(
        "host='localhost' user='******' password='******' dbname='pysk'"
    )
    cursor = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    query = "SELECT trim(both '.' from vh.name || '.' || d.name) as vhost FROM vps_virtualhost vh, vps_domain d WHERE vh.domain_id = d.id ORDER BY vhost"
    cursor.execute(query)
    rows = cursor.fetchall()

    vhosts = {}
    for row in rows:
        vhosts[row["vhost"]] = {"logfile": None}

    # RDNS
    exitEvent = threading.Event()
    inputQueue = Queue.Queue()
    resolvedIPDict = {}
    outputDict = {}
    workers = [
        ResolveThread(inputQueue, outputDict, exitEvent)
        for i in range(0, 100)
    ]
    for worker in workers:
        worker.start()

    # Log formats
    p_igowo = apachelog.parser(apachelog.formats["igowo"])
    p_vhextendedio = apachelog.parser(apachelog.formats["vhextendedio"])
    p_vhextended = apachelog.parser(apachelog.formats["vhextended"])
    p_extended = apachelog.parser(apachelog.formats["extended"])

    for fname in glob(os.path.join(OUTPUTDIR, "inbox") + "/*"):
        fname = os.path.realpath(fname)
        print "Processing %s ..." % (fname, )
        with open(fname, "rb") as f:
            for line in f:
                # Try to parse line
                try:
                    try:
                        data = p_igowo.parse(line)
                    except apachelog.ApacheLogParserError:
                        try:
                            data = p_vhextendedio.parse(line)
                        except apachelog.ApacheLogParserError:
                            try:
                                data = p_vhextended.parse(line)
                            except apachelog.ApacheLogParserError:
                                if options.vhostoverride:
                                    data = p_extended.parse(line)
                                    data["%v"] = options.vhostoverride
                                else:
                                    raise

                    if cursor != None:
                        vhost = data["%v"]

                        if not vhost in vhosts:
                            continue

                        # Create a new logfile if we don't already have done so
                        if vhosts[vhost]["logfile"] is None:
                            vhosts[vhost]["logfile"] = NamedTemporaryFile(
                                prefix=vhost,
                                dir=os.path.join(OUTPUTDIR, "temp"))
                        logfile = vhosts[vhost]["logfile"]

                        #if "%A" in data:
                        #    local_ip = data["%A"]
                        #else:
                        #    local_ip = ""

                        if "%D" in data:
                            utime = data["%D"]
                            # stored in milliseconds instead of microseconds?
                            if '.' in utime:
                                utime = int(float(utime) * 1000)
                        else:
                            utime = None
                        r_host = data["%h"]

                        # Resolve the host, take measures to not resolve the same IP twice
                        if not r_host in resolvedIPDict:
                            resolvedIPDict[r_host] = True
                            inputQueue.put(r_host)

                        #r_logname = data["%l"]
                        r_user = data["%u"]
                        req_dt = apachelog.parse_date(data["%t"])

                        request = data["%r"]
                        status = int(data["%>s"])

                        if data["%b"] != "-":
                            response_size = int(data["%b"])
                        else:
                            response_size = 0

                        referer = data["%{Referer}i"]
                        user_agent = data["%{User-Agent}i"]

                        if "%I" in data:
                            bytes_recv = int(data["%I"])
                        else:
                            bytes_recv = None

                        if "%O" in data:
                            bytes_sent = int(data["%O"])
                        else:
                            bytes_sent = None

                        # Build logline
                        logline = u'%010d %s - %s [%s +0000] "%s" %s %s "%s" "%s"' % (
                            time.mktime(req_dt.timetuple()), r_host, r_user,
                            req_dt.strftime("%d/%b/%Y:%H:%M:%S"), request,
                            status, response_size, referer, user_agent)
                        # If input/output bytes available, append them
                        if bytes_recv and bytes_sent:
                            logline += " %s %s" % (
                                bytes_recv,
                                bytes_sent,
                            )

                        logfile.write(logline.encode("utf-8") + "\n")
                except UnicodeDecodeError:
                    if options.debug:
                        print >> sys.stderr, "UnicodeDecodeError on line %s" % line
                except apachelog.ApacheLogParserError:
                    if options.debug:
                        print >> sys.stderr, "ApacheLogParserError on line %s" % line
                except:
                    sys.stderr.write("Unable to parse %s" % line)
                    raise
        # Delete the processed logfile from the inbox
        os.unlink(fname)

    # Sort logfiles by date, strip timestamp field
    for (vhname, vh) in vhosts.iteritems():
        if not vh["logfile"] is None:
            print "Sorting %s ..." % (vh["logfile"].name, )
            # Create output logfile
            (sorted_logfile_handle, sorted_logfile_name) = mkstemp(
                prefix=vhname, dir=os.path.join(OUTPUTDIR, "temp"))
            sorted_logfile = os.fdopen(sorted_logfile_handle, "w+b")

            # Process input -> output
            p1 = Popen(["sort", "-n", vh["logfile"].name], stdout=PIPE)
            p2 = Popen(["cut", "-d ", "-f2-"],
                       stdin=p1.stdout,
                       stdout=sorted_logfile)
            p1.wait()
            p2.wait()

            # Close input (deletes the file)
            vh["logfile"].close()

            # Close output and atomically move it into the "pending" directory for further processing
            sorted_logfile.close()
            pending_dir = os.path.join(OUTPUTDIR, "pending", vhname)
            if not os.path.exists(pending_dir):
                os.makedirs(pending_dir)
            timestamp = int(time.mktime(datetime.now().timetuple()))
            os.rename(sorted_logfile_name,
                      os.path.join(pending_dir,
                                   str(timestamp) + ".log"))

    # Wait until all rdns workers are finished
    exitEvent.set()
    for worker in workers:
        worker.join()

    # Generate DNS cache
    if len(outputDict) > 0:
        with open(os.path.join(OUTPUTDIR, "dnscache.txt"), "w+b") as f:
            for (ip, rdns) in outputDict.iteritems():
                f.write("%s %s\n" % (ip, rdns))

    # Delete old config files
    for f in glob("/etc/awstats/awstats.*.conf"):
        os.unlink(f)

    # Generate new config files
    for (vhname, vh) in vhosts.iteritems():
        conffile = "/etc/awstats/awstats.%s.conf" % vhname
        with open(conffile + ".new", "w") as f:
            logfilesdir = os.path.join(OUTPUTDIR, "pending", vhname, "*.log")
            f.write('LogFile="/opt/pysk/tools/logfiles/logmerge.py %s |"\n' %
                    logfilesdir)
            f.write('SiteDomain="%s"\n' % vhname)
            f.write('HostAliases="www.%s"\n' % vhname)
            f.write('DirData="/var/lib/awstats/%s/"\n' % vhname)
            f.write('Include "/etc/awstats/awstats.conf.local"\n')
        os.rename(conffile + ".new", conffile)

    # Preprocess pending logfiles before statistics run

    ## Delete empty logfiles
    call([
        "/usr/bin/find",
        os.path.join(OUTPUTDIR, "pending"), "-name", "*.log", "-size", "0",
        "-delete"
    ])

    # Run statistics

    ## Create list of vhosts which have logfiles
    vhosts_with_logs = list(
        set([
            os.path.basename(os.path.dirname(i))
            for i in glob(os.path.join(OUTPUTDIR, "pending", "*", "*.log"))
        ]))

    ## Run awstats for these vhosts
    for v in vhosts_with_logs:
        call([
            "/usr/local/awstats/wwwroot/cgi-bin/awstats.pl",
            "-config=%s" % (v, ), "-showcorrupted"
        ])

    # Finalize processed logfiles
    processed_logfiles = glob(
        os.path.join(OUTPUTDIR, "processed", "*", "*.log"))

    ## Compress with bzip2 -9
    for pl in processed_logfiles:
        call(["bzip2", "-9", pl])

    # Fix permissions of awstats directory
    call("chmod 0750 /var/lib/awstats", shell=True)
    call("chmod 0750 /var/lib/awstats/*", shell=True)
    call("chmod 0660 /var/lib/awstats/*/*", shell=True)
    call("chown pysk:http /var/lib/awstats", shell=True)
    call("chown pysk:http /var/lib/awstats/*", shell=True)
    call("chown pysk:http /var/lib/awstats/*/*", shell=True)
    call("find /var/lib/awstats/ -name \"*.tmp.*\" -delete", shell=True)
Example #35
                            help="log file we're reading requests from")

    parser.add_argument('-n', '--number', dest='count',
                            action='store',
                            default=10,
                            type=int,
                            required=False,
                            help="how many urls do we wish to see, default 10")


    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    parse = apachelog.parser(LOG_FMT)
    res = []

    with open(args.log) as log:
        for l in log:
            try:
                l = parse.parse(l.strip())
                rtime, url = l['%x'], l['%r']
                res.append((url, rtime))
            except Exception, exc:
                print str(exc)

    for r in sorted(res, key=itemgetter(1), reverse=True)[0:args.count]:
        print "{1} - {0}".format(*r)
Example #36
# STEP 5.1. read apache log dir
for pathApacheLog, subFolders, files in os.walk(pathApacheLog):
    for file in files:
        ext = os.path.splitext(pathApacheLog+'/'+file)[1]
        
        if ext == '.log':
            print '[{0}] - pid:{1} - read log-file: {2}'.format(datetime.datetime.now(), readPid, pathApacheLog+'/'+file)
            currlogfile = pathApacheLog+'/'+file
            #print pathApacheLog+'/'+file

            # STEP 5.2. parse log file
            ############
            # Format copied and pasted from Apache conf - use raw string + single quotes
            format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
            p = apachelog.parser(format)

            # Common Log Format (CLF)
            #p = apachelog.parser(apachelog.formats['common'])
            
            # Common Log Format with Virtual Host
            #p = apachelog.parser(apachelog.formats['vhcommon'])
            
            # NCSA extended/combined log format
            #p = apachelog.parser(apachelog.formats['extended'])
            
            for line in open(currlogfile):
                try:
                    data = p.parse(line)
            
                    post_data = {"hash":BADBOT_NODE_HASH, "host":b64encode(data['%h']), "useragent":b64encode(data['%{User-Agent}i']),"time":b64encode(data['%t'])}
	def handle(self, *args, **options):
		if len(args) != 1:
			print "Missing argument."
			return

		min_count = int(args[0])

		p = apachelog.parser(logformat)
		
		spider = { }
		
		for line in sys.stdin:
			# Parse the access log line.
			try:
				data = p.parse(line)
			except:
				continue
			
			# Is it a request to a bill page?
			path = data["%r"].split(" ")[1]
			m = re_bill.match(path)
			if not m: continue
			
			# Who is the referrer?
			ref = data["%{Referer}i"]
			if ref in ("", "-") or "govtrack.us" in ref:
				continue
			
			url = urlparse.urlparse(ref)
			hostname = url.hostname
			qs = urlparse.parse_qs(url.query)
			
			if not hostname: continue
			
			# Filter out known useless domains.
			if hostname in ("t.co", "longurl.org", "ow.ly", "bit.ly", "www.facebook.com", "www.weblinkvalidator.com", "static.ak.facebook.com", "info.com", "altavista.com", "tumblr.com", "www.freerepublic.com", "www.reddit.com"): continue
			if hostname.endswith(".ru"): continue
			
			# For referrals from Google, look at the 'q' argument to see how
			# people are searching for this page.
			if hostname.replace("www.", "").replace("search.", "") in ("google.com", "bing.com", "aol.com", "yahoo.com"):
				# todo, some use q= some use query=
				#print qs.get("q", [""])[0]
				continue
				
			# Filter out other domains if the link has a 'q' argument since it's
			# probably a search engine.
			if "q" in qs or "pid" in qs: continue
			
			# Filter out common paths for message boards.
			if "/threads/" in ref or "/forum/" in ref or "viewtopic.php" in ref: continue
				
			key = (m.groups(), url)
			spider[key] = spider.get(key, 0) + 1
			
		###
		
		first_print = True
		
		spider = spider.items()
		spider.sort(key = lambda kv : kv[1])
		for (bill_info, referral_url), count in spider:
			if count < min_count: continue # filter out referrers that occurred too infrequently
			
			bill_type = BillType.by_slug(bill_info[1])
			bill = Bill.objects.get(congress=bill_info[0], bill_type=bill_type, number=bill_info[2])
			
			lnk, is_new = BillLink.objects.get_or_create(
				bill=bill,
				url=referral_url.geturl(),
				defaults={
					"title": "Title Not Set"
				})
			
			# Additional processing for new entries.
			
			if not is_new: continue
			
			try:
				stream = urllib.urlopen(referral_url.geturl())
				if stream.getcode() != 200: continue
				dom = lxml.etree.parse(stream, lxml.etree.HTMLParser())
			except:
				continue
				
			title = dom.xpath('string(head/title)').strip()
			if title == "": continue
			
			# set the title of the scraped page
			lnk.title = title
			
			# white-list some domains, provided we were able to
			# get a title
			if referral_url.hostname in ("en.wikipedia.org", "www.truthorfiction.com", "www.theatlantic.com", "www.snopes.com", "arstechnica.com"):
				lnk.approved = True
			else:
				if first_print:
					print "Links pending approval:"
					print
					first_print = False
				print referral_url.geturl()
				print title.encode("utf8")
				print unicode(bill).encode("utf8")
				print
			
			lnk.save()
Example #38
"""

import sys
import os
import apachelog
import optparse
import urlparse
import cgi
import logging
import pdb

from datetime import datetime, time, timedelta

# Format copied and pasted from Apache conf - use raw string + single quotes
apache_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
apache_parser = apachelog.parser(apache_format)

logging.basicConfig(level=logging.INFO)


def main():
    usage = """ %prog WAV_FILENAME LOG_FILENAME [options] """
    opt_parser = optparse.OptionParser(usage=usage)

    opt_parser.add_option(
        "-d",
        "--delays",
        dest="delays_filename",
        help="An optional argument to"
        " specify a TSV file which contains the delays for each stage.")
    opt_parser.add_option("-c",
from __future__ import print_function
import sys
import json
import apachelog
import logging
from conf import settings

logger = logging.getLogger(__name__)

parser = apachelog.parser(apachelog.formats['extended'])

mappings = settings.APACHE_FIELD_MAPPINGS


def record_filter(record):
    """Return the record if it matches certain filters, otherwise None."""
    if not record.get("status") == "200":
        return None
    if not record.get("request").startswith("GET"):
        return None
    if record.get("ip_address") in ['127.0.0.1', '::1', '18.7.27.25']:
        return None
    return record


def field_mapper(request, mappings):
    """Map fields from input request dict to new dict based on mappings."""
    new_f = {}
    for k, v in request.items():
        if k in mappings:
            new_f[mappings[k]] = v
# Author: cytec <*****@*****.**>
# URL: http://github.com/cytec/SynoDLNAtrakt/
#
# This file is part of SynoDLNAtrakt.

import apachelog, os, sys, re

accesslog = "logparser.log"

p = apachelog.parser(apachelog.formats['lighttpd'])
time_format = "[%d/%b/%Y:%H:%M:%S +0200]"
regex = ".*(?P<theid>\d{5})\.(?P<ext>\w{3,5})"

testlist = ["mkv","mp4","avi"]

for line in open(accesslog):
	try:
		data = p.parse(line)
		x = re.match(regex, data["%r"])
		
		if x.group("ext") not in testlist:
			continue

		print x.group("theid"), x.group("ext")

	except:
		sys.stderr.write("Unable to parse %s" % line)
		#print "no"
Example #41
from pymongo import MongoClient
import apachelog, sys
from dateutil.parser import parse

# Format copied and pasted from Apache conf - use raw string + single quotes
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
p = apachelog.parser(format)

client = MongoClient('mongodb://localhost:27017/')
db = client.testdb

for line in open('log/access_log'):
    data = p.parse(line)
    if data['%r']:
        accessed_file = data['%r'].split()
        if (len(accessed_file) > 2):
            accessed_file_name = accessed_file[1]
        else:
            accessed_file_name = accessed_file[0]
    else:
        accessed_file_name = ''
    apache_date = data['%t'].replace("[", "").replace("]", "")
    time = parse(apache_date[:11] + " " + apache_date[12:])
    log_entry = {
        "host": data['%h'],
        "time": time,
        "path": accessed_file_name,
        "referer": data["%{Referer}i"],
        "useragent": data['%{User-Agent}i']
    }
    db.logs.insert_one(log_entry)
Example #42
	def handle(self, *args, **options):
		if len(args) != 1:
			print "Missing argument."
			return

		min_count = int(args[0])

		p = apachelog.parser(logformat)
		
		spider = { }
		
		for line in sys.stdin:
			# Parse the access log line.
			try:
				data = p.parse(line)
			except:
				continue
			
			# Is it a request to a bill page?
			path = data["%r"].split(" ")[1]
			m = re_bill.match(path)
			if not m: continue
			
			# Who is the referrer?
			ref = data["%{Referer}i"]
			if ref in ("", "-") or "govtrack.us" in ref:
				continue
			
			url = urlparse.urlparse(ref)
			hostname = url.hostname
			qs = urlparse.parse_qs(url.query)
			
			if not hostname: continue
			
			# Filter out known useless domains.
			if hostname in ("t.co", "longurl.org", "ow.ly", "bit.ly", "www.facebook.com", "www.weblinkvalidator.com", "static.ak.facebook.com", "info.com", "altavista.com", "tumblr.com", "www.freerepublic.com", "www.reddit.com"): continue
			if hostname.endswith(".ru"): continue
			
			# For referrals from Google, look at the 'q' argument to see how
			# people are searching for this page.
			if hostname.replace("www.", "").replace("search.", "") in ("google.com", "bing.com", "aol.com", "yahoo.com"):
				# todo, some use q= some use query=
				#print qs.get("q", [""])[0]
				continue
				
			# Filter out other domains if the link has a 'q' argument since it's
			# probably a search engine.
			if "q" in qs or "pid" in qs: continue
			
			# Filter out common paths for message boards.
			if "/threads/" in ref or "/forum/" in ref or "viewtopic.php" in ref: continue
				
			key = (m.groups(), url)
			spider[key] = spider.get(key, 0) + 1
			
		###
		
		first_print = True
		
		spider = spider.items()
		spider.sort(key = lambda kv : kv[1])
		for (bill_info, referral_url), count in spider:
			if count < min_count: continue # filter out referrers that occurred too infrequently
			
			bill_type = BillType.by_slug(bill_info[1])
			bill = Bill.objects.get(congress=bill_info[0], bill_type=bill_type, number=bill_info[2])
			
			lnk, is_new = BillLink.objects.get_or_create(
				bill=bill,
				url=referral_url.geturl(),
				defaults={
					"title": "Title Not Set"
				})
			
			# Additional processing for new entries.
			
			if not is_new: continue
			
			try:
				stream = urllib.urlopen(referral_url.geturl())
				if stream.getcode() != 200: continue
				dom = lxml.etree.parse(stream, lxml.etree.HTMLParser())
			except:
				continue
				
			title = dom.xpath('string(head/title)').strip()
			if title == "": continue
			
			# set the title of the scraped page
			lnk.title = title
			
			# white-list some domains, provided we were able to
			# get a title
			if referral_url.hostname in ("en.wikipedia.org", "www.truthorfiction.com", "www.theatlantic.com", "www.snopes.com", "arstechnica.com"):
				lnk.approved = True
			else:
				if first_print:
					print "Links pending approval:"
					print
					first_print = False
				print referral_url.geturl()
				print title.encode("utf8")
				print unicode(bill).encode("utf8")
				print
			
			lnk.save()
Example #43
if len(sys.argv) != 2:
    print "Usage: python apache_log_analysis.py <log file>"
    sys.exit(0)
else:
    apache_log = file(sys.argv[1], 'r').readlines()

#http_err=range(400,418,1)

#Example log line from the available log
ex_url_apache = '134.251.87.133 - - [24/Dec/2014:01:00:10 +0800] "GET /IN/common/print.js HTTP/1.1" 200'
nformat_apache = r'%h %l %u %t \"%r\" %>s %b'

#ex_url_ISA='104.224.147.94 Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0) https://cards.rblbank.com:443/IN/ShowImage?linkid= 302'
#nformat_ISA=r'%h \"%{User-Agent}i\" %>s'

p = apachelog.parser(nformat_apache)
log_lst = []

for line in apache_log:
    try:
        data = p.parse(line)
        log_lst.append(data['%t'] + " " + data['%b'] + " " + data['%r'] + " " +
                       data['%>s'])

    except apachelog.ApacheLogParserError:
        sys.stderr.write('Unable to read line at %s' % line)

df = pd.DataFrame(log_lst)
#del df['%h']
#del df['%l']
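
# A minimal sketch (assuming pandas is imported as pd, as above): building the
# DataFrame from the parsed dicts rather than from concatenated strings gives
# named columns such as '%h' or '%b', which is what deletions like those above
# require.
rows = []
for line in apache_log:
    try:
        rows.append(p.parse(line))
    except apachelog.ApacheLogParserError:
        continue
df_fields = pd.DataFrame(rows)  # one column per format directive, e.g. '%h', '%r'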
def main():
    """
    Check files, setup log file object,
    process line,
    assemble output file
    """
    anonymize_order = LogFile(**args)
    source_file = open(anonymize_order.file_input, 'r')
    destination_file = open(anonymize_order.file_output, anonymize_order.write_mode)
    parser = apachelog.parser(anonymize_order.log_format)
    
    for read_line in source_file:
        try:
            log_line = parser.parse(read_line)
            anonymize_order.line_accept()
        except apachelog.ApacheLogParserError:
            anonymize_order.line_reject()
            continue
        #pdb.set_trace()
        hash_keys = ['%h', '%a', '%u', '%l']
        #check if value is a - and skip the hashing because it's not needed            
        while hash_keys:
            hash_key = hash_keys.pop()
            if hash_key in log_line.keys():
                if log_line[hash_key] == '-':
                    pass
                else:
                    log_line[hash_key] = hashomatic(log_line[hash_key], anonymize_order.hash_salt)

        #rebuild the output line in the original log format order:
        #list the format keys and prune escape chars for consistency with the apachelog keys
        log_order = anonymize_order.log_format.split()
        log_order = [format_key.replace('\\', '') for format_key in log_order]
        log_order = [format_key.replace('"', '') for format_key in log_order]
        #re-add the quotes that parsing strips from the quoted fields:
        #request, referer and user agent (%r, %{Referer}i, %{User-Agent}i)
        quote_check_list = ['%r', '%{Referer}i', '%{User-Agent}i']
        while quote_check_list:
            quote_check = quote_check_list.pop()
            if quote_check in log_line.keys():
                if not log_line[quote_check].startswith('"'):
                    log_line[quote_check] = '"' + log_line[quote_check]
                if not log_line[quote_check].endswith('"'):
                    log_line[quote_check] = log_line[quote_check] + '"'

        write_line = []
        for format_key in log_order:
            write_line.append(log_line[format_key])

        write_line = ' '.join(write_line) + '\n'
        destination_file.write(write_line)
    

    source_file.close()
    destination_file.close()

    print 'Lines accepted: ' + str(anonymize_order.lines_accepted)
    print 'Lines rejected: ' + str(anonymize_order.lines_rejected)
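
# A hypothetical sketch of the hashomatic() helper called above (its real
# implementation is not shown in this snippet); the assumption is that it
# returns a deterministic, salted digest of a log field:
import hashlib

def hashomatic(value, salt):
    """Return a salted SHA-256 hex digest of a log field (sketch only)."""
    return hashlib.sha256(salt + value).hexdigest()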
def parseLog(logfile, outfile):
	"""Parses apache logs and performs lookup on where the access is from"""

	#Options
	LOG_ACCESS_DATA = False
	PRINT_PROGRESS = True
	IPINFODB_API_KEY = ''
	apache_log_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'

	p = apachelog.parser(apache_log_format)
	total_lines = str(sum(1 for line in open(logfile)))
	current_line = 1
	parsed_log = {}
	
	for line in open(logfile, 'r'):
		try:
			data = p.parse(line)
			ip = data['%h']
			if ip not in parsed_log: #new entry
				parsed_log[ip] = {}
				parsed_log[ip]['access_count'] = 1

				if LOG_ACCESS_DATA:
					parsed_log[ip].setdefault('accessed_files', []).append((data['%t'], data['%r'])) #convert to date time

				ipinfo_url = 'http://api.ipinfodb.com/v3/ip-city/?key=' + IPINFODB_API_KEY + '&ip=' + ip + '&format=json'

				try:
					req = urllib2.Request(ipinfo_url)
					response = urllib2.urlopen(req)
					ipinfo = json.load(response)
					parsed_log[ip]['country'] = ipinfo['countryName']
					parsed_log[ip]['city'] = ipinfo['cityName']
					parsed_log[ip]['lat'] = ipinfo['latitude']
					parsed_log[ip]['lon'] = ipinfo['longitude']
				except URLError:
					print 'Geo lookup failed for IP %s' % ip

				try:
					parsed_log[ip]['hostname'] = socket.gethostbyaddr(ip)[0]
				except:
					parsed_log[ip]['hostname'] = ''

				if PRINT_PROGRESS:
					print "INSERTED: " + ip + " LINES: " + str(current_line) + "/" + total_lines

			else:
				parsed_log[ip]['access_count'] += 1
				if LOG_ACCESS_DATA:
					parsed_log[ip].setdefault('accessed_files', []).append((data['%t'], data['%r']))

				if PRINT_PROGRESS:
					print "UPDATED: " + ip + " LINES: " + str(current_line) + "/" + total_lines

		except:
			print "Error processing line: %s" % line

		current_line += 1

	#output file in pretty JSON print
	with open(outfile, 'w') as output:
		json.dump(parsed_log, output, sort_keys=True, indent=4, separators=(',', ': '))
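
# A minimal usage sketch (the file names are placeholders, and IPINFODB_API_KEY
# inside parseLog must be filled in before the geo lookups will succeed):
#
#   parseLog('access.log', 'access_locations.json')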
Exemple #46
0
#!/usr/bin/python

import apachelog, sys

urls = {}


f1 = apachelog.parser(apachelog.formats['extended'])

# Format copied and pasted from Apache conf - use raw string + single quotes

format2 = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
#212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
#        200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
#        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"

format3 = r'%h %l %u %t \"%r\" %>s %b'
#10.50.180.180 - - [18/Apr/2012:01:26:10 -0700] "GET /stylesheet/media/bullet1.gif HTTP/1.1" 404 226


#p = apachelog.parser(format)
# Common Log Format (CLF)
#p = apachelog.parser(apachelog.formats['common'])
# Common Log Format with Virtual Host
#p = apachelog.parser(apachelog.formats['vhcommon'])
# NCSA extended/combined log format
f2 = apachelog.parser(format2)
f3 = apachelog.parser(format3)

for line in open('access_log'):
  try:
Exemple #47
0
 def __init__(self, path=None):
     super(ApacheLogfileParser, self).__init__(path)
     log_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
     self.parser = apachelog.parser(log_format)
     self.api = API()
     logging.basicConfig(level=logging.INFO)
Exemple #48
0
    def _parsefile(self, logfile_obj):
        # Create a parser for this file
        parser = apachelog.parser(self.uasp_format)
        filename = logfile_obj.file_path + logfile_obj.file_name
        
        # Attempt to determine the number of lines in the log
        log = open(filename)
        for line in log:
            self.import_stats['line_count'] += 1
        print str(self.import_stats.get('line_count')) + " lines to parse. Beginning at line " + str(self.import_stats.get('import_startline')) + "\n"
        log.close()

        log = open(filename)
        
        previous_line = ""
        for line in log:
            # Update stats
            self.import_stats['line_counter'] += 1
            if self.import_stats.get('line_counter') < self.import_stats.get('import_startline'):
                # Skip through to the specified line number
                previous_line = line
                continue
            
#            # Test for duplicate log entries immediately preceding
#            if line == previous_line:
#                self._errorlog("##### DUPLICATE LINE DETECTED ##### \n" +\
#                    "Line # :" + str(self.import_stats.get('line_counter')) + "\n" +\
#                    "Line   : " + str(line) + "\n")
#                self.import_stats['duplicatecount'] += 1
#            else:
#                # Parse and store the line
#           Removing the Duplicate line detection as I can't rule out the 0.5-1% of duplicates as being invalid yet: CM 9-12-11
            self._parseline(parser, line, logfile_obj)

            # Print progress report every 500 lines.
            if (self.import_stats.get('line_counter') % 500) == 0:
                # Calculate the average rate of import for the whole process
                try: 
                    self.import_stats['import_rate'] = \
                    float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\
                    float((datetime.datetime.utcnow() - self.import_stats.get('import_starttime')).seconds)
                except ZeroDivisionError:
                    self.import_stats['import_rate'] = 1
                # Calculate how long till finished
                try: 
                    efs = int(
                        float(self.import_stats.get('line_count') - self.import_stats.get('line_counter')) /\
                        float(self.import_stats.get('import_rate'))
                    )
                except ZeroDivisionError:
                    efs = 1
                efhr = efs // (60*60)
                efs = efs % (60*60)
                efmin = efs // 60
                efsec = efs % 60
                efstring = str(efhr) + "h " + str(efmin) + "m " + str(efsec) + "s."
                
                # Output the status
                print str(datetime.datetime.utcnow()) + ": " +\
                    str((float(self.import_stats.get('line_counter')) / float(self.import_stats.get('line_count')))*100)[0:5] + "% completed. " +\
                    "Parsed " + str(self.import_stats.get('line_counter')) + " lines. " +\
                    "Duplicates: " + str(self.import_stats.get('duplicatecount')) + ". " +\
                    "Rate: " + str(self.import_stats.get('import_rate'))[0:6] + " lines/sec. " +\
                    "Est. finish in " + efstring
                    
                # Write the error cache to disk
                self._error_log_save()

            # Update duplicate line string for next pass
            previous_line = line
            
        return None
Exemple #49
0
def main():
    import gdw.stats.scripts
    parseZCML(gdw.stats.scripts, 'scripts.zcml')
    db = getUtility(IDatabase, 'postgres')
    session = db.session
    initialize_declarative_mappers(DeclarativeBase, db.metadata)
    initialize_defered_mappers(db.metadata)
    p = apachelog.parser(FORMAT)
    logfilePath, website = getConfig()
    maxDate = getMaxDate(website).max_1
    if maxDate is None:
        maxDate = datetime(1970, 1, 1)
    cpt = 0
    for line in open(logfilePath):
        try:
            data = p.parse(line)
        except:
            continue
        if data is None:
            continue
        date = apachelog.parse_date(data['%t'])
        date = datetime(*time.strptime(date[0], '%Y%m%d%H%M%S')[:6])
        if date <= maxDate:
            continue
        code = data['%>s']
        if int(code) not in VALID_HTTP_CODE:
            continue
        path = re.match('(.*) (.*) (.*)', data['%r']).group(2)
        path = urlparse.urlparse(path)[2]
        # path : '/hebergement/logement/beauraing/hebid'
        if len(path.strip('/').split('/')) != 4:
            continue
        if path.lstrip('/').split('/')[0] != 'hebergement':
            continue
        if path.endswith('/view'):
            continue
        if path.endswith('/gallery'):
            continue
        hebid = path.rstrip('/').split('/')[-1]
        if os.path.splitext(hebid)[1] in SKIP_TYPES:
            continue
        if hebid.lower() in ['robots.txt',
                             '/misc_/ExternalEditor/edit_icon']:
            continue
        if 'manage_' in hebid.lower():
            continue
        if '/p_/' in path.lower():
            continue
        if '/misc_/' in path.lower():
            continue
        agent = data['%{User-Agent}i']
        stop = False
        agent_lower = agent.lower()
        for robot in ROBOTS:
            if robot in agent_lower:
                stop = True
                break
        if stop:
            continue
        host = data['%h']
        if host in SKIP_HOSTS:
            continue
        cpt += 1
        if cpt % 10000 == 0:
            session.flush()
            session.commit()
            print cpt
        heb_pk = getHebPkFromId(hebid)
        logline = LogItem()
        logline.log_date = date
        logline.log_path = path
        logline.log_hebid = hebid
        logline.log_hebpk = heb_pk
        logline.log_host = host
        logline.log_agent = agent
        logline.log_website = website
        session.add(logline)
        maxDate = date
    session.flush()
    session.commit()
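
# For reference, apachelog.parse_date (used above) splits the raw '%t' value
# into a (timestamp, timezone) pair, which is why date[0] is fed to
# time.strptime with the '%Y%m%d%H%M%S' format, e.g.:
#
#   apachelog.parse_date('[25/Feb/2014:12:00:00 +0100]')
#   # -> ('20140225120000', '+0100')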
def parse_apache_logfile(db_conn, logfile, start_date):
    # A list of extensions used to decide which files are counted as hits
    valid_extensions = [
        ".tar.gz", ".dmg", ".exe", ".zip", ".tgz", ".tbz", ".bz2"
    ]
    format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    p = apachelog.parser(format)
    num_lines = 0
    ignored_old_entries = 0
    next_progress = time.time() + 3
    for line in logfile:
        if time.time() > next_progress:
            next_progress = time.time() + 3
            print("line", num_lines)
        num_lines += 1
        try:
            data = p.parse(line)
            code = int(data['%>s'])
        except:
            code = -1
        # Get the code, if it's not 200 (success) we ignore it
        if code == 200:
            # Break the request URL into / separated pieces
            request_data = data['%r'].split()
            if len(request_data) <= 1: continue
            request_url = request_data[1]
            breakdown = request_url.split('/')
            # If the URL starts with /cig/software or /cig/software/github
            if len(breakdown) >= 4 and breakdown[1] == 'cig' and breakdown[
                    2] == 'software':
                # Then the third subpath is the code name (fourth if it's a github redirect)
                if breakdown[3] == 'github':
                    cig_code = breakdown[4]
                else:
                    cig_code = breakdown[3]
                # Assume the file name is the final path element
                url_end = breakdown[-1]
                valid = False
                # Confirm that the file name ends with one of the valid extensions
                for ext in valid_extensions:
                    if url_end.endswith(ext):
                        valid = True
                        break

                # Get the timestamp of the request
                timestamp = data['%t'].split()[0]
                parsed_ts = datetime.datetime.strptime(timestamp,
                                                       "[%d/%b/%Y:%H:%M:%S")
                if parsed_ts < start_date:
                    ignored_old_entries += 1
                elif valid:
                    # Now we have confirmed this hit is a valid download
                    # First ensure the corresponding filename and package are in the DB
                    add_file_package(db_conn, url_end, cig_code)
                    # First we have to convert the IP address to an IP number
                    host_name = data['%h']
                    try:
                        ip_addr = socket.gethostbyname(host_name)
                    except:
                        continue
                    # Get the IP address associated with the host name
                    ip_num = ip_addr_to_ip_num(ip_addr)
                    # Then add the hit in the database
                    add_hit(db_conn, url_end, cig_code, ip_num, parsed_ts)

    return ignored_old_entries
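
# A hypothetical sketch of the ip_addr_to_ip_num() helper used above (its real
# implementation is not part of this snippet); the assumption is that it packs
# a dotted-quad IPv4 address into a single 32-bit integer:
import socket
import struct

def ip_addr_to_ip_num(ip_addr):
    """Convert e.g. '192.0.2.1' into its unsigned 32-bit integer form."""
    return struct.unpack('!I', socket.inet_aton(ip_addr))[0]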
logfiles = os.listdir(LOGS_DIR)

for file in logfiles:
    fileloaded = open(LOGS_DIR+"/"+file, 'r')
    cf = "country_"+file
    if proc_files.find({'_id':cf}).count() == 0:
        pfpost = {} 
        pfpost['proc_date'] = date.isoformat(date.today())
        pfpost['_id'] = cf
        proc_files.insert(pfpost)
        print "processing "+file
        count=0
        for line in fileloaded:
            if "GET /scielo.php?script=" in line:
                count=count+1
                p = apachelog.parser(APACHE_LOG_FORMAT)
                try:
                    data = p.parse(line)
                except apachelog.ApacheLogParserError:
                    sys.stderr.write("Unable to parse %s" % line)
                    continue

                if data['%t'][4:7].upper() in MONTH_DICT:
                    month = MONTH_DICT[data['%t'][4:7].upper()]
                else:
                    continue
                
                dat = data['%t'][8:12]+month
                url = data['%r'].split(' ')[1]
                ip = data['%h']
                
                i2pc = IP2Country(verbose=False)
Exemple #52
0
inputfn = sys.argv[1]
#log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
#                      '$status $body_bytes_sent "$http_referer" '
#                      '"$http_user_agent" "$http_x_forwarded_for" "$request_time" "$host" "${app_key}-${receiver_type}" "$versionno"' ;

#log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
#                      '$status $body_bytes_sent "$http_referer" '
#                      '"$http_user_agent" "$http_x_forwarded_for" "$upstream_addr"'

#format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{X-FORWARDED-FOR}i\" %T %U'
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{X-FORWARDED-FOR}i\" %T'
#format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" "%a" %T'
format2 = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
format3 = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %a %T %U'
#%h %t %>s %b %T
p = apachelog.parser(format)
p2 = apachelog.parser(format2)
p3 = apachelog.parser(format3)

if not os.path.exists(inputfn):
    print 'input file does not exist'
    sys.exit(1)

f = open(inputfn, 'r')
while True:
    line = f.readline()
    if not line:
        break
    try:
        d = p.parse(line)
        has_rqstime = 1