def parse_apachelog(filename):
    print "Parsing file {}".format(filename)
    headers = r'\"%{X-Forwarded-For}i\" %h \"%{bg_node}e\" %{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t %P \"%r\" %>s %a %b %I %D %f %O \"%{Referer}i\" \"%{User-Agent}i\" %{bg_xact_id}n %{bg_user_id}n %{bg_app_id}n \"%{bg_device_type}n\" %{bg_usage_type}n %{bg_severity}n \"%{bg_email_to}n\" \"%{bg_email_from}n\" \"%{bg_email_cc}n\" \"%{bg_email_subject}n\" \"%{bg_email_date}n\" %{bg_file_name}n %{bg_file_ext}n %{bg_keywords}n %{bg_latitude}n %{bg_longitude}n %{bg_accuracy}n \"%{bg_mime_type}n\" \"%{bg_page_title}n\" \"%{bg_field1}n\" \"%{bg_field2}n\" \"%{bg_field3}n\" \"%{bg_field4}n\" \"%{bg_field5}n\" \"%{bg_field6}n\" \"%{bg_field7}n\" \"%{bg_field8}n\" \"%{bg_field9}n\" \"%{bg_field10}n\"'
    p = apachelog.parser(headers)
    log_list = []
    with open(filename) as f:
        for line in f.readlines():
            line = line.replace(' ', ' - - - ')
            # line = line.replace(' - -', '')
            timestamp = re.search(
                r'([0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}T[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}.[0-9]{1,6}\+[0-9]{1,4})',
                line)
            try:
                new_timestamp = '[' + timestamp.group(0) + ']'
            except AttributeError:
                print("Line '{0}' doesn't contain a timestamp. Skipping the line"
                      .format(line))
                continue
            line = line.replace(timestamp.group(0), new_timestamp)
            # print ("Parsing line {0}".format(line))
            try:
                data = p.parse(line)
            except apachelog.ApacheLogParserError:
                sys.stderr.write("Unable to parse: %s" % line)
                continue  # skip lines that fail to parse
            data['%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'] = (
                data['%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'][1:11] + ' ' +
                data['%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'][12:27])
            # + ' ' + data['%{%Y-%m-%dT%H:%M:%S}t.%{usec_frac}t%{%z}t'][27:32]
            log_list.append(data)
    print "Length of current parsed list '{}' is: {}".format(filename, len(log_list))
    return log_list
def run(inLog, dbFile, website, havijParser, compareToGood, knownGood, cookie, logFormat=""):
    apacheParser = apachelog.parser(logFormat)
    inLogFd = open(inLog)
    lineCounter = 1
    con = sqlite3.connect(dbFile)
    con.row_factory = sqlite3.Row
    con.text_factory = str
    cur = con.cursor()
    cur.execute("select * from sqlite_master")
    dropTable = False
    for row in cur.fetchall():
        if row["name"] == "sqlInjectedReturns":
            dropTable = True
    if dropTable:
        cur.execute("DROP TABLE sqlInjectedReturns")
    cur.execute(
        "CREATE TABLE IF NOT EXISTS sqlInjectedReturns(id INTEGER PRIMARY KEY AUTOINCREMENT,\
        request TEXT,\
        returnVal TEXT)")
    for line in inLogFd:
        try:
            lineData = apacheParser.parse(line)
        except:
            print "Could not parse data on line %s, error: %s" % (line, sys.exc_info())
            lineCounter += 1
            continue
        urlToGet = website + lineData["%r"].split(" ")[1]
        urlHeaders = {}
        if "%{User-Agent}i" in lineData:
            urlHeaders["User-Agent"] = lineData["%{User-Agent}i"]
        if cookie:
            urlHeaders["Cookie"] = cookie
        try:
            urlRequest = urllib2.Request(urlToGet, None, urlHeaders)
            urlGetter = urllib2.urlopen(urlRequest)
            urlData = urlGetter.read()
            cur.execute("INSERT INTO sqlInjectedReturns(request, returnVal) Values (?,?)",
                        [lineData["%r"], urlData])
        except:
            print "Could not get data for url %s" % (urlToGet)
        lineCounter += 1
        if lineCounter % 100 == 0:
            print "Parsed %s lines" % (lineCounter)
    con.commit()
    inLogFd.close()
    if havijParser:
        print "Parsing Havij attack"
        havijParse(cur, con)
    if knownGood:
        print "Comparing to known good"
        compareSqliToGood(cur, con, knownGood)
    con.close()
def readAndParse(filenames):
    p = apachelog.parser(apachelog.formats['extended'])
    for filename in filenames:
        if os.path.splitext(filename)[1] == '.gz':
            fp = gzip.open(filename, 'rb')
        else:
            fp = open(filename)
        for line in fp:
            if not bitstreamRE.search(line):
                # if no bitstream ID, don't go any further
                continue
            try:
                data = p.parse(line)
            except apachelog.ApacheLogParserError:
                print >> sys.stderr, "failed to parse", line,
                continue
            yield dict(requestor=data['%h'],
                       bytes=data['%b'],
                       request=data['%r'],
                       time=data['%t'].strip('[]'),
                       user=data['%u'],
                       result=data['%>s'],
                       useragent=data['%{User-agent}i'],
                       referer=data['%{Referer}i'])
def _parse_file(self, storage_key, file_path, read_from_start=False, read_to_time=None):
    """
    Read the recent part of the log file, update statistics storages and adjust seek.

    If only the file parameter is supplied, read the file from self.seek to the end.
    If the file is not found or cannot be read, log an error and return.

    @param str storage_key: a key to define statistics storage
    @param str file_path: path to the file to parse
    @param bool read_from_start: if True, read from the beginning of the file,
        otherwise from `self.seek`
    @param datetime read_to_time: if set, records are parsed until their time is
        greater than or equal to the parameter value; otherwise the file is read to the end
    """
    with open(file_path, 'r') as f:
        if read_from_start:
            logger.debug('Reading file %s from the beginning to %s' % (file_path, read_to_time))
        else:
            logger.debug('Reading file %s from position %d to %s'
                         % (file_path, self.seek[file_path], read_to_time or 'the end'))
        if not read_from_start:
            f.seek(self.seek[file_path])
            logger.debug('Setting seek for file %s to %d based on a value from the storage'
                         % (f.name, self.seek[file_path]))
        log_parser = apachelog.parser(getattr(settings, 'ELF_FORMAT', ''))
        while True:
            current_seek = f.tell()
            line = f.readline()
            if not line:
                # Reached end of file, record seek and stop
                self.seek[file_path] = current_seek
                logger.debug('Reached end of file %s, set seek in storage to %d' % (f.name, current_seek))
                break
            record = utils.parse_line(line, log_parser, getattr(settings, 'LATENCY_IN_MILLISECONDS', False))
            if not record:
                self._count_record(storage_key, 'error')
                continue
            record_time = record.get_time()
            if record_time is None:
                logger.error('Could not process time string: ' + record.time)
                logger.error('Line: ' + record.line)
                self._count_record(storage_key, 'error')
                continue
            if read_to_time and record_time >= read_to_time:
                # Reached a record with a timestamp past the end of the current analysis period.
                # Stop here and leave it for the next invocation.
                self.seek[file_path] = current_seek
                logger.debug('Reached end of period, set seek for %s in storage to %d' % (f.name, current_seek))
                break
            status = self._process_record(storage_key, record)
            self._count_record(storage_key, status)
def test_empty_response_code(self, monkeypatch):
    utils_setup(monkeypatch)
    line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "POST /content/csl/contentupdate/xxx HTTP/1.1" "" 8563 ' \
           u'"-" "Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 53047'
    parser = apachelog.parser(settings.ELF_FORMAT)
    record = parse_line(line, parser)
    assert record is None
def test_empty(self, monkeypatch):
    utils_setup(monkeypatch)
    line = u''
    parser = apachelog.parser(settings.ELF_FORMAT)
    record = parse_line(line, parser)
    assert record is None
def test_wrong_request(self, monkeypatch):
    utils_setup(monkeypatch)
    line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "\x80w\x01\x03\x01" 200 8563 "-" ' \
           u'"Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 53047'
    parser = apachelog.parser(settings.ELF_FORMAT)
    record = parse_line(line, parser)
    assert record is None
def test_with_latency_in_microseconds(self, monkeypatch):
    utils_setup(monkeypatch)
    line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "POST /content/csl/contentupdate/xxx HTTP/1.1" 200 8563 ' \
           u'"-" "Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 1253'
    parser = apachelog.parser(settings.ELF_FORMAT)
    record = parse_line(line, parser, False)
    assert record.latency == 1
def __init__(self, collection=None, counter_compliant=False):
    self._parser = apachelog.parser(APACHE_LOG_FORMAT)
    allowed_collections = self._allowed_collections()
    if collection not in allowed_collections:
        raise ValueError('Invalid collection id ({0}), you must select one of these {1}'.format(
            collection, str(allowed_collections)))
    self.collection = collection
    self.acronym_to_issn_dict = self._acronym_to_issn_dict()
    self.allowed_issns = self._allowed_issns(self.acronym_to_issn_dict)
def analyze(self):
    """ options include '--totals-only' """
    requests_number = 0       # number of requests for the current second
    previous_timestamp = ''   # timestamp from the previous line
    # TODO: add logic to deal with possibly non-contiguous equal timestamps
    p = apachelog.parser(apachelog.formats[self.format])  # build the parser once, not per line
    for line in self.logfile:
        try:
            data = p.parse(line)
        except:
            sys.stderr.write("Unable to parse line: %s" % line)
            raise  # TODO: extend options (--ignore-bad-lines)
        # getting the timestamp
        try:
            timestamp = data.get('%t')
        except:
            sys.stderr.write("Unable to get data: %s" % data)
            raise  # TODO: extend options (--ignore-parse-errors)
        if timestamp == previous_timestamp:
            requests_number += 1
        else:
            if (self.options != '--totals-only') and (previous_timestamp != ''):
                # print summary for the previous second
                s = "%s = %s" % (previous_timestamp, requests_number)
                print s
            self.secondsSum += 1
            self.requestsSum += requests_number
            if requests_number > self.maxRequestsPerSec:
                self.maxRequestsPerSec = requests_number
            if requests_number < self.minRequestsPerSec:
                self.minRequestsPerSec = requests_number
            if self.minRequestsPerSec == 0:
                self.minRequestsPerSec = requests_number
            requests_number = 1
            previous_timestamp = timestamp
    self.logfile.close()
    self.avgRequestsPerSec = float(self.requestsSum) / self.secondsSum
    return
def test_valid(self, monkeypatch):
    utils_setup(monkeypatch)
    line = u'172.19.0.40 - - [08/Aug/2013:10:59:59 +0200] "POST /data/csl/contentupdate/xxx HTTP/1.1" 200 8563 ' \
           u'"-" "Apache-HttpClient/4.2.1 (java 1.5)" community1 community1 OK 14987 8785 53047'
    parser = apachelog.parser(settings.ELF_FORMAT)
    record = parse_line(line, parser)
    assert record.raw_request == '/data/csl/contentupdate/xxx'
    assert record.get_time() == datetime.datetime.strptime('20130808105959', log_record.APACHELOG_DATETIME_FORMAT)
    assert record.response_code == 200
    assert record.latency == 53
    assert record.get_method_id() == 'csl_contentupdate'
def apache_2_sql(logfile):
    "Convert apache logfile to mysql table"
    p = apachelog.parser(apachelog.formats['extended'])
    for line in open(logfile):
        try:
            data = p.parse(line)
        except:
            sys.stderr.write("Unable to parse %s" % line)
            continue  # skip unparseable lines instead of reusing stale data
        converted_date = apachelog.parse_date(data['%t'])
        converted_request = data['%r'].lower().strip()
        print data
    return
def __init__(self, log_file_location, log_format, period, alert_period,
             alert_threshold, max_frequent_sections):
    '''
    Constructor
    '''
    self._log_file_location = log_file_location
    self._parser = apachelog.parser(apachelog.formats[log_format])
    self._period = period
    self._alert_period = alert_period
    self._alert_threshold = alert_threshold
    self._max_frequent_sections = max_frequent_sections
    self._log_file_position = 0
    self._log_cache = []
    self._alert_state = False
    self._alerts = []
def __init__(self, collection=None, counter_compliant=False,
             allowed_collections=_allowed_collections,
             acronym_to_issn_dict=_acronym_to_issn_dict):
    self._parser = apachelog.parser(APACHE_LOG_FORMAT)
    allowed_collections = allowed_collections()
    if collection not in allowed_collections:
        raise ValueError(
            'Invalid collection id ({0}), you must select one of these {1}'
            .format(collection, str(allowed_collections)))
    self.collection = collection
    self.acronym_to_issn_dict = acronym_to_issn_dict(self.collection)
    self.allowed_issns = self._allowed_issns(self.acronym_to_issn_dict)
def create_urls(logfile, outfile, logformat, grep=None):
    parser = apachelog.parser(logformat)
    with open(logfile) as f, open(outfile, 'w') as o:
        writer = csv.writer(o)
        # Status spinner
        spinner = "|/-\\"
        pos = 0
        for i, line in enumerate(f):
            # Spin the spinner
            if i % 10000 == 0:
                sys.stdout.write("\r" + spinner[pos])
                sys.stdout.flush()
                pos += 1
                pos %= len(spinner)
            # If a filter was specified, filter by it
            if grep and grep not in line:
                continue
            try:
                data = parser.parse(line)
            except apachelog.ApacheLogParserError as e:
                print(e)
                continue
            if data[STATUS_CODE] != '200':
                continue
            method, url, protocol = data[REQUEST].split()
            # Check for GET requests with a status of 200
            if method != 'GET':
                continue
            # Exclude media requests and special urls
            if MEDIA_RE.search(url) or SPECIAL_RE.search(url):
                continue
            # This is a good record that we want to write
            writer.writerow([url, data[USER_AGENT]])
    print(' done!')
def get_seek(file_path, period_start):
    """
    Given a file path, find the position in it where the records for the tracked period start.

    @param str file_path: path to the log file to seek
    @param datetime period_start: timestamp for the beginning of the tracked period
    @return int seek
    """
    f = open(file_path, 'r')
    log_parser = apachelog.parser(getattr(settings, 'ELF_FORMAT', ''))
    size = os.stat(file_path).st_size
    logger.debug('Running get_seek() for file %s' % f.name)
    approximate_seek = _find_approximate_seek_before_period_by_moving_back(f, size, log_parser, period_start)
    logger.debug('approximate seek for %s is set to %d' % (f.name, approximate_seek))
    exact_seek = _find_exact_seek_before_period_by_moving_forward(f, log_parser, approximate_seek, period_start)
    logger.debug('exact seek for %s is set to %d' % (f.name, exact_seek))
    f.close()
    return exact_seek
def create_urls(logfile, outfile, logformat, grep=None):
    parser = apachelog.parser(logformat)
    with open(logfile) as f, open(outfile, 'wb') as o:
        writer = csv.writer(o)
        # Status spinner
        spinner = "|/-\\"
        pos = 0
        for i, line in enumerate(f):
            # Spin the spinner
            if i % 10000 == 0:
                sys.stdout.write("\r" + spinner[pos])
                sys.stdout.flush()
                pos += 1
                pos %= len(spinner)
            # If a filter was specified, filter by it
            if grep and grep not in line:
                continue
            try:
                data = parser.parse(line)
            except apachelog.ApacheLogParserError:
                continue
            try:
                method, url, protocol = data[REQUEST].split()
            except ValueError:
                # print "Line %d: Unable to split" % i
                continue
            # Check for GET requests with a status of 200
            if method != 'GET' or data[STATUS_CODE] != '200':
                continue
            # Exclude media requests and special urls
            if MEDIA_RE.search(url) or SPECIAL_RE.search(url):
                continue
            # This is a good record that we want to write
            writer.writerow([url, data[USER_AGENT]])
    print ' done!'
def __init__(self):
    self.format = r'%{owner}i %{bucket}i %{datetime}t %{ip}h %{requester}i %{requestid}i %{operation}i %{key}i \"%{requesturi}i\" %{status}s %{error}i %{bytes}b %{objectsize}i %{totaltime}i %{turnaround}i \"%{referer}i\" \"%{useragent}i\" %{versionid}i'
    self.parser = apachelog.parser(self.format)
    # Note: '%{requestid}i' appears twice in this dict literal, so the later
    # ('_id', None) entry silently overrides the earlier ('request_id', None) one.
    self.log_mapper = {
        # '%{owner}i': ('owner', None),
        '%{datetime}t': ('logged_time',
                         lambda dtstr: datetime.strptime(dtstr[:-6].strip('[]'), '%d/%b/%Y:%H:%M:%S ')),
        '%{ip}h': ('ip', None),
        '%{requestid}i': ('request_id', None),
        '%{operation}i': ('operation', None),
        '%{key}i': ('key', None),
        '%{requesturi}i': ('uri', None),
        '%{requestid}i': ('_id', None),
        '%{status}s': ('status', int),
        '%{error}i': ('error', None),
        '%{bytes}b': ('bytes', int),
        '%{versionid}i': ('version_id', None),
        '%{objectsize}i': ('object_size', int)
    }
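# A hypothetical sketch (not part of the original class) of how log_mapper could be
# applied to a dict returned by self.parser.parse(): each tuple holds the target field
# name and an optional converter. The map_record name and the skipping of unmapped
# keys are assumptions for illustration only.
def map_record(log_mapper, data):
    """Rename parsed fields and apply the optional converter from each mapping tuple."""
    doc = {}
    for raw_key, (field, convert) in log_mapper.items():
        if raw_key not in data:
            continue
        value = data[raw_key]
        doc[field] = convert(value) if convert else value
    return doc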
def processFile(path):
    logparser = alog.parser(alog.formats['common'])
    hosts = list()
    dates = list()
    statuses = list()
    respsizes = list()
    with open(path, "r+b") as log:
        map = None
        try:
            map = mmap.mmap(log.fileno(), 0, access=mmap.ACCESS_READ)
            for line in iter(map.readline, ""):
                try:
                    data = logparser.parse(line)
                    hosts.append(data['%h'])
                    cdate = dateutil.parser.parse(data['%t'].strip('[]'), fuzzy=True)
                    dates.append(cdate)
                    statuses.append(data['%>s'])
                    respsizes.append(data['%b'])
                except:
                    print "Unable to parse %s" % line
        finally:
            if map:
                map.close()
    index = pd.to_datetime(dates)
    frame = pd.DataFrame(
        {
            'Host': pd.Series(hosts, index=index, dtype=str),
            'Status': pd.Series(statuses, index=index),
            'ResponseSize': pd.Series(respsizes, index=index)
        },
        index=index)
    frame.to_hdf('processed.hdf', 'stat')
    return frame
def _whatFormat(self):
    # TODO: currently it doesn't work
    """
    Checks whether the given file complies with any standard access log format
    and returns the name of that format, or raises UnknownFormatError.
    """
    try:
        line = self.logfile.readline()
    except:
        raise IOError('Cannot read the first line of the file')
    formats = 'extended', 'common', 'vhcommon'
    for f in formats:
        try:
            p = apachelog.parser(apachelog.formats[f])
            data = p.parse(line)
        except:
            continue  # try the next format
        else:
            return f
    raise UnknownFormatError('Unknown log file format')
def main():
    if rank == 0:
        from optparse import OptionParser
        parser = OptionParser(usage='Usage: mpiexec -n 4 python apachelogstats.py --(addr|time) /var/log/apache2/access.log.1')
        parser.add_option("-t", "--time", action="store_const", dest="mode",
                          const='%t', help="group by time")
        parser.add_option("-a", "--addr", action="store_const", dest="mode",
                          const='%h', help="group by address")
        (options, args) = parser.parse_args()
        # Format copied and pasted from Apache conf - use raw string + single quotes
        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        p = apachelog.parser(format)
        data = []
        for i, line in enumerate(open(args[0])):
            try:
                data.append(p.parse(line))
            except:
                sys.stderr.write("Unable to parse %s" % line)
            if i > 6:
                break
    else:
        data = None
        options = None
    options = comm.bcast(options, root=0)
    ranked_data = MR_map(data, options.mode)
    if rank == 0:
        dd = MR_reduce(ranked_data)
        for key, value in dd.iteritems():
            print('%s %d' % (str(key), len(value)))
""" log_format tjmain '[$time_local] $status $host ' '$upstream_addr $upstream_response_time $request_time ' '$remote_addr $remote_user $request ' '$bytes_sent "$http_referer" ' '"$http_user_agent" "$http_x_forwarded_for"' [17/Jan/2012:17:00:55 +0800] 304 10.2.76.28 - - 0.000 10.2.76.25 - GET /highcharts/js/modules/exporting.js HTTP/1.1 158 "http://10.2.76.28/highcharts/examples/bar-basic.htm" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7" "-"sendfileon format = r'%t %>s %h %uh %urt %addr %ru \"%r\" %b \"%{Referer}i\" \"%{User-Agent}i\" \"{Forward}\"' #default format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"{Forward}\"' """ format = r'%t %>s %h %ua %ub %uc %v %vu %m %mu %mv %b \"%{Referer}i\" \"%{User-Agent}i\" \"{Forward}\"' parser = apachelog.parser(format) def run(f): count_line, count_failed = 0, 0 for line in f: if not line: continue try: count_line += 1 data = parser.parse(line) data = [v for k,v in data] except apachelog.ApacheLogParserError: #logging.exception('pipeline parse') count_failed += 1 continue
def __init__(self, sink):
    super(CommonLogFormatParser, self).__init__(sink)
    self.apachelog_parser = apachelog.parser(self.format)
entry = {
    "_id": ip,
    "ban_time": datetime.now(),
    "printed": "no"  # "printed" flag is cleared
}
banned_ip_collection.update(spec={"_id": ip}, document=entry, upsert=True)


def is_ip_banned(ip):
    if not banned_ip_collection:
        return False
    return banned_ip_collection.find_one({"_id": ip}) is not None


window = None  # list of slots, shifting every SLOT_INTERVAL milliseconds
millis = None  # last window shift, milliseconds from epoch

logline_parser = parser(ACCESS_LOG_RECORD_FORMAT)  # see apachelog docs for details


def logrecord(logline):
    """Parse log line

    Returns values as tuple: (date, time, ip, url, agent, referrer,)
    """
    (date, time, ip, url, agent, referrer, code) = (None, None, None, None, None, None, None)
    try:
        if logline.count('"') == 8:
            # remove the mysterious last "-"
            logline = logline[:len(logline) - 3]
        parsed = logline_parser.parse(logline)
        code = parsed["%>s"]
        ip = parsed["%h"]
""" import sys import os import apachelog import optparse import urlparse import cgi import logging import pdb from datetime import datetime, time, timedelta # Format copied and pasted from Apache conf - use raw string + single quotes apache_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' apache_parser = apachelog.parser(apache_format) logging.basicConfig(level=logging.INFO) def main(): usage = """ %prog WAV_FILENAME LOG_FILENAME [options] """ opt_parser = optparse.OptionParser(usage=usage) opt_parser.add_option("-d", "--delays", dest="delays_filename", help="An optional argument to" " specify a TSV file which contains the delays for each stage.") opt_parser.add_option("-c", "--classes", dest="classes_filename", help="An optional argument to specify a TSV file " "which contains classes for each stage.") opt_parser.add_option("-s", "--svm", action='store_true', default=False, dest="svm_output", help="Flag to also generate " "an input file suitable for input into Max's pitch, format and energy analyzer. Requires a classes file.") opt_parser.add_option("-r", "--ratings", action='store_true', default=False, dest="use_url_ratings", help="Parses the 'rating' URL parameter"
#!/usr/bin/python
import sys, os, apachelog, bandwidth, MySQLdb

p = apachelog.parser(
    r"%v %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"")
bwsum = bandwidth.Bandwidth()

for file in sys.argv[1:]:
    print "Processing %s file" % file
    for line in open(file):
        try:
            data = p.parse(line.strip())
            if data['%O'] == '-':
                continue
            bwsum.add(data['%v'], data['%O'], data['%t'])
        except:
            sys.stderr.write("Unable to parse %s" % line)

connection = MySQLdb.connect(host='localhost', user='******', passwd='', db='store')
bwsum.persist(connection)
# limitations under the License.
#
import apachelog
from log_utils import parse_apache_line
import os
import sys

# %b - Size
# %h - Remote IP or Host
# %l - Remote Log Name
# %r - Request
# %>s - HTTP Status Code
# %t - eventTime
# %u - Remote User
# %{Referer}i - Referer
# %{User-agent}i - UserAgent

if len(sys.argv) == 2:
    log_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    parser = apachelog.parser(log_format)
    for line in open(sys.argv[1]):
        p = parse_apache_line(parser, line.strip())
        print(
            "host: {0}, time: {1}, request: {2}, status: {3}, size: {4}, referer: {5}, agent: {6}"
            .format(p['%h'], p['%t'], p['%r'], p['%>s'], p['%b'],
                    p['%{Referer}i'], p['%{User-Agent}i']))
else:
    sys.stderr.write("usage: {0} <path>".format(os.path.basename(sys.argv[0])))
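# A minimal, self-contained sketch of what the parsed dictionary looks like for the
# field meanings listed in the comments above. The sample line is the classic NCSA
# combined-format example, not data from the original project.
import apachelog

fmt = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
p = apachelog.parser(fmt)
sample = ('127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] '
          '"GET /apache_pb.gif HTTP/1.0" 200 2326 '
          '"http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"')
data = p.parse(sample)
print(data['%h'], data['%>s'], data['%b'])  # host, status code, response size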
import json
import apachelog
import hashlib


def writeRecordsToFileName(records, fileName):
    fullFileName = fileName + '.json'
    jsonFile = open(fullFileName, 'w')
    json.dump({'docs': records}, jsonFile)
    jsonFile.close()
    print "wrote file " + fullFileName


if len(sys.argv) > 1:
    parser = apachelog.parser(apachelog.formats['extended'])
    fileNumber = 1
    for fileName in sys.argv[1:]:
        records = []
        file = open(fileName)
        for line in file:
            try:
                data = parser.parse(line)
                record = {}
                record['date'] = data['%t'][1:-1]
                record['iptrunk'] = '.'.join(data['%h'].split('.')[0:2])
                hash = hashlib.md5()
                hash.update('smsmc0' + data['%h'])
                record['iphash'] = hash.hexdigest()
def main(argv=None): if argv is None: argv = sys.argv parser = OptionParser() parser.add_option( "-v", "--vhost", dest="vhostoverride", help="if logfile doesn't include vhost column, override it with this", metavar="VHOST") parser.add_option("-d", "--debug", action="store_true", dest="debug", help="Turn on debugging output", default=False) (options, args) = parser.parse_args(argv) if options.debug: print >> sys.stderr, "Debug mode activated" OUTPUTDIR = "/opt/pysk/wwwlogs" # Get list of vhosts db = psycopg2.connect( "host='localhost' user='******' password='******' dbname='pysk'" ) cursor = db.cursor(cursor_factory=psycopg2.extras.DictCursor) query = "SELECT trim(both '.' from vh.name || '.' || d.name) as vhost FROM vps_virtualhost vh, vps_domain d WHERE vh.domain_id = d.id ORDER BY vhost" cursor.execute(query) rows = cursor.fetchall() vhosts = {} for row in rows: vhosts[row["vhost"]] = {"logfile": None} # RDNS exitEvent = threading.Event() inputQueue = Queue.Queue() resolvedIPDict = {} outputDict = {} workers = [ ResolveThread(inputQueue, outputDict, exitEvent) for i in range(0, 100) ] for worker in workers: worker.start() # Log formats p_igowo = apachelog.parser(apachelog.formats["igowo"]) p_vhextendedio = apachelog.parser(apachelog.formats["vhextendedio"]) p_vhextended = apachelog.parser(apachelog.formats["vhextended"]) p_extended = apachelog.parser(apachelog.formats["extended"]) for fname in glob(os.path.join(OUTPUTDIR, "inbox") + "/*"): fname = os.path.realpath(fname) print "Processing %s ..." % (fname, ) with open(fname, "rb") as f: for line in f: # Try to parse line try: try: data = p_igowo.parse(line) except apachelog.ApacheLogParserError: try: data = p_vhextendedio.parse(line) except apachelog.ApacheLogParserError: try: data = p_vhextended.parse(line) except apachelog.ApacheLogParserError: if options.vhostoverride: data = p_extended.parse(line) data["%v"] = options.vhostoverride else: raise if cursor != None: vhost = data["%v"] if not vhost in vhosts: continue # Create a new logfile if we don't already have done so if vhosts[vhost]["logfile"] is None: vhosts[vhost]["logfile"] = NamedTemporaryFile( prefix=vhost, dir=os.path.join(OUTPUTDIR, "temp")) logfile = vhosts[vhost]["logfile"] #if "%A" in data: # local_ip = data["%A"] #else: # local_ip = "" if "%D" in data: utime = data["%D"] # stored in milliseconds instead of microseconds? if '.' 
in utime: utime = int(float(utime) * 1000) else: utime = None r_host = data["%h"] # Resolve the host, take measures to not resolve the same IP twice if not r_host in resolvedIPDict: resolvedIPDict[r_host] = True inputQueue.put(r_host) #r_logname = data["%l"] r_user = data["%u"] req_dt = apachelog.parse_date(data["%t"]) request = data["%r"] status = int(data["%>s"]) if data["%b"] != "-": response_size = int(data["%b"]) else: response_size = 0 referer = data["%{Referer}i"] user_agent = data["%{User-Agent}i"] if "%I" in data: bytes_recv = int(data["%I"]) else: bytes_recv = None if "%O" in data: bytes_sent = int(data["%O"]) else: bytes_sent = None # Build logline logline = u'%010d %s - %s [%s +0000] "%s" %s %s "%s" "%s"' % ( time.mktime(req_dt.timetuple()), r_host, r_user, req_dt.strftime("%d/%b/%Y:%H:%M:%S"), request, status, response_size, referer, user_agent) # If input/output bytes available, append them if bytes_recv and bytes_sent: logline += " %s %s" % ( bytes_recv, bytes_sent, ) logfile.write(logline.encode("utf-8") + "\n") except UnicodeDecodeError: if options.debug: print >> sys.stderr, "UnicodeDecodeError on line %s" % line except apachelog.ApacheLogParserError: if options.debug: print >> sys.stderr, "ApacheLogParserError on line %s" % line except: sys.stderr.write("Unable to parse %s" % line) raise # Delete the processed logfile from the inbox os.unlink(fname) # Sort logfiles by date, strip timestamp field for (vhname, vh) in vhosts.iteritems(): if not vh["logfile"] is None: print "Sorting %s ..." % (vh["logfile"].name, ) # Create output logfile (sorted_logfile_handle, sorted_logfile_name) = mkstemp( prefix=vhname, dir=os.path.join(OUTPUTDIR, "temp")) sorted_logfile = os.fdopen(sorted_logfile_handle, "w+b") # Process input -> output p1 = Popen(["sort", "-n", vh["logfile"].name], stdout=PIPE) p2 = Popen(["cut", "-d ", "-f2-"], stdin=p1.stdout, stdout=sorted_logfile) p1.wait() p2.wait() # Close input (deletes the file) vh["logfile"].close() # Close output and atomically move it into the "pending" directory for further processing sorted_logfile.close() pending_dir = os.path.join(OUTPUTDIR, "pending", vhname) if not os.path.exists(pending_dir): os.makedirs(pending_dir) timestamp = int(time.mktime(datetime.now().timetuple())) os.rename(sorted_logfile_name, os.path.join(pending_dir, str(timestamp) + ".log")) # Wait until all rdns workers are finished exitEvent.set() for worker in workers: worker.join() # Generate DNS cache if len(outputDict) > 0: with open(os.path.join(OUTPUTDIR, "dnscache.txt"), "w+b") as f: for (ip, rdns) in outputDict.iteritems(): f.write("%s %s\n" % (ip, rdns)) # Delete old config files for f in glob("/etc/awstats/awstats.*.conf"): os.unlink(f) # Generate new config files for (vhname, vh) in vhosts.iteritems(): conffile = "/etc/awstats/awstats.%s.conf" % vhname with open(conffile + ".new", "w") as f: logfilesdir = os.path.join(OUTPUTDIR, "pending", vhname, "*.log") f.write('LogFile="/opt/pysk/tools/logfiles/logmerge.py %s |"\n' % logfilesdir) f.write('SiteDomain="%s"\n' % vhname) f.write('HostAliases="www.%s"\n' % vhname) f.write('DirData="/var/lib/awstats/%s/"\n' % vhname) f.write('Include "/etc/awstats/awstats.conf.local"\n') os.rename(conffile + ".new", conffile) # Preprocess pending logfiles before statistics run ## Delete empty logfiles call([ "/usr/bin/find", os.path.join(OUTPUTDIR, "pending"), "-name", "*.log", "-size", "0", "-delete" ]) # Run statistics ## Create list of vhosts which have logfiles vhosts_with_logs = list( set([ 
os.path.basename(os.path.dirname(i)) for i in glob(os.path.join(OUTPUTDIR, "pending", "*", "*.log")) ])) ## Run awstats for these vhosts for v in vhosts_with_logs: call([ "/usr/local/awstats/wwwroot/cgi-bin/awstats.pl", "-config=%s" % (v, ), "-showcorrupted" ]) # Finalize processed logfiles processed_logfiles = glob( os.path.join(OUTPUTDIR, "processed", "*", "*.log")) ## Compress with bzip2 -9 for pl in processed_logfiles: call(["bzip2", "-9", pl]) # Fix permissions of awstats directory call("chmod 0750 /var/lib/awstats", shell=True) call("chmod 0750 /var/lib/awstats/*", shell=True) call("chmod 0660 /var/lib/awstats/*/*", shell=True) call("chown pysk:http /var/lib/awstats", shell=True) call("chown pysk:http /var/lib/awstats/*", shell=True) call("chown pysk:http /var/lib/awstats/*/*", shell=True) call("find /var/lib/awstats/ -name \"*.tmp.*\" -delete", shell=True)
help="log file we're reading requests from") parser.add_argument('-n', '--number', dest='count', action='store', default=10, type=int, required=False, help="how many urls do we wish to see, default 10") args = parser.parse_args() return args if __name__ == "__main__": args = parse_args() parse = apachelog.parser(LOG_FMT) res = [] with open(args.log) as log: for l in log: try: l = parse.parse(l.strip()) rtime, url = l['%x'], l['%r'] res.append((url, rtime)) except Exception, exc: print str(exc) for r in sorted(res, key=itemgetter(1), reverse=True)[0:args.count]: print "{1} - {0}".format(*r)
# STEP 5.1. read apache log dir
for pathApacheLog, subFolders, files in os.walk(pathApacheLog):
    for file in files:
        ext = os.path.splitext(pathApacheLog + '/' + file)[1]
        if ext == '.log':
            print '[{0}] - pid:{1} - read log-file: {2}'.format(
                datetime.datetime.now(), readPid, pathApacheLog + '/' + file)
            currlogfile = pathApacheLog + '/' + file
            # print pathApacheLog+'/'+file

            # STEP 5.2. parse log file
            ############
            # Format copied and pasted from Apache conf - use raw string + single quotes
            format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
            p = apachelog.parser(format)
            # Common Log Format (CLF)
            # p = apachelog.parser(apachelog.formats['common'])
            # Common Log Format with Virtual Host
            # p = apachelog.parser(apachelog.formats['vhcommon'])
            # NCSA extended/combined log format
            # p = apachelog.parser(apachelog.formats['extended'])
            for line in open(currlogfile):
                try:
                    data = p.parse(line)
                    post_data = {"hash": BADBOT_NODE_HASH,
                                 "host": b64encode(data['%h']),
                                 "useragent": b64encode(data['%{User-Agent}i']),
                                 "time": b64encode(data['%t'])}
def handle(self, *args, **options): if len(args) != 1: print "Missing argument." return min_count = int(args[0]) p = apachelog.parser(logformat) spider = { } for line in sys.stdin: # Parse the acess log line. try: data = p.parse(line) except: continue # Is it a request to a bill page? path = data["%r"].split(" ")[1] m = re_bill.match(path) if not m: continue # Who is the referrer? ref = data["%{Referer}i"] if ref in ("", "-") or "govtrack.us" in ref: continue url = urlparse.urlparse(ref) hostname = url.hostname qs = urlparse.parse_qs(url.query) if not hostname: continue # Filter out known useless domains. if hostname in ("t.co", "longurl.org", "ow.ly", "bit.ly", "www.facebook.com", "www.weblinkvalidator.com", "static.ak.facebook.com", "info.com", "altavista.com", "tumblr.com", "www.freerepublic.com", "www.reddit.com"): continue if hostname.endswith(".ru"): continue # For referrals from Google, look at the 'q' argument to see how # people are searching for this page. if hostname.replace("www.", "").replace("search.", "") in ("google.com", "bing.com", "aol.com", "yahoo.com"): # todo, some use q= some use query= #print qs.get("q", [""])[0] continue # Filter out other domains if the link has a 'q' argument since it's probs # a search engine. if "q" in qs or "pid" in qs: continue # Filter out common paths for message boards. if "/threads/" in ref or "/forum/" in ref or "viewtopic.php" in ref: continue key = (m.groups(), url) spider[key] = spider.get(key, 0) + 1 ### first_print = True spider = spider.items() spider.sort(key = lambda kv : kv[1]) for (bill_info, referral_url), count in spider: if count < min_count: continue # filter out referrers that occurred too infrequently bill_type = BillType.by_slug(bill_info[1]) bill = Bill.objects.get(congress=bill_info[0], bill_type=bill_type, number=bill_info[2]) lnk, is_new = BillLink.objects.get_or_create( bill=bill, url=referral_url.geturl(), defaults={ "title": "Title Not Set" }) # Additional processing for new entries. if not is_new: continue try: stream = urllib.urlopen(referral_url.geturl()) if stream.getcode() != 200: continue dom = lxml.etree.parse(stream, lxml.etree.HTMLParser()) except: continue title = dom.xpath('string(head/title)').strip() if title == "": continue # set the title of the scraped page lnk.title = title # white-list some domains, provided we were able to # get a title if referral_url.hostname in ("en.wikipedia.org", "www.truthorfiction.com", "www.theatlantic.com", "www.snopes.com", "arstechnica.com"): lnk.approved = True else: if first_print: print "Links pending approval:" print first_print = False print referral_url.geturl() print title.encode("utf8") print unicode(bill).encode("utf8") print lnk.save()
""" import sys import os import apachelog import optparse import urlparse import cgi import logging import pdb from datetime import datetime, time, timedelta # Format copied and pasted from Apache conf - use raw string + single quotes apache_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' apache_parser = apachelog.parser(apache_format) logging.basicConfig(level=logging.INFO) def main(): usage = """ %prog WAV_FILENAME LOG_FILENAME [options] """ opt_parser = optparse.OptionParser(usage=usage) opt_parser.add_option( "-d", "--delays", dest="delays_filename", help="An optional argument to" " specify a TSV file which contains the delays for each stage.") opt_parser.add_option("-c",
from __future__ import print_function

import sys
import json
import apachelog
import logging

from conf import settings

logger = logging.getLogger(__name__)

parser = apachelog.parser(apachelog.formats['extended'])
mappings = settings.APACHE_FIELD_MAPPINGS


def record_filter(record):
    """Return the record if it matches certain filters, otherwise None."""
    if not record.get("status") == "200":
        return None
    if not record.get("request").startswith("GET"):
        return None
    if record.get("ip_address") in ['127.0.0.1', '::1', '18.7.27.25']:
        return None
    return record


def field_mapper(request, mappings):
    """Map fields from input request dict to new dict based on mappings."""
    new_f = {}
    for k, v in request.items():
        if k in mappings:
            new_f[mappings[k]] = v
# Author: cytec <*****@*****.**>
# URL: http://github.com/cytec/SynoDLNAtrakt/
#
# This file is part of SynoDLNAtrakt.

import apachelog, os, sys, re

accesslog = "logparser.log"
p = apachelog.parser(apachelog.formats['lighttpd'])
time_format = "[%d/%b/%Y:%H:%M:%S +0200]"
regex = ".*(?P<theid>\d{5})\.(?P<ext>\w{3,5})"
testlist = ["mkv", "mp4", "avi"]

for line in open(accesslog):
    try:
        data = p.parse(line)
        x = re.match(regex, data["%r"])
        if x.group("ext") not in testlist:
            continue
        print x.group("theid"), x.group("ext")
    except:
        sys.stderr.write("Unable to parse %s" % line)
        # print "no"
from pymongo import MongoClient
import apachelog, sys
from dateutil.parser import parse

# Format copied and pasted from Apache conf - use raw string + single quotes
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
p = apachelog.parser(format)

client = MongoClient('mongodb://localhost:27017/')
db = client.testdb

for line in open('log/access_log'):
    data = p.parse(line)
    if data['%r']:
        accessed_file = data['%r'].split()
        if len(accessed_file) > 2:
            accessed_file_name = accessed_file[1]
        else:
            accessed_file_name = accessed_file[0]
    else:
        accessed_file_name = ''
    apache_date = data['%t'].replace("[", "").replace("]", "")
    time = parse(apache_date[:11] + " " + apache_date[12:])
    log_entry = {
        "host": data['%h'],
        "time": time,
        "path": accessed_file_name,
        "referer": data["%{Referer}i"],
        "useragent": data['%{User-Agent}i']
    }
    db.logs.insert_one(log_entry)
if len(sys.argv) != 2:
    print "Usage: python apache_log_analysis.py <log file>"
    sys.exit(0)
else:
    apache_log = file(sys.argv[1], 'r').readlines()

# http_err = range(400, 418, 1)
# Example log line from the available log
ex_url_apache = '134.251.87.133 - - [24/Dec/2014:01:00:10 +0800] "GET /IN/common/print.js HTTP/1.1" 200'
nformat_apache = r'%h %l %u %t \"%r\" %>s %b'
# ex_url_ISA = '104.224.147.94 Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0) https://cards.rblbank.com:443/IN/ShowImage?linkid= 302'
# nformat_ISA = r'%h \"%{User-Agent}i\" %>s'
p = apachelog.parser(nformat_apache)
log_lst = []
for line in apache_log:
    try:
        data = p.parse(line)
        log_lst.append(data['%t'] + " " + data['%b'] + " " + data['%r'] + " " + data['%>s'])
    except apachelog.ApacheLogParserError:
        sys.stderr.write('Unable to read line at %s' % line)
df = pd.DataFrame(log_lst)
'''
del df ['%h']
del df ['%l']
def main():
    """ Check files, set up the log file object, process each line, assemble the output file """
    anonymize_order = LogFile(**args)
    source_file = open(anonymize_order.file_input, 'r')
    destination_file = open(anonymize_order.file_output, anonymize_order.write_mode)
    parser = apachelog.parser(anonymize_order.log_format)
    read_line = 1
    while read_line:
        read_line = source_file.readline()
        try:
            log_line = parser.parse(read_line)
            anonymize_order.line_accept()
        except apachelog.ApacheLogParserError:
            anonymize_order.line_reject()
            continue  # skip lines that fail to parse
        # pdb.set_trace()
        hash_keys = ['%h', '%a', '%u', '%l']
        # check if value is a "-" and skip the hashing because it's not needed
        while hash_keys:
            hash_key = hash_keys.pop()
            if hash_key in log_line.keys():
                if log_line[hash_key] == '-':
                    pass
                else:
                    log_line[hash_key] = hashomatic(log_line[hash_key], anonymize_order.hash_salt)
        # need to reorder the write line in the original log format:
        # list the format directives and prune extra chars for consistency with the apachelog object
        log_order = anonymize_order.log_format.split()
        log_order = [format_key.replace('\\', '') for format_key in log_order]
        log_order = [format_key.replace('"', '') for format_key in log_order]
        # re-add quotes to the fields that have them stripped:
        # request, referer, user agent -> %r, %{Referer}i, %{User-agent}i
        quote_check_list = ['%r', '%{Referer}i', '%{User-agent}i']
        while quote_check_list:
            quote_check = quote_check_list.pop()
            if quote_check in log_line.keys():
                if log_line[quote_check].startswith('"'):
                    pass
                else:
                    log_line[quote_check] = '"' + log_line[quote_check]
                if log_line[quote_check].endswith('"'):
                    pass
                else:
                    log_line[quote_check] = log_line[quote_check] + '"'
        write_line = []
        for format_key in log_order:
            write_line.append(log_line[format_key])
        write_line.append('\n')
        write_line = ' '.join(write_line)
        destination_file.write(write_line)
    print 'Lines accepted: ' + str(anonymize_order.lines_accepted)
    print 'Lines rejected: ' + str(anonymize_order.lines_rejected)
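# The function above calls a hashomatic(value, salt) helper that is not shown.
# A minimal sketch of what such a salted-hash helper could look like; the real
# implementation is unknown, and the SHA-256 choice here is an assumption.
import hashlib

def hashomatic(value, salt):
    """Hypothetical helper: return a salted, one-way hex digest for an anonymized field."""
    return hashlib.sha256(salt + value).hexdigest()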
def parseLog(logfile, outfile): """Parses apache logs and performs lookup on where the access is from""" #Options LOG_ACCESS_DATA = False PRINT_PROGRESS = True IPINFODB_API_KEY = '' apache_log_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' p = apachelog.parser(apache_log_format) total_lines = str(sum(1 for line in open(logfile))) current_line = 1 parsed_log = {} for line in open(logfile, 'r'): try: data = p.parse(line) ip = data['%h'] if ip not in parsed_log: #new entry parsed_log[ip] = {} parsed_log[ip]['access_count'] = 1 if LOG_ACCESS_DATA: parsed_log[ip].setdefault('accessed_files', []).append((data['%t'], data['%r'])) #convert to date time ipinfo_url = 'http://api.ipinfodb.com/v3/ip-city/?key=' + IPINFODB_API_KEY + '&ip=' + ip + '&format=json' try: req = urllib2.Request(ipinfo_url) response = urllib2.urlopen(req) except URLError as e: print 'TODO' ipinfo = json.load(response) parsed_log[ip]['country'] = ipinfo['countryName'] parsed_log[ip]['city'] = ipinfo['cityName'] parsed_log[ip]['lat'] = ipinfo['latitude'] parsed_log[ip]['lon'] = ipinfo['longitude'] try: parsed_log[ip]['hostname'] = socket.gethostbyaddr(ip)[0] except: parsed_log[ip]['hostname'] = '' if PRINT_PROGRESS: print "INSERTED: " + ip + " LINES: " + str(current_line) + "/" + total_lines else: parsed_log[ip]['access_count'] += 1 if LOG_ACCESS_DATA: parsed_log[ip].setdefault('accessed_files', []).append((data['%t'], data['%r'])) if PRINT_PROGRESS: print "UPDATED: " + ip + " LINES: " + str(current_line) + "/" + total_lines except: print "Parse error: %s" % line current_line += 1 #output file in pretty JSON print with open(outfile, 'w') as output: json.dump(parsed_log, output, sort_keys=True, indent=4, separators=(',', ': '))
#!/usr/bin/python
import apachelog, sys

urls = {}

f1 = apachelog.parser(apachelog.formats['extended'])

# Format copied and pasted from Apache conf - use raw string + single quotes
format2 = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
# 212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
#   200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
#   "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"

format3 = r'%h %l %u %t \"%r\" %>s %b'
# 10.50.180.180 - - [18/Apr/2012:01:26:10 -0700] "GET /stylesheet/media/bullet1.gif HTTP/1.1" 404 226

# p = apachelog.parser(format)
# Common Log Format (CLF)
# p = apachelog.parser(apachelog.formats['common'])
# Common Log Format with Virtual Host
# p = apachelog.parser(apachelog.formats['vhcommon'])
# NCSA extended/combined log format
f2 = apachelog.parser(format2)
f3 = apachelog.parser(format3)

for line in open('access_log'):
    try:
def __init__(self, path=None):
    super(ApacheLogfileParser, self).__init__(path)
    log_format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    self.parser = apachelog.parser(log_format)
    self.api = API()
    logging.basicConfig(level=logging.INFO)
def _parsefile(self, logfile_obj): # Create a parser for this file parser = apachelog.parser(self.uasp_format) filename = logfile_obj.file_path + logfile_obj.file_name # Attempt to determine the number of lines in the log log = open(filename) for line in log: self.import_stats['line_count'] += 1 print str(self.import_stats.get('line_count')) + " lines to parse. Beginning at line " + str(self.import_stats.get('import_startline')) + "\n" log.close() log = open(filename) previous_line = "" for line in log: # Update stats self.import_stats['line_counter'] += 1 if self.import_stats.get('line_counter') < self.import_stats.get('import_startline'): # Skip through to the specified line number previous_line = line continue # # Test for duplicate log entries immediately preceding # if line == previous_line: # self._errorlog("##### DUPLICATE LINE DETECTED ##### \n" +\ # "Line # :" + str(self.import_stats.get('line_counter')) + "\n" +\ # "Line : " + str(line) + "\n") # self.import_stats['duplicatecount'] += 1 # else: # # Parse and store the line # Removing the Duplicate line detection as I can't rule out the 0.5-1% of duplicates as being invalid yet: CM 9-12-11 self._parseline(parser, line, logfile_obj) # Print progress report every 500 lines. if (self.import_stats.get('line_counter') % 500) == 0: # Calculate the average rate of import for the whole process try: self.import_stats['import_rate'] = \ float(self.import_stats.get('line_counter') - self.import_stats.get('import_startline')) /\ float((datetime.datetime.utcnow() - self.import_stats.get('import_starttime')).seconds) except ZeroDivisionError: self.import_stats['import_rate'] = 1 # Calculate how long till finished try: efs = int( float(self.import_stats.get('line_count') - self.import_stats.get('line_counter')) /\ float(self.import_stats.get('import_rate')) ) except ZeroDivisionError: efs = 1 efhr = efs // (60*60) efs = efs % (60*60) efmin = efs // 60 efsec = efs % 60 efstring = str(efhr) + "h " + str(efmin) + "m " + str(efsec) + "s." # Output the status print str(datetime.datetime.utcnow()) + ": " +\ str((float(self.import_stats.get('line_counter')) / float(self.import_stats.get('line_count')))*100)[0:5] + "% completed. " +\ "Parsed " + str(self.import_stats.get('line_counter')) + " lines. " +\ "Duplicates: " + str(self.import_stats.get('duplicatecount')) + ". " +\ "Rate: " + str(self.import_stats.get('import_rate'))[0:6] + " lines/sec. " +\ "Est. finish in " + efstring # Write the error cache to disk self._error_log_save() # Update duplicate line string for next pass previous_line = line return None
def main(): import gdw.stats.scripts parseZCML(gdw.stats.scripts, 'scripts.zcml') db = getUtility(IDatabase, 'postgres') session = db.session initialize_declarative_mappers(DeclarativeBase, db.metadata) initialize_defered_mappers(db.metadata) p = apachelog.parser(FORMAT) logfilePath, website = getConfig() maxDate = getMaxDate(website).max_1 if maxDate is None: maxDate = datetime(1970, 1, 1) cpt = 0 for line in open(logfilePath): try: data = p.parse(line) except: continue if data is None: continue date = apachelog.parse_date(data['%t']) date = datetime(*time.strptime(date[0], '%Y%m%d%H%M%S')[:6]) if date <= maxDate: continue code = data['%>s'] if int(code) not in VALID_HTTP_CODE: continue path = re.match('(.*) (.*) (.*)', data['%r']).group(2) path = urlparse.urlparse(path)[2] # path : '/hebergement/logement/beauraing/hebid' if len(path.strip('/').split('/')) != 4: continue if path.lstrip('/').split('/')[0] != 'hebergement': continue if path.endswith('/view'): continue if path.endswith('/gallery'): continue hebid = path.rstrip('/').split('/')[-1] if os.path.splitext(hebid)[1] in SKIP_TYPES: continue if hebid.lower() in ['robots.txt', '/misc_/ExternalEditor/edit_icon']: continue if 'manage_' in hebid.lower(): continue if '/p_/' in path.lower(): continue if '/misc_/' in path.lower(): continue agent = data['%{User-Agent}i'] stop = False agent_lower = agent.lower() for robot in ROBOTS: if robot in agent_lower: stop = True break if stop: continue host = data['%h'] if host in SKIP_HOSTS: continue cpt += 1 if cpt % 10000 == 0: session.flush() session.commit() print cpt heb_pk = getHebPkFromId(hebid) logline = LogItem() logline.log_date = date logline.log_path = path logline.log_hebid = hebid logline.log_hebpk = heb_pk logline.log_host = host logline.log_agent = agent logline.log_website = website session.add(logline) maxDate = date session.flush() session.commit()
def parse_apache_logfile(db_conn, logfile, start_date): # A list of extensions used to decide which files are counted as hits valid_extensions = [ ".tar.gz", ".dmg", ".exe", ".zip", ".tgz", ".tbz", ".bz2" ] format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"' p = apachelog.parser(format) num_lines = 0 ignored_old_entries = 0 next_progress = time.time() + 3 for line in logfile: if time.time() > next_progress: next_progress = time.time() + 3 print("line", num_lines) num_lines += 1 try: data = p.parse(line) code = int(data['%>s']) except: code = -1 # Get the code, if it's not 200 (success) we ignore it if code == 200: # Break the request URL into / separated pieces request_data = data['%r'].split() if len(request_data) <= 1: continue request_url = request_data[1] breakdown = request_url.split('/') # If the URL starts with /cig/software or /cig/software/github if len(breakdown) >= 4 and breakdown[1] == 'cig' and breakdown[ 2] == 'software': # Then the third subpath is the code name (fourth if it's a github redirect) if breakdown[3] == 'github': cig_code = breakdown[4] else: cig_code = breakdown[3] # Assume the file name is the final path element url_end = breakdown[-1] valid = False # Confirm that the file name ends with one of the valid extensions for ext in valid_extensions: if url_end.endswith(ext): valid = True break # Get the timestamp of the request timestamp = data['%t'].split()[0] parsed_ts = datetime.datetime.strptime(timestamp, "[%d/%b/%Y:%H:%M:%S") if parsed_ts < start_date: ignored_old_entries += 1 elif valid: # Now we have confirmed this hit is a valid download # First ensure the corresponding filename and package are in the DB add_file_package(db_conn, url_end, cig_code) # First we have to convert the IP address to an IP number host_name = data['%h'] try: ip_addr = socket.gethostbyname(host_name) except: continue # Get the IP address associated with the host name ip_num = ip_addr_to_ip_num(ip_addr) # Then add the hit in the database add_hit(db_conn, url_end, cig_code, ip_num, parsed_ts) return ignored_old_entries
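# The function above relies on an ip_addr_to_ip_num helper that is not included in the
# snippet. A minimal sketch of such a conversion, assuming the usual dotted-quad-to-integer
# encoding; only the helper name comes from the call site, the rest is an assumption.
import socket
import struct

def ip_addr_to_ip_num(ip_addr):
    """Hypothetical helper: convert a dotted-quad IPv4 address to a 32-bit integer."""
    return struct.unpack("!I", socket.inet_aton(ip_addr))[0]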
logfiles = os.listdir(LOGS_DIR)

for file in logfiles:
    fileloaded = open(LOGS_DIR + "/" + file, 'r')
    cf = "country_" + file
    if proc_files.find({'_id': cf}).count() == 0:
        pfpost = {}
        pfpost['proc_date'] = date.isoformat(date.today())
        pfpost['_id'] = cf
        proc_files.insert(pfpost)
        print "processing " + file
        count = 0
        for line in fileloaded:
            if "GET /scielo.php?script=" in line:
                count = count + 1
                p = apachelog.parser(APACHE_LOG_FORMAT)
                try:
                    data = p.parse(line)
                except:
                    sys.stderr.write("Unable to parse %s" % line)
                    continue  # skip lines that fail to parse
                if MONTH_DICT.has_key(data['%t'][4:7].upper()):
                    month = MONTH_DICT[data['%t'][4:7].upper()]
                else:
                    continue
                dat = data['%t'][8:12] + month
                url = data['%r'].split(' ')[1]
                ip = data['%h']
                i2pc = IP2Country(verbose=False)
inputfn = sys.argv[1]

# log_format main '$remote_addr - $remote_user [$time_local] "$request" '
#                 '$status $body_bytes_sent "$http_referer" '
#                 '"$http_user_agent" "$http_x_forwarded_for" "$request_time" "$host" "${app_key}-${receiver_type}" "$versionno"' ;
# log_format main '$remote_addr - $remote_user [$time_local] "$request" '
#                 '$status $body_bytes_sent "$http_referer" '
#                 '"$http_user_agent" "$http_x_forwarded_for" "$upstream_addr"'

# format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{X-FORWARDED-FOR}i\" %T %U'
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{X-FORWARDED-FOR}i\" %T'
# format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" "%a" %T'
format2 = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
format3 = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %a %T %U'
# %h %t %>s %b %T
p = apachelog.parser(format)
p2 = apachelog.parser(format2)
p3 = apachelog.parser(format3)

if not os.path.exists(inputfn):
    print 'input file does not exist'
    sys.exit(0)

f = open(inputfn, 'r')
while True:
    line = f.readline()
    if not line:
        break
    try:
        d = p.parse(line)
        has_rqstime = 1