Example #1
import sys

import apachelog


def apache_2_sql(logfile):
    "Convert an Apache logfile to a MySQL table"

    p = apachelog.parser(apachelog.formats['extended'])
    for line in open(logfile):
        try:
            data = p.parse(line)
        except apachelog.ApacheLogParserError:
            sys.stderr.write("Unable to parse %s" % line)
            continue  # skip lines the parser cannot handle

        converted_date = apachelog.parse_date(data['%t'])
        converted_request = data['%r'].lower().strip()

        print data
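A quick usage sketch for the snippet above: apachelog.formats['extended'] corresponds to Apache's combined log format, so a line shaped like the one below should parse. The sample line is invented for illustration.

import apachelog

# An invented log line in the "extended" (combined) format.
sample = ('127.0.0.1 - - [25/Jul/2009:10:40:05 +0200] '
          '"GET /index.html HTTP/1.1" 200 2326 '
          '"http://example.com/" "Mozilla/5.0"')

p = apachelog.parser(apachelog.formats['extended'])
data = p.parse(sample)
print data['%h']                            # '127.0.0.1'
print data['%r']                            # 'GET /index.html HTTP/1.1'
print apachelog.parse_date(data['%t'])[0]   # '20090725104005'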
Example #2
def parse_line(line, log_parser, latency_in_millis=False):
    """
    Convert a line from a log into LogRecord.

    Contains code that parses log records. This code may need to be changed if Apache log format changes.
    @param unicode line: log line to parse
    @param ApacheLogParser log_parser: instance of ApacheLogParser containing log format description
    @param boolean latency_in_millis: if True, latency is considered to be in milliseconds, otherwise in microseconds
    """
    record = log_record.LogRecord()

    try:
        data = log_parser.parse(line)
    except apachelog.ApacheLogParserError:
        logger.warn('Parser has caught an error while processing the following record: ')
        logger.warn(line)
        return None

    try:
        record.time = apachelog.parse_date(data['%t'])
    except (IndexError, KeyError):
        # Use .get() here: subscripting data['%t'] again would re-raise
        # the KeyError inside the handler if '%t' is missing.
        logger.warn('Parser was not able to parse date %s' % data.get('%t'))
        logger.warn('Record with error: %s' % line)
        return None

    record.line = line

    request = data['%r']
    try:
        record.raw_request = request.split(' ')[1]
    except IndexError:
        logger.warn('Parser was not able to parse the request %s' % request)
        logger.warn('Record with error: %s' % line)
        return None

    try:
        record.response_code = int(data['%>s'])
    except ValueError:
        logger.warn('Parser was not able to parse response code %s' % data['%>s'])
        logger.warn('Record with error: %s' % line)
        return None

    latency = data['%D']
    # An integral value in milliseconds gets three zeros appended,
    # converting it to microseconds before parsing.
    if latency_in_millis and '.' not in latency:
        latency += '000'
    record.latency = parse_latency(latency)

    return record
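A hedged sketch of how parse_line might be driven. The format string and sample line below are invented; they just need to carry the %t, %r, %>s and %D fields the function reads. log_record, parse_latency and logger are module-level names from the surrounding source, not shown here.

import apachelog

# Hypothetical format: host, logname, user, time, request, status, size, latency.
LOG_FORMAT = r'%h %l %u %t \"%r\" %>s %b %D'
log_parser = apachelog.parser(LOG_FORMAT)

# Invented sample line matching the format above (latency in microseconds).
sample = ('127.0.0.1 - - [25/Jul/2009:10:40:05 +0200] '
          '"GET /index.html HTTP/1.1" 200 2326 1500')

record = parse_line(sample, log_parser)
if record is not None:
    print record.raw_request, record.response_code, record.latency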
Example #3
def main():
    import gdw.stats.scripts
    parseZCML(gdw.stats.scripts, 'scripts.zcml')
    db = getUtility(IDatabase, 'postgres')
    session = db.session
    initialize_declarative_mappers(DeclarativeBase, db.metadata)
    initialize_defered_mappers(db.metadata)
    p = apachelog.parser(FORMAT)
    logfilePath, website = getConfig()
    maxDate = getMaxDate(website).max_1
    if maxDate is None:
        maxDate = datetime(1970, 1, 1)
    cpt = 0
    for line in open(logfilePath):
        try:
            data = p.parse(line)
        except apachelog.ApacheLogParserError:
            # Skip lines the parser cannot handle.
            continue
        if data is None:
            continue
        # apachelog.parse_date() returns ('YYYYMMDDHHMMSS', offset);
        # turn the timestamp string into a datetime.
        date = apachelog.parse_date(data['%t'])
        date = datetime(*time.strptime(date[0], '%Y%m%d%H%M%S')[:6])
        if date <= maxDate:
            continue
        code = data['%>s']
        if int(code) not in VALID_HTTP_CODE:
            continue
        # '%r' is 'METHOD path PROTOCOL'; keep only the path component.
        path = re.match('(.*) (.*) (.*)', data['%r']).group(2)
        path = urlparse.urlparse(path)[2]
        # path : '/hebergement/logement/beauraing/hebid'
        if len(path.strip('/').split('/')) != 4:
            continue
        if path.lstrip('/').split('/')[0] != 'hebergement':
            continue
        if path.endswith('/view'):
            continue
        if path.endswith('/gallery'):
            continue
        hebid = path.rstrip('/').split('/')[-1]
        if os.path.splitext(hebid)[1] in SKIP_TYPES:
            continue
        if hebid.lower() in ['robots.txt',
                             '/misc_/ExternalEditor/edit_icon']:
            continue
        if 'manage_' in hebid.lower():
            continue
        if '/p_/' in path.lower():
            continue
        if '/misc_/' in path.lower():
            continue
        agent = data['%{User-Agent}i']
        stop = False
        agent_lower = agent.lower()
        for robot in ROBOTS:
            if robot in agent_lower:
                stop = True
                break
        if stop:
            continue
        host = data['%h']
        if host in SKIP_HOSTS:
            continue
        cpt += 1
        if cpt % 10000 == 0:
            session.flush()
            session.commit()
            print cpt
        heb_pk = getHebPkFromId(hebid)
        logline = LogItem()
        logline.log_date = date
        logline.log_path = path
        logline.log_hebid = hebid
        logline.log_hebpk = heb_pk
        logline.log_host = host
        logline.log_agent = agent
        logline.log_website = website
        session.add(logline)
        maxDate = date
    session.flush()
    session.commit()
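The two-step date conversion inside the loop above can be read in isolation; a hypothetical helper distilled from it (the function name is invented):

import time
from datetime import datetime

import apachelog

def apache_time_to_datetime(t_field):
    # apachelog.parse_date() returns ('YYYYMMDDHHMMSS', offset);
    # the offset is discarded here, as in the loop above.
    datestr, offset = apachelog.parse_date(t_field)
    return datetime(*time.strptime(datestr, '%Y%m%d%H%M%S')[:6])

print apache_time_to_datetime('[25/Jul/2009:10:40:05 +0200]')
# -> 2009-07-25 10:40:05 (naive datetime)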
Example #4
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = OptionParser()
    parser.add_option(
        "-v",
        "--vhost",
        dest="vhostoverride",
        help="if logfile doesn't include vhost column, override it with this",
        metavar="VHOST")
    parser.add_option("-d",
                      "--debug",
                      action="store_true",
                      dest="debug",
                      help="Turn on debugging output",
                      default=False)
    (options, args) = parser.parse_args(argv)

    if options.debug:
        print >> sys.stderr, "Debug mode activated"

    OUTPUTDIR = "/opt/pysk/wwwlogs"

    # Get list of vhosts
    db = psycopg2.connect(
        "host='localhost' user='******' password='******' dbname='pysk'"
    )
    cursor = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    query = "SELECT trim(both '.' from vh.name || '.' || d.name) as vhost FROM vps_virtualhost vh, vps_domain d WHERE vh.domain_id = d.id ORDER BY vhost"
    cursor.execute(query)
    rows = cursor.fetchall()

    vhosts = {}
    for row in rows:
        vhosts[row["vhost"]] = {"logfile": None}

    # RDNS
    exitEvent = threading.Event()
    inputQueue = Queue.Queue()
    resolvedIPDict = {}
    outputDict = {}
    # Spawn 100 reverse-DNS worker threads.
    workers = [
        ResolveThread(inputQueue, outputDict, exitEvent)
        for i in range(100)
    ]
    for worker in workers:
        worker.start()

    # Log formats
    p_igowo = apachelog.parser(apachelog.formats["igowo"])
    p_vhextendedio = apachelog.parser(apachelog.formats["vhextendedio"])
    p_vhextended = apachelog.parser(apachelog.formats["vhextended"])
    p_extended = apachelog.parser(apachelog.formats["extended"])

    for fname in glob(os.path.join(OUTPUTDIR, "inbox") + "/*"):
        fname = os.path.realpath(fname)
        print "Processing %s ..." % (fname, )
        with open(fname, "rb") as f:
            for line in f:
                # Try each known log format in turn, from most to least specific
                try:
                    try:
                        data = p_igowo.parse(line)
                    except apachelog.ApacheLogParserError:
                        try:
                            data = p_vhextendedio.parse(line)
                        except apachelog.ApacheLogParserError:
                            try:
                                data = p_vhextended.parse(line)
                            except apachelog.ApacheLogParserError:
                                if options.vhostoverride:
                                    data = p_extended.parse(line)
                                    data["%v"] = options.vhostoverride
                                else:
                                    raise

                    if cursor is not None:
                        vhost = data["%v"]

                        if vhost not in vhosts:
                            continue

                        # Create a new logfile if we haven't already done so
                        if vhosts[vhost]["logfile"] is None:
                            vhosts[vhost]["logfile"] = NamedTemporaryFile(
                                prefix=vhost,
                                dir=os.path.join(OUTPUTDIR, "temp"))
                        logfile = vhosts[vhost]["logfile"]

                        #if "%A" in data:
                        #    local_ip = data["%A"]
                        #else:
                        #    local_ip = ""

                        if "%D" in data:
                            utime = data["%D"]
                            # stored in milliseconds instead of microseconds?
                            if '.' in utime:
                                utime = int(float(utime) * 1000)
                        else:
                            utime = None
                        r_host = data["%h"]

                        # Resolve the host, taking care not to resolve the same IP twice
                        if r_host not in resolvedIPDict:
                            resolvedIPDict[r_host] = True
                            inputQueue.put(r_host)

                        #r_logname = data["%l"]
                        r_user = data["%u"]
                        req_dt = apachelog.parse_date(data["%t"])

                        request = data["%r"]
                        status = int(data["%>s"])

                        if data["%b"] != "-":
                            response_size = int(data["%b"])
                        else:
                            response_size = 0

                        referer = data["%{Referer}i"]
                        user_agent = data["%{User-Agent}i"]

                        if "%I" in data:
                            bytes_recv = int(data["%I"])
                        else:
                            bytes_recv = None

                        if "%O" in data:
                            bytes_sent = int(data["%O"])
                        else:
                            bytes_sent = None

                        # Build logline
                        logline = u'%010d %s - %s [%s +0000] "%s" %s %s "%s" "%s"' % (
                            time.mktime(req_dt.timetuple()), r_host, r_user,
                            req_dt.strftime("%d/%b/%Y:%H:%M:%S"), request,
                            status, response_size, referer, user_agent)
                        # If input/output bytes available, append them
                        if bytes_recv and bytes_sent:
                            logline += " %s %s" % (
                                bytes_recv,
                                bytes_sent,
                            )

                        logfile.write(logline.encode("utf-8") + "\n")
                except UnicodeDecodeError:
                    if options.debug:
                        print >> sys.stderr, "UnicodeDecodeError on line %s" % line
                except apachelog.ApacheLogParserError:
                    if options.debug:
                        print >> sys.stderr, "ApacheLogParserError on line %s" % line
                except:
                    sys.stderr.write("Unable to parse %s" % line)
                    raise
        # Delete the processed logfile from the inbox
        os.unlink(fname)

    # Sort logfiles by date, strip timestamp field
    for (vhname, vh) in vhosts.iteritems():
        if not vh["logfile"] is None:
            print "Sorting %s ..." % (vh["logfile"].name, )
            # Create output logfile
            (sorted_logfile_handle, sorted_logfile_name) = mkstemp(
                prefix=vhname, dir=os.path.join(OUTPUTDIR, "temp"))
            sorted_logfile = os.fdopen(sorted_logfile_handle, "w+b")

            # Process input -> output
            p1 = Popen(["sort", "-n", vh["logfile"].name], stdout=PIPE)
            p2 = Popen(["cut", "-d ", "-f2-"],
                       stdin=p1.stdout,
                       stdout=sorted_logfile)
            p1.wait()
            p2.wait()

            # Close input (deletes the file)
            vh["logfile"].close()

            # Close output and atomically move it into the "pending" directory for further processing
            sorted_logfile.close()
            pending_dir = os.path.join(OUTPUTDIR, "pending", vhname)
            if not os.path.exists(pending_dir):
                os.makedirs(pending_dir)
            timestamp = int(time.mktime(datetime.now().timetuple()))
            os.rename(sorted_logfile_name,
                      os.path.join(pending_dir,
                                   str(timestamp) + ".log"))

    # Wait until all rdns workers are finished
    exitEvent.set()
    for worker in workers:
        worker.join()

    # Generate DNS cache
    if len(outputDict) > 0:
        with open(os.path.join(OUTPUTDIR, "dnscache.txt"), "w+b") as f:
            for (ip, rdns) in outputDict.iteritems():
                f.write("%s %s\n" % (ip, rdns))

    # Delete old config files
    for f in glob("/etc/awstats/awstats.*.conf"):
        os.unlink(f)

    # Generate new config files
    for (vhname, vh) in vhosts.iteritems():
        conffile = "/etc/awstats/awstats.%s.conf" % vhname
        with open(conffile + ".new", "w") as f:
            logfilesdir = os.path.join(OUTPUTDIR, "pending", vhname, "*.log")
            f.write('LogFile="/opt/pysk/tools/logfiles/logmerge.py %s |"\n' %
                    logfilesdir)
            f.write('SiteDomain="%s"\n' % vhname)
            f.write('HostAliases="www.%s"\n' % vhname)
            f.write('DirData="/var/lib/awstats/%s/"\n' % vhname)
            f.write('Include "/etc/awstats/awstats.conf.local"\n')
        os.rename(conffile + ".new", conffile)

    # Preprocess pending logfiles before statistics run

    ## Delete empty logfiles
    call([
        "/usr/bin/find",
        os.path.join(OUTPUTDIR, "pending"), "-name", "*.log", "-size", "0",
        "-delete"
    ])

    # Run statistics

    ## Create list of vhosts which have logfiles
    vhosts_with_logs = list(
        set([
            os.path.basename(os.path.dirname(i))
            for i in glob(os.path.join(OUTPUTDIR, "pending", "*", "*.log"))
        ]))

    ## Run awstats for these vhosts
    for v in vhosts_with_logs:
        call([
            "/usr/local/awstats/wwwroot/cgi-bin/awstats.pl",
            "-config=%s" % (v, ), "-showcorrupted"
        ])

    # Finalize processed logfiles
    processed_logfiles = glob(
        os.path.join(OUTPUTDIR, "processed", "*", "*.log"))

    ## Compress with bzip2 -9
    for pl in processed_logfiles:
        call(["bzip2", "-9", pl])

    # Fix permissions of awstats directory
    call("chmod 0750 /var/lib/awstats", shell=True)
    call("chmod 0750 /var/lib/awstats/*", shell=True)
    call("chmod 0660 /var/lib/awstats/*/*", shell=True)
    call("chown pysk:http /var/lib/awstats", shell=True)
    call("chown pysk:http /var/lib/awstats/*", shell=True)
    call("chown pysk:http /var/lib/awstats/*/*", shell=True)
    call("find /var/lib/awstats/ -name \"*.tmp.*\" -delete", shell=True)
Example #5
    oparser.add_option('-s', '--starttime',
        action='store', dest='starttime', default='00000000000000', type='string',
        help = 'Include entries starting from time expressed as YYYYMMDDhhmmss')
    oparser.add_option('-e', '--endtime', 
        action='store', dest='endtime', default='99999999999999', type='string',
        help = 'Include entries ending in time expressed as YYYYMMDDhhmmss')

    options, args = oparser.parse_args()
    return options, args


if __name__ == '__main__':
    options, args = initialize()
    p = apachelog.parser(options.logformat)
    minutes = {}  # minute bucket -> hit count

    for line in open(options.inputfile):
        try:
            data = p.parse(line)
        except apachelog.ApacheLogParserError:
            sys.stderr.write("Unable to parse %s" % line)
            continue

        datestr, offset = apachelog.parse_date(data['%t'])
        if options.starttime < datestr < options.endtime:
            minute = datestr[:-2]
            try:
                minutes[minute] += 1
            except KeyError:
                minutes[minute] = 1

    for k, v in sorted(minutes.iteritems()):
        # print ISO 8601 format date
        print "%s-%s-%sT%s:%s %s" % (k[:4], k[4:6], k[6:8], k[8:10], k[10:12], v)