Example #1
  def _summarise(self, stats):
    debug("LOGREADER", 1, "producing global summaries")
    timebins = sorted(set(stats.keys())) + ["TOTAL"]
    svcs = set(s.service for data in stats.values() for s in data.keys())
    svckeys = [StatsKey(x) for x in ["ALL"] + sorted(svcs)]

    for tb in timebins:
      for svc in svckeys:
        if tb not in stats:
          stats[tb] = {}
        if svc not in stats[tb]:
          stats[tb][svc] = StatsData()

    allkey = StatsKey("ALL")
    for timebin, data in stats.iteritems():
      for s, v in data.iteritems():
        svckey = StatsKey(s.service)
        if timebin != "TOTAL" and s.service != "ALL" and svckey != s:
          if s not in stats["TOTAL"]:
            stats["TOTAL"][s] = StatsData()
          stats["TOTAL"][s] += v
          stats["TOTAL"][allkey] += v
          stats["TOTAL"][svckey] += v
          stats[timebin][allkey] += v
          stats[timebin][svckey] += v

    return timebins, svckeys
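
The inner loop above folds every detailed entry into five aggregate cells:
the "ALL" and per-service summaries for its own time bin, plus the detailed,
"ALL" and per-service summaries under "TOTAL". A minimal runnable sketch of
that accumulation, with hypothetical stand-ins for StatsKey and StatsData
(only the hashing and += behaviour the loop relies on are stubbed here):

class StatsKey(object):
  def __init__(self, service, instance=None, subclass=None, api=None):
    self.service = service
    self.key = (service, instance, subclass, api)
  def __hash__(self):
    return hash(self.key)
  def __eq__(self, other):
    return self.key == other.key
  def __ne__(self, other):
    return not self.__eq__(other)

class StatsData(object):
  def __init__(self, num=0):
    self.num = num
  def __iadd__(self, other):
    self.num += other.num
    return self

# One detailed entry: five requests recorded under a full service key.
allkey, svckey = StatsKey("ALL"), StatsKey("svc")
s, v = StatsKey("svc", "prod", "sub", "api"), StatsData(5)
stats = {"20121001": {s: v, allkey: StatsData(), svckey: StatsData()},
         "TOTAL": {s: StatsData(), allkey: StatsData(), svckey: StatsData()}}

# The five += statements of _summarise touch exactly these cells.
for cell in (stats["TOTAL"][s], stats["TOTAL"][allkey],
             stats["TOTAL"][svckey], stats["20121001"][allkey],
             stats["20121001"][svckey]):
  cell += v
assert stats["TOTAL"][allkey].num == 5
assert stats["20121001"][svckey].num == 5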
Example #2
  def _resolve(self, stats):
    debug("LOGREADER", 1, "received all results, resolving addresses")
    addrs = set()
    for daydata in stats.values():
      for stat in daydata.values():
        addrs.update(stat.ips.keys())

    self.qresolver.put(addrs)
    self.qresolver.put(None)
    self.resolver.join()
    self.resolver = None
    self.qresolver = None
    ipcache = self.ip2i.query()

    debug("LOGREADER", 1, "remapping ip addresses to detailed info")
    for daydata in stats.values():
      for stat in daydata.values():
        for ip, v in stat.ips.iteritems():
          ipi = ipcache[ip]
          for m, val in ((stat.hosts, ipi.hostname),
                         (stat.domains, ipi.domain),
                         (stat.countries,
                          ipi.geoip.country or ipi.asn.country),
                         (stat.locations,
                          ", ".join(x for x in
                                    (ipi.geoip.country,
                                     ipi.geoip.region,
                                     ipi.geoip.city)
                                    if x is not None))):
            if val not in m:
              m[val] = StatsByteTiming()
            m[val] += v
Example #3
    def _submit_ptr(self, callback, ip, type=rr.PTR):
        """Submit a DNS reverse lookup query, but avoid making duplicates.

    If there is already a query ongoing for the given reverse IP address,
    no new query is submitted; instead the callback is recorded in
    `self.ptrmap` so it is invoked when the existing query completes.
    The purpose is to let the caller make reverse IP address queries for
    a given "destination" address, and to avoid any excess queries when
    making network address scans for multiple "origin" addresses.

    Addresses which fail reverse lookup with a permanent error such as
    NXDOMAIN are remembered. Future queries on those addresses are
    short-circuited and immediately invoke the `callback` without
    issuing a new DNS query.

    @param callback -- function to call back if the query is answered
    @param ip -- the reversed IP address to look up
    @param type -- request record type (adns.rr.*).
    """
        # If known to fail, skip.
        if ip in self.ptrfail:
            callback(None)
            return

        # Add to pending list of ptr lookups
        if ip not in self.ptrmap:
            self.ptrmap[ip] = []
        self.ptrmap[ip].append(callback)

        # Create DNS query if this is first pending lookup for this address.
        if len(self.ptrmap[ip]) == 1:
            debug("IP2INFO", 3, "submitting ptr lookup of %s", ip)
            self._submit(ip.reverse_dns, type, self._ptr_result, ip)
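
Worth noting: the method above is a generic fan-out de-duplication. A
minimal self-contained sketch of just that pattern, with the DNS machinery
replaced by a hypothetical deliver() step (in the real class, _ptr_result
in Example #14 plays that role):

import sys

class PtrDemo(object):
    def __init__(self):
        self.ptrmap = {}

    def submit(self, callback, ip):
        # Queue the callback; only the first submission does real work.
        if ip not in self.ptrmap:
            self.ptrmap[ip] = []
        self.ptrmap[ip].append(callback)
        if len(self.ptrmap[ip]) == 1:
            sys.stdout.write("issuing one real query for %s\n" % ip)

    def deliver(self, ip, hostname):
        # Fan the single answer out to every queued callback.
        for callback in self.ptrmap.pop(ip):
            callback(hostname)

demo = PtrDemo()
demo.submit(lambda h: sys.stdout.write("scan-a got %s\n" % h), "192.0.2.1")
demo.submit(lambda h: sys.stdout.write("scan-b got %s\n" % h), "192.0.2.1")
demo.deliver("192.0.2.1", "host.example.org")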
Example #4
  def run(self):
    # The statistics are out of date; (re)generate them.
    stats = self._parse()

    # Resolve IP addresses and close resolver.
    self._resolve(stats)

    # Produce summary.
    timebins, svckeys = self._summarise(stats)

    # Create stats directory for detailed results.
    statdir = self.statfile.replace(".txt", "")
    if not os.path.exists(statdir + "-yml"):
      os.mkdir(statdir + "-yml")
    if not os.path.exists(statdir + "-json"):
      os.mkdir(statdir + "-json")

    # Save the summary.
    self._save_text(timebins, svckeys, stats, statdir)
    self._save_yaml(timebins, svckeys, stats, statdir)
    self._save_json(timebins, svckeys, stats, statdir)

    # Update database stamp.
    open(self.dbfile, "w").write(self.mystamp)

    # Report resolver statistics.
    debug("LOGREADER", 1, "resolver stats:")
    resstat = self.ip2i.statistics()
    for key in sorted(resstat.keys()):
      status, name = key
      debug("LOGREADER", 1, " %7d %s (%s)", resstat[key], name, status)

    # Tell master we finished
    self.completed_all.put(None)
Example #5
    def run(self):
        # The statistics are out of date; (re)generate them.
        stats = self._parse()

        # Resolve IP addresses and close resolver.
        self._resolve(stats)

        # Produce summary.
        timebins, svckeys = self._summarise(stats)

        # Create stats directory for detailed results.
        statdir = self.statfile.replace(".txt", "")
        if not os.path.exists(statdir + "-yml"):
            os.mkdir(statdir + "-yml")
        if not os.path.exists(statdir + "-json"):
            os.mkdir(statdir + "-json")

        # Save the summary.
        self._save_text(timebins, svckeys, stats, statdir)
        self._save_yaml(timebins, svckeys, stats, statdir)
        self._save_json(timebins, svckeys, stats, statdir)

        # Update database stamp.
        open(self.dbfile, "w").write(self.mystamp)

        # Report resolver statistics.
        debug("LOGREADER", 1, "resolver stats:")
        resstat = self.ip2i.statistics()
        for key in sorted(resstat.keys()):
            status, name = key
            debug("LOGREADER", 1, " %7d %s (%s)", resstat[key], name, status)

        # Tell master we finished
        self.completed_all.put(None)
Example #6
    def _resolve(self, stats):
        debug("LOGREADER", 1, "received all results, resolving addresses")
        addrs = set()
        for daydata in stats.values():
            for stat in daydata.values():
                addrs.update(stat.ips.keys())

        self.qresolver.put(addrs)
        self.qresolver.put(None)
        self.resolver.join()
        self.resolver = None
        self.qresolver = None
        ipcache = self.ip2i.query()

        debug("LOGREADER", 1, "remapping ip addresses to detailed info")
        for daydata in stats.values():
            for stat in daydata.values():
                for ip, v in stat.ips.iteritems():
                    ipi = ipcache[ip]
                    for m, val in ((stat.hosts, ipi.hostname), (stat.domains,
                                                                ipi.domain),
                                   (stat.countries, ipi.geoip.country
                                    or ipi.asn.country),
                                   (stat.locations,
                                    ", ".join(x for x in (ipi.geoip.country,
                                                          ipi.geoip.region,
                                                          ipi.geoip.city)
                                              if x is not None))):
                        if val not in m:
                            m[val] = StatsByteTiming()
                        m[val] += v
Example #7
    def _summarise(self, stats):
        debug("LOGREADER", 1, "producing global summaries")
        timebins = sorted(set(stats.keys())) + ["TOTAL"]
        svcs = set(s.service for data in stats.values() for s in data.keys())
        svckeys = [StatsKey(x) for x in ["ALL"] + sorted(svcs)]

        for tb in timebins:
            for svc in svckeys:
                if tb not in stats:
                    stats[tb] = {}
                if svc not in stats[tb]:
                    stats[tb][svc] = StatsData()

        allkey = StatsKey("ALL")
        for timebin, data in stats.iteritems():
            for s, v in data.iteritems():
                svckey = StatsKey(s.service)
                if timebin != "TOTAL" and s.service != "ALL" and svckey != s:
                    if s not in stats["TOTAL"]:
                        stats["TOTAL"][s] = StatsData()
                    stats["TOTAL"][s] += v
                    stats["TOTAL"][allkey] += v
                    stats["TOTAL"][svckey] += v
                    stats[timebin][allkey] += v
                    stats[timebin][svckey] += v

        return timebins, svckeys
Example #8
    def _load(self):
        """Read the public suffix list.

    Reloads the database from the upstream source if there is no
    locally cached file, or the cached file is too old. Saves the
    cached data in YAML format, with punycoded names. If the cache
    file exists and is valid, it is loaded as is.

    Please note that the py2-yaml module used for reading and writing
    YAML should be built with libyaml for this to perform optimally.
    """
        if not os.path.exists(self.path) \
           or time() - os.stat(self.path)[ST_MTIME] >= 15 * 86400:
            debug("PSL", 1, "downloading fresh psl database")
            self._parse(urllib2.urlopen(self.url))
            newpath = self.path + ".new"
            newfile = open(newpath, "w")
            newfile.write(yaml.dump(self.psl, Dumper=YAMLDumper))
            newfile.close()
            if os.path.exists(self.path):
                os.remove(self.path)
            os.rename(newpath, self.path)
        else:
            debug("PSL", 1, "reading psl database %s", self.path)
            self.psl = yaml.load(open(self.path).read(), YAMLLoader)
Example #9
    def process(self, waittime=1):
        """Process DNS queries and callbacks for up to `waittime` seconds."""

        # Wait for any queries to complete. Issue any new tasks created.
        now = time()
        npending = self._issue()
        num = len(self.queries)
        until = now + waittime
        prevtime = now
        while len(self.queries):
            ndone = 0
            for q in self.res.completed(.25):
                (created, callback, args) = self.queries[q]
                del self.queries[q]
                callback(q.check(), *args)
                ndone += 1

            if ndone > 0:
                npending = self._issue()

            # See if we should quit. Throttle back if 'completed()' returned
            # quickly and we are busy looping.
            xnow = time()
            if xnow > until:
                break
            if xnow - prevtime < 0.1 and (npending or len(self.queries)):
                sleep(min(0.5, until - xnow))
            prevtime = xnow

        # Expire whatever was running too long, and report timing.
        self._timeout()
        debug("IP2INFO", 2, "processed %d dns queries in %.2f s,"
              " %d remain, %d pending", num,
              time() - now, len(self.queries), npending)
        return npending
Example #10
    def _asn_lookup_cc(self, info, task, tasks):
        """Perform final step of AS lookups, verifying country code for the
    autonomous system from cymru.com database using DNS lookups. This
    is more accurate than the code returned by initial AS lookups."""
        # Double check ASN lookup was really successful.
        if not info.asn.asn or not info.asn.asn.isdigit():
            return True

        # Define responder to country code lookup from cymru.com. Expects
        # 1-tuple answer matching RX_ASN_CC. Parse the one reply received.
        def responder(answer, addr):
            debug("IP2INFO", 3, "cc result %s %s %s", addr, info.asn.asn,
                  answer)
            self._tick_stats(answer[0])
            task.ongoing = False
            if len(answer[3]) > 0 and len(answer[3][0]) == 1:
                m = RX_ASN_CC.match(answer[3][0][0])
                if m and m.group(1) == info.asn.asn:
                    debug("IP2INFO", 2, "cc assigning %s = %s", m.group(1),
                          m.group(2))
                    info.asn.cc = m.group(2)
                    info.asn.rir = m.group(3)
                    info.asn.date = m.group(4)
                    info.asn.org = m.group(5)
                    info.asn.desc = m.group(7)
                    task.done = True

        debug("IP2INFO", 3, "submitting asn lookup %s", info.asn.asn)
        self._submit("as%s.asn.cymru.com" % info.asn.asn, rr.TXT, responder)
        return False
Example #11
 def responder(hostname):
     debug("IP2INFO", 3, "cidr result %s %s", info.ip, hostname)
     task.ongoing = False
     if hostname is not None:
         task.done = True
         info.cidrhost = hostname
         debug("IP2INFO", 2, "cidr hostname found %s: %s", info.ip,
               info.cidrhost)
Example #12
    def submit(self, iplist):
        """Submit IP addresses for query.

    Begins issuing queries for IP addresses in the input in parallel.
    Results will be accumulated in an internal table for later access.

    Resolving an address may involve the following queries:
    - GeoIP database lookup
    - IP address to name reverse lookup: maxtries
    - IP address to ASN reverse lookup: maxtries
    - ASN country code lookup: 2 tries
    - CIDR base address lookup: 1 try
    - Nearby address scan: 1 try.

    The resolver issues background queries for addresses in parallel.
    It is best if the client feeds in addresses to resolve at a steady
    rate, and calls `process()` regularly and frequently enough to
    collect results and to ensure continuous progress. This yields a
    query pattern which generally produces good results, avoids excess
    dropped DNS packets, and is resilient to sporadic failures.

    @param iplist -- Some iterable of IP addresses, either strings or
    IPAddress objects.

    @return No return value. Results can be retrieved with `query()`.
    """
        now = time()
        numreq = 0
        newreq = 0
        for ip in iplist:
            numreq += 1
            if isinstance(ip, str):
                ip = IPAddress(ip)

            ipstr = str(ip)
            if ipstr not in self.ipaddrs:
                tasks = [
                    IPTask(self._geoip_lookup, 1),
                    IPTask(self._asn_lookup_1, 1),
                    IPTask(self._hostname_lookup, self.maxtries),
                    IPTask(self._asn_lookup_2, self.maxtries),
                    IPTask(self._asn_lookup_3, self.maxtries),
                    IPTask(self._cidr_lookup, 1),
                    IPTask(self._wild_lookup, 1),
                    IPTask(self._domain_lookup, 1)
                ]
                info = IPInfo(ip)
                info.asn = self.NULL_ASN
                info.geoip = GeoIPLookup.NULL_GEOIP
                self.ipaddrs[ipstr] = (info, tasks)
                newreq += 1

        self._issue()
        debug("IP2INFO", 1, "submitted %d addresses, %d new in %.2f s", numreq,
              newreq,
              time() - now)
        return
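
A sketch of the feeding pattern this docstring recommends. The class name
IpInfoLookup and the constructor arguments are illustrative assumptions
(the actual constructor appears in Example #30); process() returns the
number of pending tasks, so it can drive the drain loop, and query()
returns the accumulated results as used in Example #2:

lookup = IpInfoLookup(cachedir="/var/cache/ipinfo")  # hypothetical name
for chunk in (["192.0.2.1", "192.0.2.2"], ["198.51.100.7"]):
    lookup.submit(chunk)           # feed addresses in steadily
    lookup.process(waittime=1)     # collect results, keep queries moving
while lookup.process(waittime=5):  # drain stragglers until nothing pends
    pass
ipinfo = lookup.query()["192.0.2.1"]  # e.g. ipinfo.hostname, ipinfo.asn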
Example #13
 def _submit(self, addr, type, callback, *extra):
     """Submit a DNS query.
 @param addr -- the address to look up
 @param type -- request record type (adns.rr.*)
 @param callback -- function to call back if the query is answered
 @param extra -- additional arguments to `callback`.
 """
     debug("IP2INFO", 3, "submitting lookup of %s, type %d", addr, type)
     self.queries[self.res.submit(addr, type)] = \
         (time(), callback, (addr,) + extra)
Example #14
 def _ptr_result(self, answer, addr, ip):
     """Respond to PTR query results."""
     debug("IP2INFO", 3, "ptr result %s %s %s", addr, ip, answer)
     self._tick_stats(answer[0])
     if answer[0] > adns.status.max_tempfail:
         # permanent failure, remember not to ask again
         debug("IP2INFO", 3, "blacklisting %s %s (%d)", ip,
               _adns_status_name_of(answer[0]), answer[0])
         self.ptrfail.add(ip)
     hostname = (len(answer[3]) > 0 and answer[3][0].lower()) or None
     for callback in self.ptrmap[ip]:
         callback(hostname)
     del self.ptrmap[ip]
Example #15
    def _parse(self):
        # Start resolver.
        self.resolver.start()

        # Build prototype stats result, with a bin for every time unit.
        stats = dict((strftime(self.time_format, gmtime(t)), {})
                     for t in xrange(self.lim[0], self.lim[1], self.time_unit))

        # Parse and resolve logs.
        qtasks = Queue()
        qresult = Queue()
        nproc = max(2, cpu_count() - 1)
        xprocs = xrange(nproc)
        procs = [
            LogParser(cpu, self.rx, self.parser, qtasks, qresult,
                      self.qresolver, self.lim[0], self.lim[1],
                      self.time_format) for cpu in xprocs
        ]

        map(lambda log: qtasks.put((log[6], log[2])), self.logs)
        map(lambda i: qtasks.put(None), xprocs)
        map(lambda p: p.start(), procs)

        # Wait for results.
        while nproc:
            procid, result = qresult.get()
            if result is None:
                nproc -= 1
                procs[procid].join()
                debug("LOGREADER", 1, "joined process #%d, %d still running",
                      procid, nproc)
                continue

            nsvcs = 0
            for timebin, data in result.iteritems():
                assert timebin in stats, "Unexpected %s result" % timebin
                for sk, sd in data.iteritems():
                    if sk not in stats[timebin]:
                        stats[timebin][sk] = StatsData()
                    stats[timebin][sk] += sd
                    nsvcs += 1
            debug(
                "LOGREADER", 1,
                "merged stats from process #%d, %d stats in %d timebins: %s",
                procid, nsvcs, len(result), " ".join(sorted(result.keys())))

        # Tell master we finished parse so it can start another aggregator.
        self.completed_parse.put(None)

        # Return stats.
        return stats
Example #16
  def run(self):
    try:
      while True:
        task = self.queue.get()
        if task is None:
          return

        debug("LOGREADER", 1, "#%d processing %s", self.id, " ".join(task))
        stats = self.parser(log_file_parse(self.rx, *task), *self.args)
        self.qresult.put((self.id, stats))

    finally:
      # Send result objects.
      self.qresult.put((self.id, None))
Example #17
  def _parse(self):
    # Start resolver.
    self.resolver.start()

    # Build prototype stats result, with a bin for every time unit.
    stats = dict((strftime(self.time_format, gmtime(t)), {})
                 for t in xrange(self.lim[0], self.lim[1], self.time_unit))

    # Parse and resolve logs.
    qtasks = Queue()
    qresult = Queue()
    nproc = max(2, cpu_count()-1)
    xprocs = xrange(nproc)
    procs = [LogParser(cpu, self.rx, self.parser, qtasks, qresult,
                       self.qresolver, self.lim[0], self.lim[1],
                       self.time_format)
             for cpu in xprocs]

    map(lambda log: qtasks.put((log[6], log[2])), self.logs)
    map(lambda i: qtasks.put(None), xprocs)
    map(lambda p: p.start(), procs)

    # Wait for results.
    while nproc:
      procid, result = qresult.get()
      if result is None:
        nproc -= 1
        procs[procid].join()
        debug("LOGREADER", 1, "joined process #%d, %d still running",
              procid, nproc)
        continue

      nsvcs = 0
      for timebin, data in result.iteritems():
        assert timebin in stats, "Unexpected %s result" % timebin
        for sk, sd in data.iteritems():
          if sk not in stats[timebin]:
            stats[timebin][sk] = StatsData()
          stats[timebin][sk] += sd
          nsvcs += 1
      debug("LOGREADER", 1,
            "merged stats from process #%d, %d stats in %d timebins: %s",
            procid, nsvcs, len(result), " ".join(sorted(result.keys())))

    # Tell master we finished parse so it can start another aggregator.
    self.completed_parse.put(None)

    # Return stats.
    return stats
Example #18
    def run(self):
        try:
            while True:
                task = self.queue.get()
                if task is None:
                    return

                debug("LOGREADER", 1, "#%d processing %s", self.id,
                      " ".join(task))
                stats = self.parser(log_file_parse(self.rx, *task), *self.args)
                self.qresult.put((self.id, stats))

        finally:
            # Send result objects.
            self.qresult.put((self.id, None))
Example #19
 def responder(answer, addr):
     debug("IP2INFO", 3, "cc result %s %s %s", addr, info.asn.asn,
           answer)
     self._tick_stats(answer[0])
     task.ongoing = False
     if len(answer[3]) > 0 and len(answer[3][0]) == 1:
         m = RX_ASN_CC.match(answer[3][0][0])
         if m and m.group(1) == info.asn.asn:
             debug("IP2INFO", 2, "cc assigning %s = %s", m.group(1),
                   m.group(2))
             info.asn.cc = m.group(2)
             info.asn.rir = m.group(3)
             info.asn.date = m.group(4)
             info.asn.org = m.group(5)
             info.asn.desc = m.group(7)
             task.done = True
Example #20
    def _timeout(self):
        """Cancel queries which have timed out after `self.maxtime`."""

        # Scan for queries which have been going on for too long.
        now = time()
        expired = []
        for q, info in self.queries.iteritems():
            if now > info[0] + self.maxtime:
                expired.append((q, info))

        # Now expire them. Call the callbacks so the tasks move on.
        debug("IP2INFO", 3, "cancelling %d timed out queries", len(expired))
        for q, info in expired:
            (created, callback, args) = info
            del self.queries[q]
            q.cancel()
            callback((-1, None, None, tuple()), *args)
Example #21
    def _asn_lookup_3(self, info, task, tasks):
        """Perform third step of AS lookups for IP addresses by using
    cymru.com reverse mapping DNS servers."""
        # Report ready if we already have a result.
        if info.asn != self.NULL_ASN:
            return True

        # Define responder to the cymru.com ASN lookup query. Expects a
        # 1-tuple "ASN | CIDR | CC | RIR | YYYY-MM-DD" answer. Keeps the
        # last record of the answer received, as it is the most specific
        # CIDR. If this creates a new ASInfo, it requests an ASN country
        # code lookup too.
        def responder(answer, addr):
            debug("IP2INFO", 3, "cymru result %s %s %s", addr, info.ip, answer)
            self._tick_stats(answer[0])
            task.ongoing = False
            if len(answer[3]) > 0 and len(answer[3][-1]) == 1:
                m = RX_ASN.match(answer[3][-1][0])
                if m:
                    task.done = True
                    if m.group(1) in self.asnmap:
                        debug("IP2INFO", 3, "cymru existing asn %s",
                              m.group(1))
                        info.asn = self.asnmap[m.group(1)]
                    else:
                        debug("IP2INFO", 2, "cymru new asn %s, cidr %s, cc %s",
                              m.group(1), m.group(2), m.group(3))
                        tasks.insert(1, IPTask(self._asn_lookup_cc, 2))
                        info.asn = self.asnmap[m.group(1)] = \
                            ASInfo(asn = m.group(1),
                                   cc = m.group(3),
                                   rir = m.group(4),
                                   date = m.group(5))

        # Do reverse TXT lookup on IP address from cymru.com DNS.
        revaddr = info.ip.reverse_dns
        if revaddr.endswith(".in-addr.arpa."):
            rev = revaddr[:-14] + ".origin.asn.cymru.com"
        elif revaddr.endswith(".ip6.arpa."):
            rev = revaddr[:-10] + ".origin6.asn.cymru.com"
        else:
            assert False, "reverse address %s not recognised" % revaddr
        debug("IP2INFO", 3, "submitting asn lookup %s", rev)
        self._submit(rev, rr.TXT, responder)
        return False
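
To make the zone rewriting above concrete, here is what it computes for a
sample IPv4 address, assuming netaddr's IPAddress (whose reverse_dns form,
with the trailing dot, is exactly what the endswith() checks expect):

from netaddr import IPAddress

revaddr = IPAddress("192.0.2.1").reverse_dns    # "1.2.0.192.in-addr.arpa."
rev = revaddr[:-14] + ".origin.asn.cymru.com"   # strip ".in-addr.arpa."
assert rev == "1.2.0.192.origin.asn.cymru.com"  # TXT queried at this name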
Example #22
  def _save_text(self, timebins, svckeys, stats, statdir):
    debug("LOGREADER", 1, "saving text summary")

    def underline(svc, line):
      return "  " + svc.service + line[len(svc.service):]

    def format(v):
      return ((" %10d" + " %10.1f" * 8 + " %7d" * 9) %
              (v.total.timing.num,
               v.total.timing.avg, v.total.timing.rms,
               v.total.timing.min, v.total.timing.max,
               v.total.bytes.avg, v.total.bytes.rms,
               v.total.bytes.min, v.total.bytes.max,
               len(v.methods), len(v.codes),
               len(v.users), len(v.browsers),
               len(v.ips), len(v.hosts), len(v.domains),
               len(v.countries), len(v.locations)))

    result = StringIO()
    titles1 = ("#request",
               "dt avg", "dt rms", "dt min", "dt max",
               "out avg", "out rms", "out min", "out max")
    titles2 = ("#method", "#code", "#user", "#brwser",
               "#ip", "#host", "#domain",
               "#c'ntry", "#loc'n")
    line = "_" * (11 * len(titles1) + 8 * len(titles2) - 2)

    result.write(" " * 11)
    result.writelines(underline(svc, line) for svc in svckeys)
    result.write("\n")
    result.write("%-10s" % "Date")
    result.writelines((" %10s" * len(titles1) + " %7s" * len(titles2))
                      % (titles1 + titles2) for svc in svckeys)
    result.write("\n")

    # Now add stats per service for each time bin.
    for timebin in timebins:
      result.write("%-10s" % timebin)
      result.writelines(format(stats[timebin][svc]) for svc in svckeys)
      result.write("\n")

    # Write it out.
    open(self.statfile, "w").write(result.getvalue())
Example #23
    def _asn_lookup_2(self, info, task, tasks):
        """Perform second step of AS lookups for IP addresses by using
    routeviews.org reverse mapping DNS servers."""
        # Report ready if we already have a result.
        if info.asn != self.NULL_ASN:
            return True

        # Define responder to the routeviews.org ASN lookup query. Expects
        # a TXT 3-tuple (asn, cidr, width) answer. Keeps the first answer
        # received. If this creates a new ASInfo, it requests an ASN
        # country code lookup too.
        def responder(answer, addr):
            debug("IP2INFO", 3, "routeviews result %s %s %s", addr, info.ip,
                  answer)
            self._tick_stats(answer[0])
            task.ongoing = False
            if len(answer[3]) > 0 and len(answer[3][0]) == 3:
                task.done = True
                (asn, cidr, w) = answer[3][0]
                info.cidr = IPNetwork("%s/%s" % (cidr, w))
                if asn in self.asnmap:
                    debug("IP2INFO", 3, "routeviews existing asn %s", asn)
                    info.asn = self.asnmap[asn]
                else:
                    debug("IP2INFO", 2, "routeviews new asn %s, cidr %s", asn,
                          info.cidr)
                    info.asn = self.asnmap[asn] = ASInfo(asn=asn)
                    tasks.insert(1, IPTask(self._asn_lookup_cc, 2))

        # Do reverse TXT lookup on IP address from routeviews.org DNS.
        revaddr = info.ip.reverse_dns
        if revaddr.endswith(".in-addr.arpa."):
            rev = revaddr[:-14] + ".asn.routeviews.org"
        elif revaddr.endswith(".ip6.arpa."):
            rev = revaddr[:-10] + ".asn.routeviews.org"
        else:
            assert False, "reverse address %s not recognised" % revaddr
        debug("IP2INFO", 3, "submitting asn lookup %s", rev)
        self._submit(rev, rr.TXT, responder)
        return False
Example #24
    def _save_text(self, timebins, svckeys, stats, statdir):
        debug("LOGREADER", 1, "saving text summary")

        def underline(svc, line):
            return "  " + svc.service + line[len(svc.service):]

        def format(v):
            return (
                (" %10d" + " %10.1f" * 8 + " %7d" * 9) %
                (v.total.timing.num, v.total.timing.avg, v.total.timing.rms,
                 v.total.timing.min, v.total.timing.max, v.total.bytes.avg,
                 v.total.bytes.rms, v.total.bytes.min, v.total.bytes.max,
                 len(v.methods), len(v.codes), len(v.users), len(
                     v.browsers), len(v.ips), len(v.hosts), len(
                         v.domains), len(v.countries), len(v.locations)))

        result = StringIO()
        titles1 = ("#request", "dt avg", "dt rms", "dt min", "dt max",
                   "out avg", "out rms", "out min", "out max")
        titles2 = ("#method", "#code", "#user", "#brwser", "#ip", "#host",
                   "#domain", "#c'ntry", "#loc'n")
        line = "_" * (11 * len(titles1) + 8 * len(titles2) - 2)

        result.write(" " * 11)
        result.writelines(underline(svc, line) for svc in svckeys)
        result.write("\n")
        result.write("%-10s" % "Date")
        result.writelines((" %10s" * len(titles1) + " %7s" * len(titles2)) %
                          (titles1 + titles2) for svc in svckeys)
        result.write("\n")

        # Now add stats per service for each time bin.
        for timebin in timebins:
            result.write("%-10s" % timebin)
            result.writelines(format(stats[timebin][svc]) for svc in svckeys)
            result.write("\n")

        # Write it out.
        open(self.statfile, "w").write(result.getvalue())
Example #25
    def _load(self):
        """Open GeoIP city database.

    Reloads the database from the upstream source if there is no
    locally cached file, or the cached file is too old. Saves the
    cached data in uncompressed format, ready for opening. Opens the
    file for use.
    """
        if not os.path.exists(self.path) \
           or time() - os.stat(self.path)[ST_MTIME] >= 15 * 86400:
            debug("GEOIP", 1, "downloading new geoip database")
            newpath = self.path + ".new"
            newfile = open(newpath, "w")
            dbfile = urllib2.urlopen(self.url).read()
            newfile.write(GzipFile(fileobj=StringIO(dbfile)).read())
            newfile.close()
            if os.path.exists(self.path):
                os.remove(self.path)
            os.rename(newpath, self.path)

        debug("GEOIP", 1, "opening geoip database %s", self.path)
        self.gip = GeoIP.open(self.path, GeoIP.GEOIP_MEMORY_CACHE)
        self.gip.set_charset(GeoIP.GEOIP_CHARSET_UTF8)
Example #26
    def domain(self, hostname):
        """Translate host name to a domain name using the public suffix list.

    @param hostname -- string, the host name to look up

    @return string with the domain portion of `hostname`.
    """
        dir = self.psl
        domain = []

        insuffix = True
        parts = hostname.split(".")[::-1]
        first = True
        while len(parts):
            debug("PSL", 3,
                  "hostname %s domain %s parts %s insuffix %s dir %s",
                  hostname, domain, parts, insuffix, (first and "{(all)}")
                  or dir)
            part = parts.pop(0)
            if insuffix:
                domain.append(part)
            first = False
            if part in dir:
                if isinstance(dir[part], dict):
                    dir = dir[part]
                else:
                    insuffix = not dir[part]
                    dir = {}
            elif "*" in dir:
                insuffix = not dir["*"]
                dir = {}
            else:
                break

        domname = ".".join(domain[::-1])
        debug("PSL", 2, "hostname %s mapped to domain %s", hostname, domname)
        return domname
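
A usage sketch, assuming a lookup object built as in Example #30 and a
populated suffix list; the cache path is illustrative. With "co.uk" in the
suffix list, the walk keeps exactly one label beyond the public suffix:

psl = PublicSuffixLookup(path="/tmp/psl.yml")  # illustrative cache path
print psl.domain("www.example.co.uk")          # -> "example.co.uk"
print psl.domain("cmsweb.cern.ch")             # -> "cern.ch"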
Example #27
 def responder(answer, addr):
     debug("IP2INFO", 3, "routeviews result %s %s %s", addr, info.ip,
           answer)
     self._tick_stats(answer[0])
     task.ongoing = False
     if len(answer[3]) > 0 and len(answer[3][0]) == 3:
         task.done = True
         (asn, cidr, w) = answer[3][0]
         info.cidr = IPNetwork("%s/%s" % (cidr, w))
         if asn in self.asnmap:
             debug("IP2INFO", 3, "routeviews existing asn %s", asn)
             info.asn = self.asnmap[asn]
         else:
             debug("IP2INFO", 2, "routeviews new asn %s, cidr %s", asn,
                   info.cidr)
             info.asn = self.asnmap[asn] = ASInfo(asn=asn)
             tasks.insert(1, IPTask(self._asn_lookup_cc, 2))
Example #28
 def responder(answer, addr):
     debug("IP2INFO", 3, "cymru result %s %s %s", addr, info.ip, answer)
     self._tick_stats(answer[0])
     task.ongoing = False
     if len(answer[3]) > 0 and len(answer[3][-1]) == 1:
         m = RX_ASN.match(answer[3][-1][0])
         if m:
             task.done = True
             if m.group(1) in self.asnmap:
                 debug("IP2INFO", 3, "cymru existing asn %s",
                       m.group(1))
                 info.asn = self.asnmap[m.group(1)]
             else:
                 debug("IP2INFO", 2, "cymru new asn %s, cidr %s, cc %s",
                       m.group(1), m.group(2), m.group(3))
                 tasks.insert(1, IPTask(self._asn_lookup_cc, 2))
                 info.asn = self.asnmap[m.group(1)] = \
                     ASInfo(asn = m.group(1),
                            cc = m.group(3),
                            rir = m.group(4),
                            date = m.group(5))
Example #29
    def _save_yaml(self, timebins, svckeys, stats, statdir):
        debug("LOGREADER", 1, "saving yaml summary")

        def pad(n, s):
            padding = " " * n
            return padding + s.rstrip().replace("\n", "\n" + padding) + "\n"

        def maxcnt(timebin, svc):
            if timebin == "TOTAL" and svc.service == "ALL":
                return 50
            elif timebin == "TOTAL" or svc.service == "ALL":
                return 25
            else:
                return 10

        def slice(vals, nmax):
            items = sorted(vals.items(),
                           reverse=True,
                           key=lambda kv: kv[1].timing.num)
            if nmax < 0 or len(items) <= nmax + 1:
                return items

            result = items[0:nmax]
            tail = items[nmax:]
            k = "%d others" % len(tail)
            v = sum((kv[1] for kv in tail), StatsByteTiming())
            result.append((k, v))
            return result

        def format_quant(q):
            return (
                "{ tot: %.1f, avg: %.1f, rms: %.1f, min: %.1f, max: %.1f }" %
                (q.sum, q.avg, q.rms, q.min, q.max))

        def format_list(vals, nmax):
            return "".join("- %s:\n"
                           "    nr of requests: %d\n"
                           "    response time:  %s\n"
                           "    kilobytes out:  %s\n" %
                           ((k == "-" and "(Unidentified)" or k), v.timing.num,
                            format_quant(v.timing), format_quant(v.bytes))
                           for k, v in slice(vals, nmax))

        def format_svc(s):
            return ("service:        %s\n"
                    "instance:       %s\n"
                    "subclass:       %s\n"
                    "api:            %s\n" % s)

        def format_val(v, nmax):
            return ("nr of requests: %d\n"
                    "response time:  %s\n"
                    "kilobytes out:  %s\n"
                    "http methods:\n%s"
                    "http codes:\n%s"
                    "users:\n%s"
                    "browsers:\n%s"
                    "ip addresses:\n%s"
                    "hostnames:\n%s"
                    "domains:\n%s"
                    "countries:\n%s"
                    "locations:\n%s" %
                    (v.total.timing.num, format_quant(
                        v.total.timing), format_quant(v.total.bytes),
                     pad(2, format_list(
                         v.methods, nmax)), pad(2, format_list(v.codes, nmax)),
                     pad(2, format_list(v.users, nmax)),
                     pad(2, format_list(
                         v.browsers, nmax)), pad(2, format_list(
                             v.ips, nmax)), pad(2, format_list(v.hosts, nmax)),
                     pad(2, format_list(v.domains, nmax)),
                     pad(2, format_list(v.countries, nmax)),
                     pad(2, format_list(v.locations, nmax))))

        # Output overall summary for each time bin.
        result = gzopen("%s-yml/summary.yml.gz" % statdir, "wb", 9)
        result.write("---\n")
        for timebin in timebins:
            result.write("%s:\n" % timebin)
            result.writelines(
                "  - %s:\n%s" %
                (svc.service,
                 pad(6, format_val(stats[timebin][svc], maxcnt(timebin, svc))))
                for svc in svckeys
                if stats[timebin][svc].total.timing.num != 0)
        result.close()

        # Output per-timebin detailed files, including one for "TOTAL".
        for timebin in timebins:
            result = gzopen("%s-yml/%s.yml.gz" % (statdir, timebin), "wb", 9)
            result.write("---\n")
            result.write("%s:\n" % timebin)
            for svc in svckeys:
                if stats[timebin][svc].total.timing.num != 0:
                    result.write("  - %s:\n" "    - total:\n" % svc.service)
                    result.write(
                        pad(
                            8,
                            format_val(stats[timebin][svc],
                                       maxcnt(timebin, svc))))

                    items = sorted(
                        ((s, v) for s, v in stats[timebin].iteritems()
                         if s.service == svc.service and s not in svckeys),
                        key=lambda sv: sv[1].total.timing.sum,
                        reverse=True)
                    result.write("    - services by time:\n")
                    result.writelines(
                        "      - %d:\n%s%s" % (n, pad(10, format_svc(sv[0])),
                                               pad(10, format_val(sv[1], 7)))
                        for n, sv in zip(xrange(len(items)), items))

                    items = sorted(
                        ((s, v) for s, v in stats[timebin].iteritems()
                         if s.service == svc.service and s not in svckeys),
                        key=lambda sv: sv[1].total.bytes.sum,
                        reverse=True)

                    result.write("    - services by output:\n")
                    result.writelines(
                        "      - %d:\n%s%s" % (n, pad(10, format_svc(sv[0])),
                                               pad(10, format_val(sv[1], 7)))
                        for n, sv in zip(xrange(len(items)), items))
            result.close()
Example #30
    def __init__(self,
                 cachedir=None,
                 gip=None,
                 psl=None,
                 res=None,
                 maxtime=30,
                 maxtries=3):
        """Constructor.

    Initialises the lookup object so it is ready for queries.

    @param cachedir -- Default location for caching databases, used if
    `gip` or `psl` have not been specified. If unset and neither `gip`
    nor `psl` arguments are provided, the databases are cached in the
    current directory.

    @param gip -- Reference to GeoIPLookup object. If None, a new
    object is created, using `cachedir` or current directory as the
    location for the city database file.

    @param psl -- Reference to PublicSuffixLookup object. If None, a
    new object is created, using `cachedir` or current directory as
    the location for the YAML cache database.

    @param res -- Reference to adns DNS resolver object. If None, a
    new resolver is created. If you want to use a nameserver other
    than your system default one, pass in custom adns object created
    with the appropriate "nameserver x.y.z.w" resolver argument.

    @param maxtime -- The maximum time to wait for DNS replies. Some
    DNS servers are slow to respond so some queries take a long time
    to complete, or will simply time out. If the client is submitting
    large numbers of addresses for query, the stragglers are handled
    automatically and there is no reason to reduce the query time-out.
    However if the client has just a few addresses to resolve, or is
    in a hurry to get the answer, set `maxtime` to some smaller value.

    @param maxtries -- The maximum number of times to attempt main
    queries per IP address. In general this value should be greater
    than one to avoid failures resulting from dropped DNS packets and
    to catch straggler responses from slow, far away and somewhat
    misconfigured DNS servers. More than three rarely improves the
    accuracy of the results.
    """
        now = time()
        self.maxtime = maxtime
        self.maxtries = maxtries
        geopath = (cachedir and "%s/GeoLiteCity.dat" % cachedir)
        pslpath = (cachedir and "%s/psl.yml" % cachedir)
        self.res = res or adns.init(adns.iflags.noautosys)
        debug("IP2INFO", 2, "dns resolver initialised %.2f", time() - now)
        self.gip = gip or GeoIPLookup(path=geopath)
        debug("IP2INFO", 2, "geoip resolver initialised %.2f", time() - now)
        self.psl = psl or PublicSuffixLookup(path=pslpath)
        debug("IP2INFO", 2, "domain resolver initialised %.2f", time() - now)
        self.ptrfail = set()
        self.ptrmap = {}
        self.asnmap = {}
        self.queries = {}
        self.ipaddrs = {}
        self.resstat = {}
        for cidr in self.UNAVAILABLE.iter_cidrs():
            asn = "@%s" % str(cidr)
            self.asnmap[asn] = ASInfo(asn=asn, cc="--")
        debug("IP2INFO", 1, "initialisation complete %.2f", time() - now)
Example #31
    def _issue(self):
        """Process task queue.

    Walks over all IP addresses and examines their pending task queue.
    Starts new tasks while there are IP addresses with queued tasks we
    can issue. Keeps issuing tasks as long as something can start.

    @return Number of pending tasks left.
    """
        debug("IP2INFO", 3, "issuing tasks for %d addresses",
              len(self.ipaddrs))
        now = time()
        npending = ntasks = 0
        done = False
        while not done:
            npending = 0
            ntasks = 0
            ncompleted = 0
            naborted = 0
            nongoing = 0
            nretried = 0

            for info, tasks in self.ipaddrs.values():
                # If this IP address has no more tasks, skip it.
                if len(tasks) == 0:
                    continue

                # Count this address as pending tasks.
                ntasks += len(tasks)
                npending += 1
                task = tasks[0]

                # If the top task is done, reap it.
                if task.done:
                    tasks.pop(0)
                    ncompleted += 1
                    continue

                # If the top task is still ongoing, don't disturb it.
                if task.ongoing:
                    nongoing += 1
                    continue

                # Maybe start the top task.
                task.ongoing = True

                # If the task has already been attempted enough, give up.
                if task.tries >= task.maxtries:
                    tasks.pop(0)
                    naborted += 1

                # Try to run the task. If it says it's done, reap it.
                elif task(info, task, tasks):
                    tasks.pop(0)
                    ncompleted += 1

                # Otherwise we left the task running, move on.
                else:
                    task.tries += 1
                    nretried += 1

            # Report where we are. Keep going if we have no pending tasks at
            # all, or all of them are already ongoing.
            done = (npending == 0 or npending == nongoing)
            debug(
                "IP2INFO", 3,
                "task status after %.3f s: %d pending, %d tasks, %d completed,"
                " %d aborted, %d ongoing, %d (re)tried, will %srepeat",
                time() - now, npending, ntasks, ncompleted, naborted, nongoing,
                nretried, (done and "not ") or "")

        # Report time spent issuing tasks.
        debug("IP2INFO", 2,
              "done issuing tasks in %.3f s, %d pending, %d tasks",
              time() - now, npending, ntasks)
        return npending
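
The IPTask objects driving this loop are not shown in these examples. A
minimal sketch consistent with how _issue and the responders use them
(constructed as IPTask(func, maxtries), called as task(info, task, tasks),
with done, ongoing, tries and maxtries bookkeeping) might look like this:

class IPTask(object):
    def __init__(self, func, maxtries):
        self.func = func          # bound method such as self._asn_lookup_1
        self.maxtries = maxtries  # abort after this many attempts
        self.tries = 0            # attempts made so far
        self.done = False         # set by the responder on success
        self.ongoing = False      # a query for this task is in flight

    def __call__(self, info, task, tasks):
        # Delegate; returns True when the task completed synchronously.
        return self.func(info, task, tasks)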
Example #32
  def _save_json(self, timebins, svckeys, stats, statdir):
    debug("LOGREADER", 1, "saving json summary")
    strings = {}

    def maxcnt(timebin, svc):
      if timebin == "TOTAL" and svc.service == "ALL":
        return 50
      elif timebin == "TOTAL" or svc.service == "ALL":
        return 25
      else:
        return 10

    def mkstr(s):
      if s in strings:
        return strings[s]
      else:
        id = len(strings)
        strings[s] = id
        return id

    def slice(vals, nmax):
      items = sorted(vals.items(), reverse=True, key=lambda kv: kv[1].timing.num)
      if nmax < 0 or len(items) <= nmax+1:
        return items

      result = items[0:nmax]
      tail = items[nmax:]
      k = "%d others" % len(tail)
      v = sum((kv[1] for kv in tail), StatsByteTiming())
      result.append((k, v))
      return result

    def format_quant(q):
      return ('[%.3f,%.3f,%.3f,%.3f,%.3f]'
              % (q.num, q.sum, q.sum2, q.min, q.max))

    def format_list(vals, nmax, pad):
      return (",\n" + pad).join('[%d,%d,%s,%s]' %
                                (mkstr(k == "-" and "(Unidentified)" or k),
			         v.timing.num,
			         format_quant(v.timing),
			         format_quant(v.bytes))
		                for k, v in slice(vals, nmax))

    def format_svc(s):
      return '%d,%d,%d,%d' % (mkstr(s.service), mkstr(s.instance),
                              mkstr(s.subclass), mkstr(s.api))

    def format_columns():
      return ('"nreq","total","hmethod","hcode","user","browser"'
	      ',"ip","host","domain","country","location"')

    def format_val(v, nmax, pad):
      lpad = pad + " "
      return (('%d,[%s,%s]' + (',\n' + pad + '[%s]') * 9) %
              (v.total.timing.num,
               format_quant(v.total.timing),
               format_quant(v.total.bytes),
               format_list(v.methods, nmax, lpad),
               format_list(v.codes, nmax, lpad),
               format_list(v.users, nmax, lpad),
               format_list(v.browsers, nmax, lpad),
               format_list(v.ips, nmax, lpad),
               format_list(v.hosts, nmax, lpad),
               format_list(v.domains, nmax, lpad),
               format_list(v.countries, nmax, lpad),
               format_list(v.locations, nmax, lpad)))

    # Output overall summary for each time bin.
    strings = {}
    result = open("%s-json/summary.txt" % statdir, "wb")
    result.write('{"data":[\n')
    for timebin in timebins:
      result.write('{"bin":"%s","items":[\n' % timebin)
      result.write(',\n'.join('[%d,%s]' %
                              (mkstr(svc.service),
                               format_val(stats[timebin][svc],
                                          maxcnt(timebin, svc), " "))
                              for svc in svckeys
                              if stats[timebin][svc].total.timing.num != 0))
      result.write(']}')
      if timebin != timebins[-1]:
        result.write(",")
      result.write('\n')
    result.write('],"columns":["service",%s],"strings":[\n' % format_columns())
    result.write(',\n'.join
		 ('"%s"' % s for id, s in
	          sorted(zip(strings.values(), strings.keys()))))
    result.write("]}\n")
    result.close()

    # Output per-timebin detailed files, including one for "TOTAL".
    for timebin in timebins:
      strings = {}
      result = open("%s-json/%s.txt" % (statdir, timebin), "wb")
      result.write('{"data":[\n')
      result.write('{"bin":"%s","items":[\n' % timebin)
      skeys = [s for s in svckeys if stats[timebin][s].total.timing.num != 0]
      for svc in skeys:
        result.write('[%d,%s,[\n' %
                     (mkstr(svc.service),
                      format_val(stats[timebin][svc],
                                 maxcnt(timebin, svc), " ")))

        items = sorted(((s, v) for s, v in stats[timebin].iteritems()
                        if s.service == svc.service and s not in svckeys),
                       key = lambda sv: sv[1].total.timing.num,
                       reverse = True)

        result.write(',\n'.join("  [%s,%s]" %
                                (format_svc(s),
                                 format_val(v, maxcnt(timebin, s), "   "))
                                for s, v in items))
        result.write(" ]]")
        if svc != skeys[-1]:
          result.write(",")
        result.write("\n")
      result.write(']}],"columns":["service",%s,"detail"],"strings":[\n' % format_columns())
      result.write(',\n'.join
                   ('"%s"' % s for id, s in
	            sorted(zip(strings.values(), strings.keys()))))
      result.write("]}\n")
      result.close()
Example #33
  def _save_yaml(self, timebins, svckeys, stats, statdir):
    debug("LOGREADER", 1, "saving yaml summary")

    def pad(n, s):
      padding = " " * n
      return padding + s.rstrip().replace("\n", "\n" + padding) + "\n"

    def maxcnt(timebin, svc):
      if timebin == "TOTAL" and svc.service == "ALL":
        return 50
      elif timebin == "TOTAL" or svc.service == "ALL":
        return 25
      else:
        return 10

    def slice(vals, nmax):
      items = sorted(vals.items(), reverse=True, key=lambda kv: kv[1].timing.num)
      if nmax < 0 or len(items) <= nmax+1:
        return items

      result = items[0:nmax]
      tail = items[nmax:]
      k = "%d others" % len(tail)
      v = sum((kv[1] for kv in tail), StatsByteTiming())
      result.append((k, v))
      return result

    def format_quant(q):
      return ("{ tot: %.1f, avg: %.1f, rms: %.1f, min: %.1f, max: %.1f }"
              % (q.sum, q.avg, q.rms, q.min, q.max))

    def format_list(vals, nmax):
      return "".join("- %s:\n"
                     "    nr of requests: %d\n"
                     "    response time:  %s\n"
                     "    kilobytes out:  %s\n" %
                     ((k == "-" and "(Unidentified)" or k),
                      v.timing.num,
                      format_quant(v.timing),
                      format_quant(v.bytes))
                     for k, v in slice(vals, nmax))

    def format_svc(s):
      return ("service:        %s\n"
              "instance:       %s\n"
              "subclass:       %s\n"
              "api:            %s\n"
              % s)

    def format_val(v, nmax):
      return ("nr of requests: %d\n"
              "response time:  %s\n"
              "kilobytes out:  %s\n"
              "http methods:\n%s"
              "http codes:\n%s"
              "users:\n%s"
              "browsers:\n%s"
              "ip addresses:\n%s"
              "hostnames:\n%s"
              "domains:\n%s"
              "countries:\n%s"
              "locations:\n%s" %
              (v.total.timing.num,
               format_quant(v.total.timing),
               format_quant(v.total.bytes),
               pad(2, format_list(v.methods, nmax)),
               pad(2, format_list(v.codes, nmax)),
               pad(2, format_list(v.users, nmax)),
               pad(2, format_list(v.browsers, nmax)),
               pad(2, format_list(v.ips, nmax)),
               pad(2, format_list(v.hosts, nmax)),
               pad(2, format_list(v.domains, nmax)),
               pad(2, format_list(v.countries, nmax)),
               pad(2, format_list(v.locations, nmax))))

    # Output overall summary for each time bin.
    result = gzopen("%s-yml/summary.yml.gz" % statdir, "wb", 9)
    result.write("---\n")
    for timebin in timebins:
      result.write("%s:\n" % timebin)
      result.writelines("  - %s:\n%s" %
                        (svc.service,
                         pad(6, format_val(stats[timebin][svc],
                                           maxcnt(timebin, svc))))
                        for svc in svckeys
                        if stats[timebin][svc].total.timing.num != 0)
    result.close()

    # Output per-timebin detailed files, including one for "TOTAL".
    for timebin in timebins:
      result = gzopen("%s-yml/%s.yml.gz" % (statdir, timebin), "wb", 9)
      result.write("---\n")
      result.write("%s:\n" % timebin)
      for svc in svckeys:
        if stats[timebin][svc].total.timing.num != 0:
          result.write("  - %s:\n"
                       "    - total:\n" %
                       svc.service)
          result.write(pad(8, format_val(stats[timebin][svc],
                                         maxcnt(timebin, svc))))

          items = sorted(((s, v) for s, v in stats[timebin].iteritems()
                          if s.service == svc.service and s not in svckeys),
                         key = lambda sv: sv[1].total.timing.sum,
                         reverse = True)
          result.write("    - services by time:\n")
          result.writelines("      - %d:\n%s%s" %
                            (n,
                             pad(10, format_svc(sv[0])),
                             pad(10, format_val(sv[1], 7)))
                            for n, sv in zip(xrange(len(items)), items))

          items = sorted(((s, v) for s, v in stats[timebin].iteritems()
                          if s.service == svc.service and s not in svckeys),
                         key = lambda sv: sv[1].total.bytes.sum,
                         reverse = True)

          result.write("    - services by output:\n")
          result.writelines("      - %d:\n%s%s" %
                            (n,
                             pad(10, format_svc(sv[0])),
                             pad(10, format_val(sv[1], 7)))
                            for n, sv in zip(xrange(len(items)), items))
      result.close()
Example #34
    def _save_json(self, timebins, svckeys, stats, statdir):
        debug("LOGREADER", 1, "saving json summary")
        strings = {}

        def maxcnt(timebin, svc):
            if timebin == "TOTAL" and svc.service == "ALL":
                return 50
            elif timebin == "TOTAL" or svc.service == "ALL":
                return 25
            else:
                return 10

        def mkstr(s):
            if s in strings:
                return strings[s]
            else:
                id = len(strings)
                strings[s] = id
                return id

        def slice(vals, nmax):
            items = sorted(vals.items(),
                           reverse=True,
                           key=lambda kv: kv[1].timing.num)
            if nmax < 0 or len(items) <= nmax + 1:
                return items

            result = items[0:nmax]
            tail = items[nmax:]
            k = "%d others" % len(tail)
            v = sum((kv[1] for kv in tail), StatsByteTiming())
            result.append((k, v))
            return result

        def format_quant(q):
            return ('[%.3f,%.3f,%.3f,%.3f,%.3f]' %
                    (q.num, q.sum, q.sum2, q.min, q.max))

        def format_list(vals, nmax, pad):
            return (",\n" + pad).join(
                '[%d,%d,%s,%s]' %
                (mkstr(k == "-" and "(Unidentified)" or k), v.timing.num,
                 format_quant(v.timing), format_quant(v.bytes))
                for k, v in slice(vals, nmax))

        def format_svc(s):
            return '%d,%d,%d,%d' % (mkstr(s.service), mkstr(
                s.instance), mkstr(s.subclass), mkstr(s.api))

        def format_columns():
            return ('"nreq","total","hmethod","hcode","user","browser"'
                    ',"ip","host","domain","country","location"')

        def format_val(v, nmax, pad):
            lpad = pad + " "
            return (('%d,[%s,%s]' + (',\n' + pad + '[%s]') * 9) % (
                v.total.timing.num, format_quant(v.total.timing),
                format_quant(v.total.bytes), format_list(
                    v.methods, nmax, lpad), format_list(v.codes, nmax, lpad),
                format_list(v.users, nmax,
                            lpad), format_list(v.browsers, nmax, lpad),
                format_list(v.ips, nmax, lpad), format_list(
                    v.hosts, nmax, lpad), format_list(v.domains, nmax, lpad),
                format_list(v.countries, nmax,
                            lpad), format_list(v.locations, nmax, lpad)))

        # Output overall summary for each time bin.
        strings = {}
        result = open("%s-json/summary.txt" % statdir, "wb")
        result.write('{"data":[\n')
        for timebin in timebins:
            result.write('{"bin":"%s","items":[\n' % timebin)
            result.write(',\n'.join(
                '[%d,%s]' %
                (mkstr(svc.service),
                 format_val(stats[timebin][svc], maxcnt(timebin, svc), " "))
                for svc in svckeys
                if stats[timebin][svc].total.timing.num != 0))
            result.write(']}')
            if timebin != timebins[-1]:
                result.write(",")
            result.write('\n')
        result.write('],"columns":["service",%s],"strings":[\n' %
                     format_columns())
        result.write(',\n'.join(
            '"%s"' % s
            for id, s in sorted(zip(strings.values(), strings.keys()))))
        result.write("]}\n")
        result.close()

        # Output per-timebin detailed files, including one for "TOTAL".
        for timebin in timebins:
            strings = {}
            result = open("%s-json/%s.txt" % (statdir, timebin), "wb")
            result.write('{"data":[\n')
            result.write('{"bin":"%s","items":[\n' % timebin)
            skeys = [s for s in svckeys
                     if stats[timebin][s].total.timing.num != 0]
            for svc in skeys:
                result.write('[%d,%s,[\n' %
                             (mkstr(svc.service),
                              format_val(stats[timebin][svc],
                                         maxcnt(timebin, svc), " ")))

                items = sorted(
                    ((s, v) for s, v in stats[timebin].iteritems()
                     if s.service == svc.service and s not in svckeys),
                    key=lambda sv: sv[1].total.timing.num,
                    reverse=True)

                result.write(',\n'.join(
                    "  [%s,%s]" %
                    (format_svc(s), format_val(v, maxcnt(timebin, s), "   "))
                    for s, v in items))
                result.write(" ]]")
                if svc != skeys[-1]:
                    result.write(",")
                result.write("\n")
            result.write(
                ']}],"columns":["service",%s,"detail"],"strings":[\n' %
                format_columns())
            result.write(',\n'.join(
                '"%s"' % s
                for id, s in sorted(zip(strings.values(), strings.keys()))))
            result.write("]}\n")
            result.close()
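
For reference, both the summary and the per-timebin files written above are
JSON. Pieced together from the `write` calls, their rough shape is as follows
(the bin names, ids and counts here are illustrative only; the per-timebin
files additionally carry a nested "detail" array per service):

    {"data":[
    {"bin":"TOTAL","items":[
    [0,1234,[...timing...],[...bytes...],
     [...methods...], ...]
    ]}
    ],"columns":["service","nreq","total", ...],"strings":[
    "ALL", ...]}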
Example #35
0
def cmsweb_parser(rows, qresolver, start_time, end_time, time_format):
    """Classify log entries to higher-level info

  Processes log entries from `rows`. Rows whose time falls outside the
  range from `start_time` to `end_time` are discarded. The rest are
  accumulated into the result: a dictionary mapping timebin to a
  dictionary of `StatsKey` to `StatsData`.

  URIs are mapped to `StatsKey` using `URI2SVC`. The user agent
  strings are mapped to more comprehensible ones using `BROWSER2NAME`.
  The log row's time is mapped to timebin using `time_format`.

  @param rows -- iterable yielding `RXLOG` match groups.
  @param qresolver -- queue for IP address resolution requests.
  @param start_time -- beginning of reporting period.
  @param end_time -- end of reporting period.
  @param time_format -- `strftime` string format for the reporting time bins.

  @return Dictionary of results, as described above.
  """
    # Parse the files, accumulating stats results.
    global _browsermap, _seenip  # share across calls within one process
    newip = set()
    stats = {}

    # Read log entries.
    for row in rows:
        (date, tz, ip, uri, code, bytes, usecs, user, browser) = row

        # Request new IPs to be resolved on background.
        if ip not in _seenip:
            _seenip.add(ip)
            newip.add(ip)
            if len(newip) >= 50:
                qresolver.put(newip)
                newip = set()

        # Locate HTTP method. It should be the first word on the line.
        # Some attacks feed complete junk, so auto-classify as an attack
        # if it doesn't look like a valid HTTP request.
        key = None
        uribeg = 0
        uriend = len(uri)
        uripart = uri.find(" ")
        method = uri[0:uripart]
        if not method.isupper():
            key = StatsKey("attacks", "N/A", "N/A", uri)
        else:
            uribeg = uripart + 1

        # Remove trailing " HTTP/n.m" from the URI. Some attacks make
        # malformed requests without the required HTTP/n.m part. Making
        # RXLOG match it optionally significantly slows down log pattern
        # matching, so handle it manually here.
        if uriend > 9 and uri.find(" HTTP/", uriend - 9, uriend - 3) >= 0:
            uriend -= 9
        else:
            key = StatsKey("attacks", "N/A", "N/A", uri)

        # Skip all but first leading slash, web server has similar behaviour.
        while uribeg + 1 < uriend and uri[uribeg] == "/" and uri[uribeg + 1] == "/":
            uribeg += 1

        # Resolve log time.
        #
        # The strptime() version is twice as slow, so do it by hand.
        # strptime() also doesn't do +nnnn timezones, so we have to do
        # that part ourselves. Verify the UTC time is in the window we
        # want to report on.
        #
        # t = timegm(strptime(date, "%d/%b/%Y:%H:%M:%S")) \
        #     - (int(tz[0:3]) * 3600 + int(tz[3:5]) * 60)
        t = timegm((int(date[7:11]), MONTH[date[3:6]], int(date[0:2]),
                    int(date[12:14]), int(date[15:17]), int(date[18:20]),
                    0, 0, -1)) - (int(tz[0:3]) * 3600 + int(tz[3:5]) * 60)

        # Check the entry is in selected time window. We get sent logs
        # for an extended period before and after the actual period to
        # handle moderately out of order logs.
        if t < start_time or t >= end_time:
            continue

        # Resolve user agent into a more useful browser name/version.
        if browser not in _browsermap:
            newname = browser
            for rx, subst in BROWSER2NAME:
                m = rx.search(browser)
                if m:
                    newname = m.expand(subst)
                    break
            _browsermap[browser] = newname
            debug("CMSWEB", 2, "new browser %s from %s", newname, browser)
        browser = _browsermap[browser]

        # Map URI to service, instance, subclass and API call, provided
        # we didn't already classify this as a bad request attack.
        if key is None:
            for rx, nsvc, ninst, nclass, napi in URI2SVC:
                m = rx.match(uri, uribeg, uriend)
                if m:
                    key = StatsKey(
                        service=expand(m, nsvc, "Other"),
                        instance=expand(m, ninst, "N/A"),
                        subclass=expand(m, nclass, "N/A"),
                        api=expand(m, napi),
                    )
                    break

        if key is None:
            key = StatsKey(service="other", instance="N/A", subclass="N/A",
                           api=re.sub(r"\?.*", "", uri[uribeg:uriend]))

        # Create time bin if necessary.
        timebin = strftime(time_format, gmtime(t))
        if timebin not in stats:
            stats[timebin] = {}

        if key not in stats[timebin]:
            stats[timebin][key] = StatsData()

        # If the reported time is negative, flip it to positive. This is
        # pretty much completely bogus, but works better than ignoring the
        # entry entirely - these happen relatively rarely on virtual machines
        # when time keeping breaks down, for example during IRQ storms.
        usecs = float(usecs) * 1e-3  # microseconds to milliseconds
        if usecs < 0:
            usecs = -usecs

        # Convert proxies to real users.
        user = user.replace("/CN=proxy", "")

        # Tick the stats.
        stats[timebin][key].tick(float(bytes) / 1024, usecs, method, code,
                                 user, browser, ip, None, None, None, None)

    # Submit remaining IP addresses for query.
    if newip:
        qresolver.put(newip)

    # Return the stats.
    return stats
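
A minimal sketch of how this parser might be driven, assuming `RXLOG` is the
compiled access-log pattern whose match groups correspond to the tuple
unpacked above, and that a separate resolver thread consumes `qresolver`;
the wiring below is illustrative, not part of the original code:

    # Hypothetical driver: RXLOG and the resolver thread come from the
    # surrounding examples and are assumed here rather than defined.
    from Queue import Queue

    qresolver = Queue()
    start_time, end_time = 0, 2 ** 31  # report on everything, for illustration
    with open("access_log") as logfile:
        matches = (RXLOG.match(line) for line in logfile)
        rows = (m.groups() for m in matches if m)
        stats = cmsweb_parser(rows, qresolver, start_time, end_time,
                              time_format="%Y-%m-%d")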
Example #36
0
def responder(hostname):
    # Completion callback for a reverse lookup; `info` and `task` are
    # free variables from the enclosing scope in the original code.
    debug("IP2INFO", 2, "hostname %s %s", info.ip, hostname)
    task.ongoing = False
    if hostname is not None:
        info.hostname = hostname
        task.done = True