Example #1
    def _images(self, task, page, match, images):
        debug(self._ID, 2, "%s: scanning images in %s", task.key, task.url)
        img = 0
        for imgtag in re.findall(self._rximg, page.replace("\n", " ")):
            while True:
                m = self._rxattr.match(imgtag)
                if not m:
                    break

                arg = m.group(2)
                if len(arg) >= 2 and arg[0] == '"' and arg[-1] == '"':
                    arg = arg[1:-1]
                elif len(arg) >= 2 and arg[0] == "'" and arg[-1] == "'":
                    arg = arg[1:-1]

                if m.group(1) == "src" and re.search(match, arg):
                    arg = arg.replace("&amp;", "&").replace(" ", "%20")
                    url = urlparse.urljoin(task.url, arg)
                    key = task.key[:-1] + (images[img], )
                    debug(self._ID, 2,
                          "%s: retrieving image #%d %s from %s (%s)", key,
                          img + 1, images[img], url, arg)
                    self._reqman.put(
                        Task(url, key, task.period, "image/*", self._pngdata,
                             None))
                    img += 1
                imgtag = imgtag[m.end():]

        if img != len(images):
            cherrypy.log("SCRAPER WARNING %s found %d of %d images" %
                         (task.url, img, len(images)))

        return "x"
Example #2
  def _images(self, task, page, match, images):
    debug(self._ID, 2, "%s: scanning images in %s", task.key, task.url)
    img = 0
    for imgtag in re.findall(self._rximg, page.replace("\n", " ")):
      while True:
        m = self._rxattr.match(imgtag)
        if not m:
          break

        arg = m.group(2)
        if len(arg) >= 2 and arg[0] == '"' and arg[-1] == '"':
          arg = arg[1:-1]
        elif len(arg) >= 2 and arg[0] == "'" and arg[-1] == "'":
          arg = arg[1:-1]

        if m.group(1) == "src" and re.search(match, arg):
          arg = arg.replace("&amp;", "&").replace(" ", "%20")
          url = urlparse.urljoin(task.url, arg)
          key = task.key[:-1] + (images[img],)
          debug(self._ID, 2, "%s: retrieving image #%d %s from %s (%s)",
                key, img+1, images[img], url, arg)
          self._reqman.put(Task(url, key, task.period, "image/*",
                                self._pngdata, None))
          img += 1
        imgtag = imgtag[m.end():]

    if img != len(images):
      cherrypy.log("SCRAPER WARNING %s found %d of %d images" %
                   (task.url, img, len(images)))

    return "x"
Example #3
  def _asn_lookup_cc(self, info, task, tasks):
    """Perform final step of AS lookups, verifying country code for the
    autonomous system from cymru.com database using DNS lookups. This
    is more accurate than the code returned by initial AS lookups."""
    debug("IP2INFO", 2, "asn lookup/cc %s %s", info.ip, info.asn.asn)

    # Double check ASN lookup was really successful.
    if not info.asn.asn or not info.asn.asn.isdigit():
      return True

    # Define responder to country code lookup from cymru.com. Expects
    # 1-tuple answer matching RX_ASN_CC. Parse the one reply received.
    def responder(answer, addr):
      debug("IP2INFO", 3, "cc result %s from %s: %s", addr, info.asn.asn, answer)
      self._tick_stats(answer[0])
      task.ongoing = False
      if len(answer[3]) > 0 and len(answer[3][0]) == 1:
        m = RX_ASN_CC.match(answer[3][0][0])
        if m and m.group(1) == info.asn.asn:
          debug("IP2INFO", 2, "cc assigning %s = %s", m.group(1), m.group(2))
          info.asn.cc = m.group(2)
          info.asn.rir = m.group(3)
          info.asn.date = m.group(4)
          info.asn.org = m.group(5)
          info.asn.desc = m.group(7)
          task.done = True

    debug("IP2INFO", 3, "submitting asn lookup %s", info.asn.asn)
    self._submit("as%s.asn.cymru.com" % info.asn.asn, rr.TXT, responder)
    return False
Example #4
  def _wild_lookup(self, info, task, tasks):
    """For addresses we have failed to reverse lookup, and failed to
    reverse lookup CIDR base address, try other addresses in the same
    CIDR block. If the CIDR is narrower than /24, scan it entirely,
    and otherwise scan the nearest /24 segment. Remember which ever
    name we first come up with."""
    debug("IP2INFO", 2, "wild lookup %s %s", info.ip, info.wildhost)

    if info.domain or info.hostname or info.cidrhost or info.wildhost:
      return True

    # FIXME: Handle IPv6 here.
    cidrw = (info.cidr.prefixlen >= 24 and info.cidr.prefixlen) or 25
    addrs = [xip for xip in IPNetwork("%s/%d" % (info.ip, cidrw))]

    # Define responder to handle results for nearby address scan.
    # Remember only the first result we receive.
    def responder(hostname, ip):
      debug("IP2INFO", 2, "wild result %s -> %s -> %s %d",
            info.ip, ip, hostname, len(addrs))
      addrs.remove(ip)
      task.ongoing = (len(addrs) > 0)
      if hostname != None:
        task.done = True
        task.ongoing = False
        if not info.wildhost:
          info.wildhost = hostname
          debug("IP2INFO", 2, "wild hostname found %s: %s",
                info.ip, info.wildhost)

    for xip in addrs[:]:
      debug("IP2INFO", 3, "wild hostname lookup %s", xip)
      self._submit_ptr(responder, xip, rr.PTRraw)

    return False
Example #5
    def scrape(self,
               section,
               url,
               images=None,
               match=None,
               period=900,
               urledit=None):
        """
    Register an HTML page to scrape for images matching a pattern.
    Images will be available for retrieval via `image()` using the
    names listed in `images`. If `images` and `match` are None, then
    scrapes images directly, otherwise scrapes images off an HTML
    page.

    :arg str section: identifier for images from this address
    :arg str url: html address where to retrieve page or image
    :arg callable urledit: dynamically modify url before request
    :arg list(str) images: list of image names
    :arg re match: regular expression to match image(s)
    :arg int period: interval in seconds between checks
    """
        debug(self._ID, 1, "%s: scrape %s, images %s match %s period %d",
              section, url, images, match, period)
        if match:
            ContentScraper.scrape(self, section, { "page": (url, urledit) },
                                  period = period, content_type = "text/html",
                                  convert = lambda t,c,v: \
                                    self._images(t, v, match, images))
        else:
            ContentScraper.scrape(self,
                                  section[:-1], {section[-1]: (url, urledit)},
                                  period=period,
                                  content_type="image/*",
                                  convert=self._pngdata)
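A hypothetical registration against the method above; the page address, image names, and match pattern are placeholders:

def register_status_plots(scraper):
    # `scraper` is assumed to be an instance of the class defining
    # scrape() above. Two named plots are scraped off an HTML page
    # every five minutes; their keys become ("plots", "rate") and
    # ("plots", "errors") for later retrieval via image().
    scraper.scrape(("plots",), "http://example.org/status.html",
                   images=["rate", "errors"],
                   match=r"plot.*\.png",
                   period=300)
    # Without `images` and `match`, the URL itself must be an image
    # and is fetched directly.
    scraper.scrape(("plots", "logo"), "http://example.org/logo.png")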
Example #6
  def reload(self):
    """Read the public suffix list.

    Reloads the database from upstream if there is no locally cached
    file, or the cached file is too old. Saves the cached data in YAML
    format, with punycoded names. If the cache file exists and is
    valid, it is loaded as is.

    Please note that py2-yaml, used for reading and writing YAML,
    should be built with libyaml for this to perform optimally.
    """
    if not os.path.exists(self.path) \
       or time() - os.stat(self.path)[ST_MTIME] >= 15 * 86400:
      debug("PSL", 1, "downloading fresh psl database")
      self._parse(urllib2.urlopen(self.url))
      newpath = self.path + ".new"
      newfile = open(newpath, "w")
      newfile.write(yaml.dump(self.psl, Dumper = YAMLDumper))
      newfile.close()
      if os.path.exists(self.path):
        os.remove(self.path)
      os.rename(newpath, self.path)
    else:
      debug("PSL", 1, "reading psl database %s", self.path)
      self.psl = yaml.load(open(self.path).read(), YAMLLoader)
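The freshness test and the write-then-rename pattern used by `reload()` generalise; a small stand-alone sketch of both, keeping the 15-day threshold from the code above:

import os
import time

MAX_AGE = 15 * 86400  # refresh anything older than 15 days

def is_stale(path, max_age=MAX_AGE):
    # Same test reload() applies: the file is missing or too old.
    return (not os.path.exists(path)
            or time.time() - os.stat(path).st_mtime >= max_age)

def replace_file(path, data):
    # Same update pattern: write a side file, then rename it over the
    # old one so readers never see a half-written database.
    newpath = path + ".new"
    with open(newpath, "w") as newfile:
        newfile.write(data)
    if os.path.exists(path):
        os.remove(path)  # needed where rename() won't overwrite
    os.rename(newpath, path)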
Example #7
    def scrape(self,
               section,
               urls,
               content_type="application/json",
               period=900,
               convert=None):
        """
    Register URLs for scraping content. Usually the content is JSON but it can
    be something else too, like HTML. All the URLs will be fetched, converted
    using `convert` and stored.

    :arg str section: label for this item
    :arg int period: interval in seconds between checks
    :arg str content_type: expected content type in response
    :arg callable convert: response conversion, e.g. cjson.decode
    :arg dict urls: (title, url) or (title, (url, urledit)) of data to retrieve
    """
        debug(self._ID, 1, "%s: scrape %s, period %d, content type %s",
              section, urls, period, content_type)
        with self._cv:
            if isinstance(section, basestring): section = (section, )
            map(lambda title: self._put(section + (title, ), 0, None),
                urls.keys())
            self._scrape.append({
                "section": section,
                "period": period,
                "content_type": content_type,
                "urls": urls,
                "convert": convert
            })
            self._cv.notifyAll()
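A hypothetical registration against this method, decoding two JSON endpoints with the standard-library `json` module in place of `cjson.decode`; note that `convert` receives the task and curl handle alongside the body, as `_reqdone()` further below shows:

import json

def register_feeds(scraper):
    # `scraper` is assumed to be an instance of the class defining
    # scrape() above; the URLs are placeholders. The "sites" entry
    # shows the (url, urledit) form, rewriting the URL per request.
    scraper.scrape("services",
                   {"jobs": "http://example.org/jobs.json",
                    "sites": ("http://example.org/sites.json",
                              lambda url: url + "?verbose=1")},
                   period=600,
                   convert=lambda task, c, body: json.loads(body))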
Example #8
  def _submit_ptr(self, callback, ip, type = rr.PTR):
    """Submit a DNS reverse lookup query, but avoid making duplicates.

    If there is already a query ongoing for the given reverse IP address,
    no new query is submitted but the address is cached in `self.ptrmap`
    so the callback knows to consider this new association. The purpose
    is to help the caller make multiple reverse IP address queries for a
    given "destination" address, and avoid any excess ones when making
    network address scans for multiple "origin" addresses.

    Addresses which fail reverse lookup with permanent error such as
    NXDOMAIN are remembered. Future queries on those addresses are
    short-circuited and immediately invoke the `callback` without
    issuing a new DNS query.

    @param callback -- function to call back if the query is answered
    @param ip -- the reversed IP address to look up
    @param type -- request record type (adns.rr.*).
    """
    # If known to fail, skip.
    if ip in self.ptrfail:
      callback(None, ip)
      return

    # Add to pending list of ptr lookups
    if ip not in self.ptrmap:
      self.ptrmap[ip] = []
    self.ptrmap[ip].append(callback)

    # Create DNS query if this is first pending lookup for this address.
    if len(self.ptrmap[ip]) == 1:
      debug("IP2INFO", 3, "submitting ptr lookup of %s", ip)
      self._submit(ip.reverse_dns, type, self._ptr_result, ip)
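The de-duplication scheme is independent of DNS; a minimal generic sketch of the same pattern, with hypothetical names:

pending = {}    # key -> callbacks awaiting one shared query (ptrmap)
failed = set()  # keys known to fail permanently (ptrfail)

def submit(key, callback, start_query):
    # Short-circuit known failures, exactly as above.
    if key in failed:
        callback(None, key)
        return
    # Queue the callback; only the first waiter issues a real query.
    pending.setdefault(key, []).append(callback)
    if len(pending[key]) == 1:
        start_query(key)

def deliver(key, result):
    # Counterpart of _ptr_result(): fan one answer out to all waiters.
    for callback in pending.pop(key):
        callback(result, key)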
Example #9
  def domain(self, hostname):
    """Translate host name to a domain name using the public suffix list.

    @param hostname -- string, the host name to look up

    @return string with the domain portion of `hostname`.
    """
    dir = self.psl
    domain = []

    insuffix = True
    parts = hostname.split(".")[::-1]
    first = True
    while len(parts):
      debug("PSL", 3, "hostname %s domain %s parts %s insuffix %s dir %s",
            hostname, domain, parts, insuffix, (first and "{(all)}") or dir)
      part = parts.pop(0)
      if insuffix:
        domain.append(part)
      first = False
      if part in dir:
        if isinstance(dir[part], dict):
          dir = dir[part]
        else:
          insuffix = not dir[part]
          dir = {}
      elif "*" in dir:
        insuffix = not dir["*"]
        dir = {}
      else:
        break

    domname = ".".join(domain[::-1])
    debug("PSL", 2, "hostname %s mapped to domain %s", hostname, domname)
    return domname
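The walk above reads `self.psl` as a nested dict keyed by reversed name labels, with boolean leaves marking wildcard and exception rules. A stand-alone rendition over a purely dict-shaped table, which is enough for ordinary suffixes; the table contents are hypothetical:

def psl_domain(psl, hostname):
    # Descend the suffix tree from the TLD side, collecting labels
    # while still inside the public-suffix portion; the first label
    # past the suffix completes the registrable domain.
    node, domain, insuffix = psl, [], True
    for part in hostname.split(".")[::-1]:
        if insuffix:
            domain.append(part)
        if part in node:
            if isinstance(node[part], dict):
                node = node[part]
            else:
                insuffix = not node[part]
                node = {}
        elif "*" in node:
            insuffix = not node["*"]
            node = {}
        else:
            break
    return ".".join(domain[::-1])

print(psl_domain({"com": {}, "uk": {"co": {}}}, "www.example.co.uk"))
# -> example.co.uk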
Example #10
  def process(self, waittime = 1):
    """Process DNS queries and callbacks for up to `waittime` seconds."""

    # Wait for any queries to complete. Issue any new tasks created.
    now = time()
    npending = self._issue()
    num = len(self.queries)
    until = now + waittime
    prevtime = now
    while len(self.queries):
      ndone = 0
      for q in self.res.completed(.25):
        (created, callback, args) = self.queries[q]
        del self.queries[q]
        callback(q.check(), *args)
        ndone += 1

      if ndone > 0:
        npending = self._issue()

      # See if we should quit. Throttle back if 'completed()' returned
      # quickly and we are busy looping.
      xnow = time()
      if xnow > until:
        break
      if xnow - prevtime < 0.1 and (npending or len(self.queries)):
        sleep(min(0.5, until - xnow))
      prevtime = xnow

    # Expire whatever was running too long, and report timing.
    self._timeout()
    debug("IP2INFO", 2, "processed %d dns queries in %.2f s,"
          " %d remain, %d pending",
          num, time() - now, len(self.queries), npending)
    return npending
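A hypothetical driver for the loop above: pump until nothing remains in flight or an overall deadline passes. `resolver` is assumed to be an instance of the surrounding class:

from time import time

def drain(resolver, deadline=30):
    # Call process() in one-second slices until the resolver reports
    # no pending work and no outstanding queries, or time runs out.
    until = time() + deadline
    while time() < until:
        npending = resolver.process(waittime=1)
        if not npending and not resolver.queries:
            break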
Example #11
  def _cname_lookup(self, info, task, tasks):
    """Issue canonical name lookup for a host name. For the occasional
    poorly configured sites with CNAME linked to another CNAME, issues
    a few levels of recursive requests to get to a final host name."""
    debug("IP2INFO", 2, "cname lookup %s %s", info.hostname, info.cnames)

    # Report ready if we already have a result.
    if info.cnames:
      return True

    # Do DNS CNAME lookup.
    def responder(answer, addr):
      debug("IP2INFO", 2, "cname result %s from %s: %s",
            addr, info.hostname, answer)
      self._tick_stats(answer[0])
      task.done = True
      task.ongoing = False
      if addr not in info.cnames:
        info.cnames[addr] = []
      for cname in answer[3]:
        info.all_names.update((cname,))
        cnames = info.cnames[addr]
        cnames.append(cname)
        if len(cnames) < 5:
          self._submit(cname, rr.CNAME, responder)
          task.ongoing = True
          task.done = False

    self._submit(info.hostname, rr.CNAME, responder)
    return False
Example #12
  def _addr_lookup(self, info, task, tasks):
    """Issue forward name lookup for a host name. Issues A requests for
    the original host name and all CNAMEs discovered. All successfully
    looked up addresses get their own reverse IP lookup process."""
    debug("IP2INFO", 2, "addr lookup %s %s %s",
          info.hostname, info.addrs, info.all_names)

    # Report ready if we already have a result.
    if info.addrs:
      return True

    # Do DNS forward lookup for hostname and all CNAMEs.
    def responder(answer, name):
      debug("IP2INFO", 2, "addr result %s from %s: %s", name, info.hostname, answer)
      self._tick_stats(answer[0])
      if name not in info.addrs:
        info.addrs[name] = []
      for ipstr in answer[3]:
        info.all_addrs.update((ipstr,))
        ip = IPAddress(ipstr)
        info.addrs[name].append(ip)
        self._insert_lookup(ip, info)
      task.done = (len(info.addrs) == len(info.all_names))
      task.ongoing = not task.done
      if task.done and answer[0] > adns.status.max_misconfig and not info.all_addrs:
        tasks.append(IPTask(self._notify_hosts, 1))

    for name in info.all_names:
      self._submit(name, rr.A, responder)
    return False
Example #13
  def scrape(self, section, url, images=None, match=None, period=900, urledit=None):
    """
    Register an HTML page to scrape for images matching a pattern.
    Images will be available for retrieval via `image()` using the
    names listed in `images`. If `images` and `match` are None, then
    scrapes images directly, otherwise scrapes images off an HTML
    page.

    :arg str section: identifier for images from this address
    :arg str url: html address where to retrieve page or image
    :arg callable urledit: dynamically modify url before request
    :arg list(str) images: list of image names
    :arg re match: regular expression to match image(s)
    :arg int period: interval in seconds between checks
    """
    debug(self._ID, 1, "%s: scrape %s, images %s match %s period %d",
          section, url, images, match, period)
    if match:
      ContentScraper.scrape(self, section, { "page": (url, urledit) },
                            period = period, content_type = "text/html",
                            convert = lambda t,c,v: \
                              self._images(t, v, match, images))
    else:
      ContentScraper.scrape(self, section[:-1], { section[-1]: (url, urledit) },
                            period = period, content_type = "image/*",
                            convert = self._pngdata)
Example #14
 def _reqinit(self, c, task):
   debug(self._ID, 2, "initialising request to %s (%s)",
         task.url, task.content_type)
   c.headers = []
   c.setopt(pycurl.URL, task.url)
   c.setopt(pycurl.HEADERFUNCTION, c.headers.append)
   c.setopt(pycurl.HTTPHEADER, ["Accept: %s" % task.content_type,
                                "Accept-Encoding: gzip, deflate"])
Example #15
 def responder(hostname, ip):
   debug("IP2INFO", 3, "cidr result %s -> %s", info.ip, hostname)
   task.ongoing = False
   if hostname != None:
     task.done = True
     info.cidrhost = hostname
     debug("IP2INFO", 2, "cidr hostname found %s: %s",
           info.ip, info.cidrhost)
Example #16
 def _reqinit(self, c, task):
     debug(self._ID, 2, "initialising request to %s (%s)", task.url,
           task.content_type)
     c.headers = []
     c.setopt(pycurl.URL, task.url)
     c.setopt(pycurl.HEADERFUNCTION, c.headers.append)
     c.setopt(pycurl.HTTPHEADER, [
         "Accept: %s" % task.content_type, "Accept-Encoding: gzip, deflate"
     ])
Example #17
 def _submit(self, addr, type, callback, *extra):
   """Submit a DNS query.
   @param addr -- the address to look up
   @param type -- request record type (adns.rr.*)
   @param callback -- function to call back if the query is answered
   @param extra -- additional arguments to `callback`.
   """
   debug("IP2INFO", 3, "submitting lookup of %s, type %d", addr, type)
   self.queries[self.res.submit(addr, type)] = \
       (time(), callback, (addr,) + extra)
Example #18
  def _asn_lookup_1(self, info, task, tasks):
    """Perform first step of ASN lookup, by checking reserved addresses."""
    debug("IP2INFO", 2, "asn lookup/reserved %s %s", info.ip, info.asn.asn)

    if info.asn == self.NULL_ASN:
      resv = self._reserved(info.ip)
      if resv:
        info.cidr = resv
        info.domain = str(resv)
        info.asn = self.asnmap["@%s" % info.domain]
    return True
Example #19
  def _geoip_lookup(self, info, task, tasks):
    """Perform GeoIP lookup for an IP address."""
    debug("IP2INFO", 2, "geoip lookup %s %s", info.ip, info.geoip)

    # Report ready if we already have a result.
    if info.geoip != GeoIPLookup.NULL_GEOIP:
      return True

    # Lookup GeoIP info.
    info.geoip = self.gip.lookup(info.ip)
    return True
Example #20
 def responder(hostname, ip):
   debug("IP2INFO", 2, "wild result %s -> %s -> %s %d",
         info.ip, ip, hostname, len(addrs))
   addrs.remove(ip)
   task.ongoing = (len(addrs) > 0)
   if hostname != None:
     task.done = True
     task.ongoing = False
     if not info.wildhost:
       info.wildhost = hostname
       debug("IP2INFO", 2, "wild hostname found %s: %s",
             info.ip, info.wildhost)
Example #21
 def _ptr_result(self, answer, addr, ip):
   """Respond to PTR query results."""
   debug("IP2INFO", 3, "ptr result %s %s %s", addr, ip, answer)
   self._tick_stats(answer[0])
   if answer[0] > adns.status.max_tempfail:
     # permanent failure, remember not to ask again
     debug("IP2INFO", 3, "blacklisting %s %s (%d)",
           ip, _adns_status_name_of(answer[0]), answer[0])
     self.ptrfail.add(ip)
   hostname = (len(answer[3]) > 0 and answer[3][0].lower()) or None
   for callback in self.ptrmap[ip]:
     callback(hostname, ip)
   del self.ptrmap[ip]
Example #22
 def _reqerror(self, c, task, errmsg, errno):
     result = getattr(task, "result", None)
     cherrypy.log(("CACHE ERROR %s request failed with error:"
                   " %s (code %d), headers %s") %
                  (getattr(task, "url", c.getinfo(
                      pycurl.EFFECTIVE_URL)), errmsg, errno, c.headers))
     if result:
         with result["signal"]:
             debug(self._ID, 2, "signaling error on %s, pending %d",
                   task.url, result["pending"])
             if not result["error"]:
                 result["error"] = RuntimeError("http error %s (code %d)" %
                                                (errmsg, errno))
             result["signal"].notifyAll()
Example #23
 def responder(answer, addr):
   debug("IP2INFO", 3, "cc result %s from %s: %s", addr, info.asn.asn, answer)
   self._tick_stats(answer[0])
   task.ongoing = False
   if len(answer[3]) > 0 and len(answer[3][0]) == 1:
     m = RX_ASN_CC.match(answer[3][0][0])
     if m and m.group(1) == info.asn.asn:
       debug("IP2INFO", 2, "cc assigning %s = %s", m.group(1), m.group(2))
       info.asn.cc = m.group(2)
       info.asn.rir = m.group(3)
       info.asn.date = m.group(4)
       info.asn.org = m.group(5)
       info.asn.desc = m.group(7)
       task.done = True
Example #24
 def _reqerror(self, c, task, errmsg, errno):
   result = getattr(task, "result", None)
   cherrypy.log(("CACHE ERROR %s request failed with error:"
                 " %s (code %d), headers %s") %
                (getattr(task, "url", c.getinfo(pycurl.EFFECTIVE_URL)),
                 errmsg, errno, c.headers))
   if result:
     with result["signal"]:
       debug(self._ID, 2, "signaling error on %s, pending %d",
             task.url, result["pending"])
       if not result["error"]:
         result["error"] = RuntimeError("http error %s (code %d)"
                                        % (errmsg, errno))
       result["signal"].notifyAll()
Example #25
 def responder(answer, name):
   debug("IP2INFO", 2, "addr result %s from %s: %s", name, info.hostname, answer)
   self._tick_stats(answer[0])
   if name not in info.addrs:
     info.addrs[name] = []
   for ipstr in answer[3]:
     info.all_addrs.update((ipstr,))
     ip = IPAddress(ipstr)
     info.addrs[name].append(ip)
     self._insert_lookup(ip, info)
   task.done = (len(info.addrs) == len(info.all_names))
   task.ongoing = not task.done
   if task.done and answer[0] > adns.status.max_misconfig and not info.all_addrs:
     tasks.append(IPTask(self._notify_hosts, 1))
Example #26
  def __init__(self, cachedir = None,
               gip = None, psl = None, res = None,
               maxtime = 30, maxtries = 3):
    """Constructor.

    Initialises the lookup object so it is ready for queries.

    @param cachedir -- Default location for caching databases, used if
    `gip` or `psl` have not been specified. If unset and neither `gip`
    nor `psl` arguments are provided, the databases are cached in the
    current directory.

    @param gip -- Reference to GeoIPLookup object. If None, a new
    object is created, using `cachedir` or current directory as the
    location for the city database file.

    @param psl -- Reference to PublicSuffixLookup object. If None, a
    new object is created, using `cachedir` or current directory as
    the location for the YAML cache database.

    @param res -- Reference to adns DNS resolver object. If None, a
    new resolver is created. If you want to use a nameserver other
    than your system default one, pass in custom adns object created
    with the appropriate "nameserver x.y.z.w" resolver argument.

    @param maxtime -- The maximum time to wait for DNS replies. Some
    DNS servers are slow to respond so some queries take a long time
    to complete, or will simply time out. If the client is submitting
    large numbers of addresses for query, the stragglers are handled
    automatically and there is no reason to reduce the query time-out.
    However if the client has just a few addresses to resolve, or is
    in a hurry to get the answer, set `maxtime` to some smaller value.

    @param maxtries -- The maximum number of times to attempt main
    queries per IP address. In general this value should be greater
    than one to avoid failures resulting from dropped DNS packets and
    to catch straggler responses from slow, far away and somewhat
    misconfigured DNS servers. More than three rarely improves the
    accuracy of the results.
    """
    now = time()
    self.maxtime = maxtime
    self.maxtries = maxtries
    geopath = (cachedir and "%s/GeoLiteCity.dat" % cachedir)
    pslpath = (cachedir and "%s/psl.yml" % cachedir)
    self.res = res or adns.init(adns.iflags.noautosys)
    debug("IP2INFO", 2, "dns resolver initialised %.2f", time() - now)
    self.gip = gip or GeoIPLookup(path = geopath)
    debug("IP2INFO", 2, "geoip resolver initialised %.2f", time() - now)
    self.psl = psl or PublicSuffixLookup(path = pslpath)
    debug("IP2INFO", 2, "domain resolver initialised %.2f", time() - now)
    self.ptrfail = set()
    self.ptrmap = {}
    self.asnmap = self._asninit()
    self.queries = {}
    self.ipaddrs = {}
    self.notify = {}
    self.resstat = {}
    debug("IP2INFO", 1, "initialisation complete %.2f", time() - now)
Example #27
 def responder(answer, addr):
   debug("IP2INFO", 2, "cname result %s from %s: %s",
         addr, info.hostname, answer)
   self._tick_stats(answer[0])
   task.done = True
   task.ongoing = False
   if addr not in info.cnames:
     info.cnames[addr] = []
   for cname in answer[3]:
     info.all_names.update((cname,))
     cnames = info.cnames[addr]
     cnames.append(cname)
     if len(cnames) < 5:
       self._submit(cname, rr.CNAME, responder)
       task.ongoing = True
       task.done = False
Example #28
 def _pngdata(self, task, c, imgdata):
   """Return image `data` as PNG image, using MIME type `format`.
   Returns `data` as is if `format` is image/png, otherwise converts
   the `data` into PNG format and returns that instead."""
   ctype = c.getinfo(pycurl.CONTENT_TYPE)
   if not (ctype and ctype.startswith("image/")):
     cherrypy.log("SCRAPER ERROR %s content type '%s' not an image, headers %s" %
                  (c.getinfo(pycurl.EFFECTIVE_URL), ctype, c.headers))
     return None
   elif ctype != 'image/png':
     debug(self._ID, 3, "%s: converting image %s to png", task.key, ctype)
     png = StringIO()
     PILImage.open(StringIO(imgdata)).save(png, "PNG")
     imgdata = png.getvalue()
     png.close()
   return imgdata
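The conversion step in isolation; a small sketch using the same PIL calls (StringIO is the Python 2 module used throughout this code; io.BytesIO is the Python 3 counterpart):

from StringIO import StringIO
from PIL import Image as PILImage

def to_png(imgdata):
    # Re-encode arbitrary image bytes as PNG, as _pngdata() does for
    # any non-PNG image content type.
    png = StringIO()
    PILImage.open(StringIO(imgdata)).save(png, "PNG")
    try:
        return png.getvalue()
    finally:
        png.close()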
Example #29
  def _notify_hosts(self, info, task, tasks):
    """Notify completely looked up host objects."""
    assert isinstance(info, HostInfo)
    key = ("name", info.hostname)
    assert key in self.notify
    debug("IP2INFO", 2, "notify callbacks host %s %d %s %s",
          info.hostname, len(self.notify[key]), info.all_addrs, info.ipaddrs)
    assert len(info.all_addrs) == len(info.ipaddrs)
    debug("IP2INFO", 2, "notify callbacks host %s %d",
          info.hostname, len(self.notify[key]))

    callbacks = self.notify[key]
    del self.notify[key]
    for f in callbacks:
      f(info, None, 0)

    return True
Example #30
  def _timeout(self):
    """Cancel queries which have timed out after `self.maxtime`."""

    # Scan for queries which have been going on for too long.
    now = time()
    expired = []
    for q, info in self.queries.iteritems():
      if now > info[0] + self.maxtime:
        expired.append((q, info))

    # Now expire them. Call the callbacks so the tasks move on.
    debug("IP2INFO", 3, "cancelling %d timed out queries", len(expired))
    for q, info in expired:
      (created, callback, args) = info
      del self.queries[q]
      q.cancel()
      callback((-1, None, None, tuple()), *args)
Example #31
  def _asn_lookup_3(self, info, task, tasks):
    """Perform third step of AS lookups for IP addresses by using
    cymru.com reverse mapping DNS servers."""
    debug("IP2INFO", 2, "asn lookup/cymru %s %s", info.ip, info.asn.asn)

    # Report ready if we already have a result.
    if info.asn != self.NULL_ASN:
      return True

    # Define responder to the cymru.com ASN lookup query. Expects a
    # 1-tuple "ASN | CIDR | CC | RIR | YYYY-MM-DD" answer. Keeps the
    # last record of the answer received, as it is the most specific
    # CIDR. If this creates a new ASInfo, it requests an ASN CC lookup too.
    def responder(answer, addr):
      debug("IP2INFO", 3, "cymru result %s from %s: %s", addr, info.ip, answer)
      self._tick_stats(answer[0])
      task.ongoing = False
      if len(answer[3]) > 0 and len(answer[3][-1]) == 1:
        m = RX_ASN.match(answer[3][-1][0])
        if m:
          task.done = True
          if m.group(1) in self.asnmap:
            debug("IP2INFO", 3, "cymru existing asn %s", m.group(1))
            info.asn = self.asnmap[m.group(1)]
          else:
            debug("IP2INFO", 2, "cymru new asn %s, cidr %s, cc %s",
                  m.group(1), m.group(2), m.group(3))
            tasks.insert(1, IPTask(self._asn_lookup_cc, 2))
            info.asn = self.asnmap[m.group(1)] = \
                ASInfo(asn = m.group(1),
                       cc = m.group(3),
                       rir = m.group(4),
                       date = m.group(5))

    # Do reverse TXT lookup on IP address from cymru.com DNS.
    revaddr = info.ip.reverse_dns
    if revaddr.endswith(".in-addr.arpa."):
      rev = revaddr[:-14] + ".origin.asn.cymru.com"
    elif revaddr.endswith(".ip6.arpa."):
      rev = revaddr[:-10] + ".origin6.asn.cymru.com"
    else:
      assert False, "reverse address %s not recognised" % revaddr
    debug("IP2INFO", 3, "submitting asn lookup %s", rev)
    self._submit(rev, rr.TXT, responder)
    return False
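The query-name derivation in isolation: the reversed-octet form has its `.in-addr.arpa.` tail (14 characters) swapped for the cymru.com zone, using the same netaddr property as the code above:

from netaddr import IPAddress

revaddr = IPAddress("192.0.2.10").reverse_dns
# revaddr == "10.2.0.192.in-addr.arpa."
rev = revaddr[:-14] + ".origin.asn.cymru.com"
print(rev)  # 10.2.0.192.origin.asn.cymru.com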
Example #32
 def _pngdata(self, task, c, imgdata):
     """Return image `data` as PNG image, using MIME type `format`.
 Returns `data` as is if `format` is image/png, otherwise converts
 the `data` into PNG format and returns that instead."""
     ctype = c.getinfo(pycurl.CONTENT_TYPE)
     if not (ctype and ctype.startswith("image/")):
         cherrypy.log(
             "SCRAPER ERROR %s content type '%s' not an image, headers %s" %
             (c.getinfo(pycurl.EFFECTIVE_URL), ctype, c.headers))
         return None
     elif ctype != 'image/png':
         debug(self._ID, 3, "%s: converting image %s to png", task.key,
               ctype)
         png = StringIO()
         PILImage.open(StringIO(imgdata)).save(png, "PNG")
         imgdata = png.getvalue()
         png.close()
     return imgdata
Example #33
    def __init__(self, appconfig):
        debug(self._ID, 1, "creating new content cache")
        Thread.__init__(self, name="ContentCache")
        self._ssl = SSLOptions(key_file=appconfig.x509key,
                               cert_file=appconfig.x509cert,
                               ca_path=appconfig.x509cadir)

        self._reqman = RequestManager(num_connections=10,
                                      ssl_opts=self._ssl,
                                      user_agent=self._ident,
                                      handle_init=self._hinit,
                                      request_init=self._reqinit,
                                      request_error=self._reqerror,
                                      request_respond=self._reqdone)
        self._cv = Condition()
        self._stopme = False
        self._values = {}
        cherrypy.engine.subscribe('start', self.start)
        cherrypy.engine.subscribe('stop', self.stop, priority=100)
Example #34
  def _hostname_lookup(self, info, task, tasks):
    """Issue reverse name lookup for an IP address."""
    debug("IP2INFO", 2, "hostname lookup %s %s %s",
          info.ip, info.hostname, info.domain)

    # Report ready if we already have a result.
    if info.hostname or info.domain:
      return True

    # Do DNS reverse hostname lookup.
    def responder(hostname, ip):
      debug("IP2INFO", 2, "hostname %s -> %s", info.ip, hostname)
      task.ongoing = False
      if hostname != None:
        info.hostname = hostname
        task.done = True

    self._submit_ptr(responder, info.ip, rr.PTRraw)
    return False
Example #35
  def _domain_lookup(self, info, task, tasks):
    """Look up domain part based on whatever name we managed to get."""
    debug("IP2INFO", 2, "domain lookup %s %s %s",
          info.ip, info.hostname, info.domain)

    if not info.domain:
      if info.hostname:
        info.domain = self.psl.domain(info.hostname)
      elif info.cidrhost:
        info.domain = self.psl.domain(info.cidrhost)
      elif info.wildhost:
        info.domain = self.psl.domain(info.wildhost)
      elif info.asn and info.asn.asn:
        info.domain = "AS#%s (%s)" % (info.asn.asn, info.asn.org)

    if not info.hostname:
      info.hostname = str(info.ip)

    return True
Example #36
  def __init__(self, appconfig):
    debug(self._ID, 1, "creating new content cache")
    Thread.__init__(self, name = "ContentCache")
    self._ssl = SSLOptions(key_file = appconfig.x509key,
                           cert_file = appconfig.x509cert,
                           ca_path = appconfig.x509cadir)

    self._reqman = RequestManager(num_connections = 10,
                                  ssl_opts = self._ssl,
                                  user_agent = self._ident,
                                  handle_init = self._hinit,
                                  request_init = self._reqinit,
                                  request_error = self._reqerror,
                                  request_respond = self._reqdone)
    self._cv = Condition()
    self._stopme = False
    self._values = {}
    cherrypy.engine.subscribe('start', self.start)
    cherrypy.engine.subscribe('stop', self.stop, priority=100)
Example #37
  def purge(self):
    """Purge cached information and reload databases if possible."""
    now = time()
    if self.queries:
      return

    for _, tasks in self.ipaddrs.values():
      if tasks:
        return

    assert not self.ptrmap
    assert not self.queries
    assert not self.notify
    self.ptrfail = set()
    self.asnmap = self._asninit()
    self.ipaddrs = {}
    self.resstat = {}
    self.gip.reload()
    self.psl.reload()
    debug("IP2INFO", 1, "reload complete %.2f", time() - now)
Example #38
 def __call__(self, info, origin, remain):
     debug("HOSTDATA", 3, "replied to %s", self)
     if self.kind == "name":
         if isinstance(info, HostInfo):
             if remain:
                 debug(
                     "HOSTDATA", 2,
                     "host %s: %d out of %d host addresses, waiting for remaining %d",
                     info.hostname, len(info.ipaddrs), len(info.all_addrs),
                     remain)
             else:
                 assert info.hostname in self.pending
                 self.pending.remove(info.hostname)
                 debug(
                     "HOSTDATA", 1,
                     "host %s: all %d addresses resolved, %d requests remain",
                     info.hostname, len(info.ipaddrs), len(self.pending))
                 with self.signal:
                     if not self.result:
                         self.result = []
                     self.result.append(self._hostinfo(info))
                     if not self.pending:
                         self.signal.notifyAll()
         else:
             debug("HOSTDATA", 1, "%s: ignoring address update for %s",
                   (origin and origin.hostname), info.ip)
     elif self.kind == "ip":
         assert isinstance(info, IPInfo)
         assert info.ip in self.pending
         assert not remain
         self.pending.remove(info.ip)
         debug("HOSTDATA", 1, "ip %s: address resolved, %d requests remain",
               info.ip, len(self.pending))
         with self.signal:
             if not self.result:
                 self.result = []
             self.result.append(self._ipinfo(info))
             if not self.pending:
                 self.signal.notifyAll()
     else:
         assert False, "internal error, lookup neither host nor ip"
Example #39
    def run(self):
        with self._cv:
            while not self._stopme:
                npending = 0
                ncurreq = len(self._requests)

                # Insert any new requests. If they fail, remember the error.
                for r in self._requests:
                    if not r.reply.submitted:
                        debug("HOSTDATA", 1, "submitting request: %s %s",
                              r.kind, r.hosts)
                        r.reply.submitted = True
                        try:
                            self._ip2i.submit(r.hosts,
                                              kind=r.kind,
                                              callback=r.reply)
                        except Exception, e:
                            r.reply.error = e

                # Pump any pending lookups for up to .25 seconds. Note that this
                # will wait only as long as needed, and will quit immediately
                # if there is no work at all. It's not unusual we need to wait
                # longer than this for final results; see the check further on.
                try:
                    self._cv.release()
                    npending = self._ip2i.process(.25)
                finally:
                    self._cv.acquire()

                # Post-process requests. Remove fully completed, expired and
                # failed lookups from the request queue.
                nmodified = 0
                now = time.time()
                for r in self._requests[:]:
                    rr = r.reply
                    if rr.finished:
                        debug("HOSTDATA", 2, "request completed: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        nmodified += 1
                    elif rr.submitted and rr.until < now:
                        debug("HOSTDATA", 1, "request has expired: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        with rr.signal:
                            rr.error = RuntimeError(
                                "maximum wait time exhausted")
                            rr.signal.notifyAll()
                        nmodified += 1
                    elif rr.submitted and rr.error:
                        debug("HOSTDATA", 1, "request failed: %s %s", r.kind,
                              r.hosts)
                        self._requests.remove(r)
                        with rr.signal:
                            rr.signal.notifyAll()
                        nmodified += 1

                # Wait to be notified, but only if we don't already have work to do.
                skipwait = (self._stopme or npending or nmodified
                            or len(self._requests) != ncurreq)
                debug("HOSTDATA", 2,
                      ("wait for signal, %d pending, %d requests"
                       " now vs. %d before, %d modified: %s"), npending,
                      len(self._requests), ncurreq, nmodified,
                      (skipwait and "skipping unnecessary wait") or "waiting")
                if not skipwait:
                    if now - self._last_purge > self._PURGE_INTERVAL:
                        self._purge()
                    self._cv.wait((self._requests and 0.25) or None)
                    debug("HOSTDATA", 2, "wait done")
Example #40
 def stop(self):
     debug("HOSTDATA", 1, "requesting to stop resolved thread")
     with self._cv:
         self._stopme = True
         self._cv.notifyAll()
Example #41
 def _purge(self):
     now = time.time()
     debug("HOSTDATA", 1, "purging address resolver")
     self._last_purge = time.time()
     self._ip2i.purge()
Example #42
class HostCache(Thread):
    """Utility to resolve host information."""
    _PURGE_INTERVAL = 4 * 3600
    _NUM_SIGS = 8

    def __init__(self, statedir):
        Thread.__init__(self, name="HostCache")
        self._ip2i = IPResolver(cachedir=statedir, maxtime=15)
        self._cv = Condition()
        self._stopme = False
        self._requests = []
        self._last_purge = time.time()
        self._signals = map(lambda x: Condition(), xrange(0, self._NUM_SIGS))
        cherrypy.engine.subscribe('start', self.start)
        cherrypy.engine.subscribe('stop', self.stop, priority=100)

    def _purge(self):
        now = time.time()
        debug("HOSTDATA", 1, "purging address resolver")
        self._last_purge = time.time()
        self._ip2i.purge()

    def statistics(self):
        with self._cv:
            return self._ip2i.statistics()

    def reset_statistics(self):
        with self._cv:
            self._ip2i.reset_statistics()

    def purge(self):
        with self._cv:
            self._purge()

    def stop(self):
        debug("HOSTDATA", 1, "requesting to stop resolved thread")
        with self._cv:
            self._stopme = True
            self._cv.notifyAll()

    def lookup(self, kind, hosts, maxwait=30):
        """
    Lookup information either by IP address or host name.

    :arg str kind: "ip" or "name"
    :arg list hosts: list of host names or IP address strings to resolve
    :arg float maxwait: maximum time in seconds to wait for a result.
    """
        reply = Reply()
        reply.kind = kind
        reply.until = time.time() + maxwait
        reply.signal = random.choice(self._signals)
        reply.pending = set(hosts)

        with self._cv:
            self._requests.append(Task(kind, hosts, reply))
            self._cv.notifyAll()

        with reply.signal:
            while True:
                if self._stopme:
                    raise RuntimeError("server stopped")
                elif reply.error:
                    raise reply.error
                elif not reply.pending:
                    reply.finished = True
                    return reply.result
                else:
                    reply.signal.wait()

    def run(self):
        with self._cv:
            while not self._stopme:
                npending = 0
                ncurreq = len(self._requests)

                # Insert any new requests. If they fail, remember the error.
                for r in self._requests:
                    if not r.reply.submitted:
                        debug("HOSTDATA", 1, "submitting request: %s %s",
                              r.kind, r.hosts)
                        r.reply.submitted = True
                        try:
                            self._ip2i.submit(r.hosts,
                                              kind=r.kind,
                                              callback=r.reply)
                        except Exception, e:
                            r.reply.error = e

                # Pump any pending lookups for up to .25 seconds. Note that this
                # will wait only as long as needed, and will quit immediately
                # if there is no work at all. It's not unusual we need to wait
                # longer than this for final results; see the check further on.
                try:
                    self._cv.release()
                    npending = self._ip2i.process(.25)
                finally:
                    self._cv.acquire()

                # Post-process requests. Remove fully completed, expired and
                # failed lookups from the request queue.
                nmodified = 0
                now = time.time()
                for r in self._requests[:]:
                    rr = r.reply
                    if rr.finished:
                        debug("HOSTDATA", 2, "request completed: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        nmodified += 1
                    elif rr.submitted and rr.until < now:
                        debug("HOSTDATA", 1, "request has expired: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        with rr.signal:
                            rr.error = RuntimeError(
                                "maximum wait time exhausted")
                            rr.signal.notifyAll()
                        nmodified += 1
                    elif rr.submitted and rr.error:
                        debug("HOSTDATA", 1, "request failed: %s %s", r.kind,
                              r.hosts)
                        self._requests.remove(r)
                        with rr.signal:
                            rr.signal.notifyAll()
                        nmodified += 1

                # Wait to be notified, but only if we don't already have work to do.
                skipwait = (self._stopme or npending or nmodified
                            or len(self._requests) != ncurreq)
                debug("HOSTDATA", 2,
                      ("wait for signal, %d pending, %d requests"
                       " now vs. %d before, %d modified: %s"), npending,
                      len(self._requests), ncurreq, nmodified,
                      (skipwait and "skipping unnecessary wait") or "waiting")
                if not skipwait:
                    if now - self._last_purge > self._PURGE_INTERVAL:
                        self._purge()
                    self._cv.wait((self._requests and 0.25) or None)
                    debug("HOSTDATA", 2, "wait done")

        debug("HOSTDATA", 1, "server thread stopped")
Example #43
    def run(self):
        with self._cv:
            while not self._stopme:
                debug(self._ID, 1, "executing scrape cycle")
                now = time.time()
                for s in self._scrape:
                    for title, url in s["urls"].iteritems():
                        key = s["section"] + (title, )
                        debug(self._ID, 3, "%s: considering %s", key, url)
                        _, val = self._get(key)
                        if val.expires < now:
                            if isinstance(url, tuple):
                                url, urledit = url
                                if urledit:
                                    url = urledit(url)
                            debug(self._ID, 2,
                                  "%s: refetching expired %s (%.2f ago)", key,
                                  url, now - val.expires)
                            self._reqman.put(
                                Task(url, key, s["period"], s["content_type"],
                                     s["convert"], None))

                debug(self._ID, 1, "processing requests")
                self._reqman.process(lock=self._cv.acquire,
                                     unlock=self._cv.release)

                debug(self._ID, 1, "waiting")
                if not self._stopme:
                    self._cv.wait(30)
                debug(self._ID, 1, "wait done")

        debug(self._ID, 1, "server thread stopped")
Example #44
 def __init__(self, appconfig):
     debug(self._ID, 1, "creating new content scraper")
     ContentCache.__init__(self, appconfig)
     self._scrape = []
Example #45
    def _reqdone(self, c):
        result = c.task.result

        try:
            code = c.getinfo(pycurl.HTTP_CODE)
            debug(self._ID, 2, "request done %s => http %d", c.task.url, code)
            if code != 200:
                raise RuntimeError("http response %d from %s" %
                                   (code, c.task.url))

            value = c.buffer.getvalue()
            for h in c.headers:
                m = RX_CONTENT_ENCODING.match(h)
                if m:
                    enc = m.group(1)
                    if enc == "deflate":
                        debug(self._ID, 3, "decompressing deflated content")
                        value = zlib.decompress(value, -zlib.MAX_WBITS)
                    elif enc == "gzip":
                        debug(self._ID, 3, "decompressing gzipped content")
                        value = gzip.GzipFile(fileobj=StringIO(value)).read()
                    else:
                        cherrypy.log(
                            "WARNING: ignoring content encoding %s for %s" %
                            (enc, c.task.url))

            if c.task.convert:
                debug(self._ID, 3, "converting value for %s, len %d",
                      c.task.url, len(value))
                value = c.task.convert(c.task, c, value)

            if value:
                debug(self._ID, 1, "storing value for %s into %s, expires %d",
                      c.task.url, c.task.key, c.task.period)
                self._store(c.task, value)

            if result:
                with result["signal"]:
                    debug(self._ID, 2, "signaling result on %s, pending %d",
                          c.task.url, result["pending"])
                    assert result["pending"] > 0
                    result["pending"] -= 1
                    if result["pending"] == 0:
                        result["signal"].notifyAll()
        except Exception, e:
            cherrypy.log(("CACHE ERROR %s processing failed with error:"
                          " %s, headers %s") % (c.task.url, str(e), c.headers))
            for line in traceback.format_exc().rstrip().split("\n"):
                cherrypy.log("  " + line)
            if result:
                with result["signal"]:
                    debug(self._ID, 2, "signaling error on %s, pending %d",
                          c.task.url, result["pending"])
                    if not result["error"]:
                        result["error"] = e
                    result["signal"].notifyAll()
Example #46
    def run(self):
        with self._cv:
            while not self._stopme:
                debug(self._ID, 1, "processing requests")
                self._reqman.process(lock=self._cv.acquire,
                                     unlock=self._cv.release)

                debug(self._ID, 1, "purging values")
                self._purge(time.time(), None, None, self._values)

                debug(self._ID, 1, "waiting")
                if not self._stopme:
                    self._cv.wait()
                debug(self._ID, 1, "wait done")

        debug(self._ID, 1, "server thread stopped, waking waiters")
        for s in self._signals:
            with s:
                s.notifyAll()
        debug(self._ID, 1, "server thread stopped")
Example #47
 def __init__(self, appconfig):
     debug(self._ID, 1, "creating new content proxy")
     ContentCache.__init__(self, appconfig)
     self._signals = map(lambda x: Condition(), xrange(0, self._NUM_SIGS))
Example #48
    def fetch(self,
              section,
              expires,
              urls,
              content_type="application/json",
              convert=None,
              merge=None):
        """
    Retrieve data from URLs, caching it locally for `expires` seconds. Usually
    the content is JSON but it can be something else too, like HTML. All the
    URLs will be fetched, converted using `convert`, stored, then merged to a
    new value with `merge`.

    :arg str section: label for this item
    :arg int expires: maximum time to cache the responses
    :arg str content_type: expected content type in response
    :arg callable convert: response conversion, e.g. cjson.decode
    :arg callable merge: reply post-processor
    :arg dict urls: (title, url) or (title, (url, urledit)) of data to retrieve
    """
        debug(self._ID, 1, "%s: fetch from %s, expires %d, content type %s",
              section, urls, expires, content_type)
        if len(urls) > 1 and not merge:
            raise ValueError("merge needed to reduce %s from %s" %
                             (section, urls))

        if not merge:
            merge = lambda group: group[urls.keys()[0]].data

        if isinstance(section, basestring):
            section = (section, )

        now = time.time()
        merged = section + ("merged", )
        signal = self._signals[(hash(merged) >> 24) % self._NUM_SIGS]
        reply = {"pending": 0, "error": None, "signal": signal}

        with self._cv:
            if not self._has(merged):
                debug(self._ID, 2, "%s: inserting null value", merged)
                self._put(merged, 0, None)

            for title, url in urls.iteritems():
                key = section + (title, )
                if self._has(key):
                    _, val = self._get(key)
                    if val.expires >= now:
                        debug(self._ID, 2, "%s: valid value for %s", key, url)
                        continue
                else:
                    debug(self._ID, 2, "%s: inserting null value for %s", key,
                          url)
                    self._put(key, 0, None)

                if isinstance(url, tuple):
                    url, urledit = url
                    if urledit:
                        url = urledit(url)

                reply["pending"] += 1
                self._reqman.put(
                    Task(url, key, expires, content_type, convert, reply))
                debug(self._ID, 2, "%s: requested %s", key, url)

            if reply["pending"]:
                debug(self._ID, 3, "%s: signaling requests", section)
                self._cv.notifyAll()

        with signal:
            while True:
                if self._stopme:
                    debug(self._ID, 3, "%s: reply cancelled for stop", merged)
                    raise RuntimeError("server stopped")
                elif reply["error"]:
                    debug(self._ID, 2, "%s: reply was an error", merged)
                    raise reply["error"]
                elif not reply["pending"]:
                    debug(self._ID, 2, "%s: reply complete", merged)
                    break
                else:
                    debug(self._ID, 3, "%s: waiting for reply", merged)
                    signal.wait()

        with self._cv:
            newval = None
            now = time.time()
            if not self._has(merged):
                # unlikely but possible it got removed
                debug(self._ID, 2, "%s: replacing lost key", merged)
                self._put(merged, 0, None)
            group, val = self._get(merged)
            if val.expires >= now:
                debug(self._ID, 1, "%s: returning valid value", merged)
                return val.data
            else:
                debug(self._ID, 2, "%s: merging new value", merged)
                newval = merge(group)
                self._put(merged, now + expires, newval)
                return newval
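A hypothetical call into `fetch()` above, pulling two JSON documents, caching them for five minutes, and merging them into one dict; `merge` receives the group of per-title values, each carrying its converted `data`:

import json

def combined_status(proxy):
    # `proxy` is assumed to be an instance of the class defining
    # fetch() above; the URLs and titles are placeholders.
    return proxy.fetch("status", 300,
                       {"jobs": "http://example.org/jobs.json",
                        "sites": "http://example.org/sites.json"},
                       convert=lambda task, c, body: json.loads(body),
                       merge=lambda group: {"jobs": group["jobs"].data,
                                            "sites": group["sites"].data})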