Example 1
def Scrape(browsers,
           urls,
           window_size=(1024, 768),
           window_pos=(0, 0),
           timeout=20,
           save_path=None,
           **kwargs):
    """Invoke one or more browsers over one or more URLs, scraping renders.

    Args:
      browsers: browsers to invoke with optional version strings
      urls: URLs to visit
      window_size: size of the browser window to display
      window_pos: location of browser window
      timeout: time (in seconds) to wait for page to load
      save_path: root of save path, automatically appended with browser and
        version
      kwargs: miscellaneous keyword args, passed to scraper

    Returns:
      None

    @TODO(jhaas): more parameters, or perhaps an indefinite dictionary
    parameter, for things like length of time to wait for timeout, speed
    of mouse clicks, etc. Possibly on a per-browser, per-URL, or
    per-browser-per-URL basis.
    """

    if type(browsers) in types.StringTypes: browsers = [browsers]

    if save_path is None:
        # default save path is "scrapes" off the current root
        save_path = os.path.join(os.path.split(__file__)[0], "Scrapes")

    for browser in browsers:
        # Browsers should be tuples of (browser, version)
        if type(browser) in types.StringTypes: browser = (browser, None)
        scraper = scrapers.GetScraper(browser)

        full_path = os.path.join(save_path, browser[0], scraper.version)
        drivers.windowing.PreparePath(full_path)

        scraper.Scrape(urls, full_path, window_size, window_pos, timeout,
                       kwargs)
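
A hypothetical invocation of this entry point, for illustration only; the browser names, URLs, and save path below are made up, and the enclosing module is assumed to provide scrapers, drivers.windowing, os, and types:

# Hedged usage sketch, not part of the original module.
# A single browser string is accepted (it gets wrapped in a list), as is a
# (browser, version) tuple inside the browsers list.
Scrape("chrome", ["http://www.example.com/"], timeout=30)
Scrape([("firefox", "3.0"), "ie"],
       ["http://www.example.com/", "http://www.example.org/"],
       window_size=(800, 600),
       save_path=r"c:\scrapes")
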
Example 2
def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    url_list = [MaskmakerURL(url.strip()) for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir: scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until every URL either has a
  # successful mask or has hit too many errors, or until we've exceeded the
  # giveup limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print "  %r does not exist, creating" % mask_filename
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print "  Scrape failed with error '%r'" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If there is no difference, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %r scrapes, done!" % scrapes
        else:
          # convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, all others (where the scrape and the baseline differed)
          # to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale.
          #    2. point() maps grayscale values (or the individual channels
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The second parameter to point(), "1", outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # count the number of different pixels
          diff_pixels = diff.getcolors()[0][0]

          # is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshhold. Giving up."
    else:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print "    ", url.url
Example 3
def Iterate(command, iteration_func):
    """Iterates over a list of URLs, calling a function on each.

    Args:
      command: the command line containing the iteration flags
      iteration_func: called for each URL with (url, proc, wnd, response)
    """

    # Retrieve the browser scraper to use to invoke the browser
    scraper = scrapers.GetScraper(
        (command["--browser"], command["--browserver"]))

    def AttachToBrowser(path, timeout):
        """Invoke the browser process and connect to the socket."""
        (proc, frame, wnd) = scraper.GetBrowser(path)

        if not wnd: raise ValueError("Could not invoke browser.")

        # Try to connect the socket. If it fails, wait and try
        # again. Do this for ten seconds
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM,
                          socket.IPPROTO_TCP)

        for attempt in xrange(10):
            try:
                s.connect(("localhost", PORT))
            except socket.error:
                time.sleep(1)
                continue
            break

        try:
            s.getpeername()
        except socket.error:
            raise ValueError("Could not connect to browser")

        if command["--size"]:
            # Resize and reposition the frame
            windowing.MoveAndSizeWindow(frame, (0, 0), command["--size"], wnd)

        s.settimeout(timeout)

        Iterate.proc = proc
        Iterate.wnd = wnd
        Iterate.s = s

    def DetachFromBrowser():
        """Close the socket and kill the process if necessary."""
        if Iterate.s:
            Iterate.s.close()
            Iterate.s = None

        if Iterate.proc:
            if not windowing.WaitForProcessExit(Iterate.proc, 0):
                try:
                    windowing.EndProcess(Iterate.proc)
                    windowing.WaitForProcessExit(Iterate.proc, 0)
                except pywintypes.error:
                    # Exception here most likely means the process died on its own
                    pass
            Iterate.proc = None

    if command["--browserpath"]:
        browser = command["--browserpath"]
    else:
        browser = None

    # Read the URLs from the file
    if command["--url"]:
        url_list = [command["--url"]]
    else:
        startline = command["--startline"]
        if command["--count"]:
            endline = startline + command["--count"]
        else:
            endline = command["--endline"]

        url_list = []
        list_file = open(command["--list"], "r")

        for line in xrange(startline - 1):
            list_file.readline()

        for line in xrange(endline - startline):
            url_list.append(list_file.readline().strip())

    timeout = command["--timeout"]

    # Loop through the URLs and send them through the socket
    Iterate.s = None
    Iterate.proc = None
    Iterate.wnd = None

    for url in url_list:
        # Invoke the browser if necessary
        if not Iterate.proc:
            AttachToBrowser(browser, timeout)
        # Send the URL and wait for a response
        Iterate.s.send(url + "\n")

        response = ""

        while (response.find("\n") < 0):

            try:
                recv = Iterate.s.recv(MAX_URL)
                response = response + recv

                # Workaround for an oddity: when Firefox closes
                # gracefully, somehow Python doesn't detect it.
                # (Telnet does)
                if not recv:
                    raise socket.error

            except socket.timeout:
                response = url + ",hang\n"
                DetachFromBrowser()
            except socket.error:
                # If there was a socket error, it's probably a crash
                response = url + ",crash\n"
                DetachFromBrowser()

        # If we received a timeout response, restart the browser
        if response[-9:] == ",timeout\n":
            DetachFromBrowser()

        # Invoke the iteration function
        iteration_func(url, Iterate.proc, Iterate.wnd, response)

    # We're done
    DetachFromBrowser()
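
Iterate hands each result line to the supplied callback. A minimal, hypothetical iteration_func matching the call made above, (url, proc, wnd, response), might simply log the per-URL outcome:

def LogResult(url, proc, wnd, response):
    """Hypothetical callback: print the raw result line for each URL."""
    # response is the line read back from the browser, or a synthesized
    # "<url>,hang\n" / "<url>,crash\n" on timeout or socket error
    print "%s -> %s" % (url, response.strip())

# Illustrative use, assuming a parsed command dictionary:
# Iterate(command, LogResult)
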
Example 4
def ExecuteCompare2(command):
  """Executes the Compare2 command."""
  if command["--url"]:
    url_list = [command["--url"]]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    url_list = [url.strip() for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  log_file = open(command["--logfile"], "w")

  outdir = command["--outdir"]
  if not outdir: outdir = tempfile.gettempdir()

  scrape_info_list = []

  class ScrapeInfo(object):
    """Helper class to hold information about a scrape."""
    __slots__ = ["browser_path", "scraper", "outdir", "result"]

  for index in xrange(1, 3):
    scrape_info = ScrapeInfo()
    scrape_info.browser_path = command["--browser%d" % index]
    scrape_info.scraper = scrapers.GetScraper(
      (command["--browser"], command["--browser%dver" % index]))

    if command["--browser%dname" % index]:
      scrape_info.outdir = os.path.join(outdir,
                                        command["--browser%dname" % index])
    else:
      scrape_info.outdir = os.path.join(outdir, str(index))

    drivers.windowing.PreparePath(scrape_info.outdir)
    scrape_info_list.append(scrape_info)

  compare = operators.GetOperator("equals_with_mask")

  for url in url_list:
    success = True

    for scrape_info in scrape_info_list:
      scrape_info.result = scrape_info.scraper.Scrape(
        [url], scrape_info.outdir, command["--size"], (0, 0),
        command["--timeout"], path=scrape_info.browser_path)

      if not scrape_info.result:
        scrape_info.result = "success"
      else:
        success = False

    result = "unknown"

    if success:
      result = "equal"

      file1 = drivers.windowing.URLtoFilename(
        url, scrape_info_list[0].outdir, ".bmp")
      file2 = drivers.windowing.URLtoFilename(
        url, scrape_info_list[1].outdir, ".bmp")

      comparison_result = compare.Compare(file1, file2,
                                          maskdir=command["--maskdir"])

      if comparison_result is not None:
        result = "not-equal"

        if command["--diffdir"]:
          comparison_result[1].save(
            drivers.windowing.URLtoFilename(url, command["--diffdir"], ".bmp"))

    # TODO(jhaas): maybe use the logging module rather than raw file writes
    log_file.write("%s %s %s %s\n" % (url,
                                      scrape_info_list[0].result,
                                      scrape_info_list[1].result,
                                      result))
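
Each line Compare2 writes to the log has four space-separated fields: the URL, the two per-browser scrape results, and the comparison verdict. A small, hypothetical post-processing helper that tallies the verdicts (the log filename is illustrative):

def SummarizeCompareLog(log_path):
  """Hypothetical helper: count the comparison verdicts in a Compare2 log."""
  counts = {}
  for line in open(log_path):
    fields = line.split()
    if not fields:
      continue
    verdict = fields[-1]    # "equal", "not-equal" or "unknown"
    counts[verdict] = counts.get(verdict, 0) + 1
  return counts

# print SummarizeCompareLog("compare2.log")
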
Example 5
def ExecuteTimeLoad(command):
  """Executes the TimeLoad command."""
  browsers = command["--browsers"].split(",")
  num_browsers = len(browsers)

  if command["--browserversions"]:
    browser_versions = command["--browserversions"].split(",")
  else:
    browser_versions = [None] * num_browsers

  if command["--browserpaths"]:
    browser_paths = command["--browserpaths"].split(",")
  else:
    browser_paths = [None] * num_browsers

  if len(browser_versions) != num_browsers:
    raise ValueError(
      "--browserversions must be same length as --browsers")
  if len(browser_paths) != num_browsers:
    raise ValueError(
      "--browserpaths must be same length as --browsers")

  unknown_browsers = [b for b in browsers if b not in ["chrome", "ie", "firefox"]]
  if unknown_browsers:
    raise ValueError("unknown browsers: %r" % unknown_browsers)

  scraper_list = []

  for b in xrange(num_browsers):
    version = browser_versions[b]
    if not version: version = None

    scraper = scrapers.GetScraper( (browsers[b], version) )
    if not scraper:
      raise ValueError("could not find scraper for (%r, %r)" %
        (browsers[b], version))
    scraper_list.append(scraper)

  if command["--url"]:
    url_list = [command["--url"]]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    url_list = [url.strip() for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  log_file = open(command["--logfile"], "w")

  log_file.write("URL")
  for b in xrange(num_browsers):
    log_file.write(",%s" % browsers[b])

    if browser_versions[b]: log_file.write(" %s" % browser_versions[b])
  log_file.write("\n")

  results = {}
  for url in url_list:
    results[url] = [None] * num_browsers

  for b in xrange(num_browsers):
    result = scraper_list[b].Time(url_list, command["--size"],
      command["--timeout"],
      path=browser_paths[b])

    for (url, time) in result:
      results[url][b] = time

  # output the results
  for url in url_list:
    log_file.write(url)
    for b in xrange(num_browsers):
      log_file.write(",%r" % results[url][b])
    log_file.write("\n")