def Scrape(browsers, urls, window_size=(1024, 768),
           window_pos=(0, 0), timeout=20, save_path=None, **kwargs):
  """Invoke one or more browsers over one or more URLs, scraping renders.

  Args:
    browsers: browsers to invoke with optional version strings
    urls: URLs to visit
    window_size: size of the browser window to display
    window_pos: location of browser window
    timeout: time (in seconds) to wait for page to load
    save_path: root of save path, automatically appended with browser and
      version
    kwargs: miscellaneous keyword args, passed to scraper

  Returns:
    None

  @TODO(jhaas): more parameters, or perhaps an indefinite dictionary
  parameter, for things like length of time to wait for timeout, speed
  of mouse clicks, etc. Possibly on a per-browser, per-URL, or
  per-browser-per-URL basis
  """
  if type(browsers) in types.StringTypes: browsers = [browsers]

  if save_path is None:
    # Default save path is "Scrapes" off the current root
    save_path = os.path.join(os.path.split(__file__)[0], "Scrapes")

  for browser in browsers:
    # Browsers should be tuples of (browser, version)
    if type(browser) in types.StringTypes: browser = (browser, None)
    scraper = scrapers.GetScraper(browser)

    full_path = os.path.join(save_path, browser[0], scraper.version)
    drivers.windowing.PreparePath(full_path)

    scraper.Scrape(urls, full_path, window_size, window_pos, timeout, kwargs)
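

# A minimal usage sketch (illustrative, not part of the original tool): scrape
# the same two URLs in Chrome and IE, saving renders under the default
# "Scrapes" directory. The browser names and URLs are assumptions; the names
# must match scrapers registered with the scrapers module on this machine.
def _ExampleScrape():
  """Illustrative only; shows how Scrape() is typically invoked."""
  Scrape(["chrome", "ie"],
         ["http://www.example.com/", "http://www.example.org/"],
         window_size=(1024, 768),
         timeout=30)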


def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    url_list = [MaskmakerURL(url.strip()) for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir: scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print " %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print " Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print " %r does not exist, creating" % mask_filename
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print " No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print " Scrape failed with error '%r'" % result
        url.errors += 1
        if url.errors >= errors:
          print " ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If there is no difference, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print " Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print " ** No change for %r scrapes, done!" % scrapes
        else:
          # Convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, all others (where the scrape and the baseline differed)
          # to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The "1" second parameter to point() outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # Count the number of different pixels
          diff_pixels = diff.getcolors()[0][0]

          # Is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print (" Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print " Scrape differed in %d pixels, updating mask" % diff_pixels
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # Reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshold. Giving up."
    else:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print " %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print " ", url.url
  print " %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print " ", url.url
  if scrape_pass >= command["--giveup"]:
    print (" %d masks were not completed before "
           "reaching the giveup threshold" % len(url_list))
    for url in url_list:
      print " ", url.url
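

# A standalone sketch of the mask-update step above (illustrative only): any
# pixel where two renders differ becomes black (0) in the mask, so it is
# ignored by later comparisons. The function name and arguments are made up;
# the image operations mirror the ones used in ExecuteMaskmaker.
def _ExampleMaskUpdate(scrape, baseline, mask):
  """Returns an updated "1"-mode mask given two PIL images and a "1" mask."""
  # Difference of the two renders, restricted to the still-unmasked pixels
  diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                             mask.convert(scrape.mode))
  # Grayscale value 0 (identical) maps to white (keep); any nonzero
  # difference maps to black (mask out), without dithering
  diff = diff.convert("L").point([255] + [0] * 255, "1")
  # Black pixels in diff punch holes in the existing mask
  return ImageChops.multiply(mask, diff)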


def Iterate(command, iteration_func):
  """Iterates over a list of URLs, calling a function on each.

  Args:
    command: the command line containing the iteration flags
    iteration_func: called for each URL with (url, proc, wnd, response)
  """

  # Retrieve the browser scraper to use to invoke the browser
  scraper = scrapers.GetScraper(
    (command["--browser"], command["--browserver"]))

  def AttachToBrowser(path, timeout):
    """Invoke the browser process and connect to the socket."""
    (proc, frame, wnd) = scraper.GetBrowser(path)

    if not wnd: raise ValueError("Could not invoke browser.")

    # Try to connect the socket. If it fails, wait and try
    # again. Do this for ten seconds
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)

    for attempt in xrange(10):
      try:
        s.connect(("localhost", PORT))
      except socket.error:
        time.sleep(1)
        continue
      break

    try:
      s.getpeername()
    except socket.error:
      raise ValueError("Could not connect to browser")

    if command["--size"]:
      # Resize and reposition the frame
      windowing.MoveAndSizeWindow(frame, (0, 0), command["--size"], wnd)

    s.settimeout(timeout)

    Iterate.proc = proc
    Iterate.wnd = wnd
    Iterate.s = s

  def DetachFromBrowser():
    """Close the socket and kill the process if necessary."""
    if Iterate.s:
      Iterate.s.close()
      Iterate.s = None

    if Iterate.proc:
      if not windowing.WaitForProcessExit(Iterate.proc, 0):
        try:
          windowing.EndProcess(Iterate.proc)
          windowing.WaitForProcessExit(Iterate.proc, 0)
        except pywintypes.error:
          # Exception here most likely means the process died on its own
          pass
      Iterate.proc = None

  if command["--browserpath"]:
    browser = command["--browserpath"]
  else:
    browser = None

  # Read the URLs from the file
  if command["--url"]:
    url_list = [command["--url"]]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]

    url_list = []
    file = open(command["--list"], "r")
    for line in xrange(startline - 1):
      file.readline()
    for line in xrange(endline - startline):
      url_list.append(file.readline().strip())

  timeout = command["--timeout"]

  # Loop through the URLs and send them through the socket
  Iterate.s = None
  Iterate.proc = None
  Iterate.wnd = None
  for url in url_list:
    # Invoke the browser if necessary
    if not Iterate.proc:
      AttachToBrowser(browser, timeout)

    # Send the URL and wait for a response
    Iterate.s.send(url + "\n")

    response = ""

    while response.find("\n") < 0:
      try:
        recv = Iterate.s.recv(MAX_URL)
        response = response + recv

        # Workaround for an oddity: when Firefox closes
        # gracefully, somehow Python doesn't detect it.
        # (Telnet does)
        if not recv:
          raise socket.error

      except socket.timeout:
        response = url + ",hang\n"
        DetachFromBrowser()
      except socket.error:
        # If there was a socket error, it's probably a crash
        response = url + ",crash\n"
        DetachFromBrowser()

    # If we received a timeout response, restart the browser
    if response[-9:] == ",timeout\n":
      DetachFromBrowser()

    # Invoke the iteration function
    iteration_func(url, Iterate.proc, Iterate.wnd, response)

  # We're done
  DetachFromBrowser()
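

# An illustrative callback sketch (not part of the tool): Iterate() invokes
# the supplied function with (url, proc, wnd, response). The status strings
# below are the ones Iterate() itself produces for hangs, crashes and
# timeouts; anything else is treated as a normal response.
def _ExampleIterationFunc(url, proc, wnd, response):
  """Prints a one-line status for each URL visited by Iterate()."""
  if response.endswith(",hang\n"):
    status = "hang"
  elif response.endswith(",crash\n"):
    status = "crash"
  elif response.endswith(",timeout\n"):
    status = "timeout"
  else:
    status = "ok"
  print "%s: %s" % (url, status)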


def ExecuteCompare2(command):
  """Executes the Compare2 command."""
  if command["--url"]:
    url_list = [command["--url"]]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    url_list = [url.strip() for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  log_file = open(command["--logfile"], "w")

  outdir = command["--outdir"]
  if not outdir: outdir = tempfile.gettempdir()

  scrape_info_list = []

  class ScrapeInfo(object):
    """Helper class to hold information about a scrape."""
    __slots__ = ["browser_path", "scraper", "outdir", "result"]

  for index in xrange(1, 3):
    scrape_info = ScrapeInfo()
    scrape_info.browser_path = command["--browser%d" % index]
    scrape_info.scraper = scrapers.GetScraper(
      (command["--browser"], command["--browser%dver" % index]))

    if command["--browser%dname" % index]:
      scrape_info.outdir = os.path.join(outdir,
                                        command["--browser%dname" % index])
    else:
      scrape_info.outdir = os.path.join(outdir, str(index))

    drivers.windowing.PreparePath(scrape_info.outdir)

    scrape_info_list.append(scrape_info)

  compare = operators.GetOperator("equals_with_mask")

  for url in url_list:
    success = True

    for scrape_info in scrape_info_list:
      scrape_info.result = scrape_info.scraper.Scrape(
        [url], scrape_info.outdir, command["--size"], (0, 0),
        command["--timeout"], path=scrape_info.browser_path)

      if not scrape_info.result:
        scrape_info.result = "success"
      else:
        success = False

    result = "unknown"

    if success:
      result = "equal"

      file1 = drivers.windowing.URLtoFilename(
        url, scrape_info_list[0].outdir, ".bmp")
      file2 = drivers.windowing.URLtoFilename(
        url, scrape_info_list[1].outdir, ".bmp")

      comparison_result = compare.Compare(file1, file2,
                                          maskdir=command["--maskdir"])

      if comparison_result is not None:
        result = "not-equal"

        if command["--diffdir"]:
          comparison_result[1].save(
            drivers.windowing.URLtoFilename(url, command["--diffdir"], ".bmp"))

    # TODO(jhaas): maybe use the logging module rather than raw file writes
    log_file.write("%s %s %s %s\n" % (url,
                                      scrape_info_list[0].result,
                                      scrape_info_list[1].result,
                                      result))
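

# An illustrative helper (not part of the tool): ExecuteCompare2 writes one
# space-separated line per URL ("url result1 result2 verdict"), so a log can
# be read back like this. The function name is an assumption.
def _ExampleReadCompare2Log(logfile_path):
  """Yields (url, result1, result2, verdict) tuples from a Compare2 log."""
  for line in open(logfile_path, "r"):
    fields = line.strip().split(" ")
    if len(fields) == 4:
      yield tuple(fields)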


def ExecuteTimeLoad(command):
  """Executes the TimeLoad command."""
  browsers = command["--browsers"].split(",")
  num_browsers = len(browsers)

  if command["--browserversions"]:
    browser_versions = command["--browserversions"].split(",")
  else:
    browser_versions = [None] * num_browsers

  if command["--browserpaths"]:
    browser_paths = command["--browserpaths"].split(",")
  else:
    browser_paths = [None] * num_browsers

  if len(browser_versions) != num_browsers:
    raise ValueError(
      "--browserversions must be same length as --browsers")
  if len(browser_paths) != num_browsers:
    raise ValueError(
      "--browserpaths must be same length as --browsers")

  unknown_browsers = [b for b in browsers
                      if b not in ["chrome", "ie", "firefox"]]
  if unknown_browsers:
    raise ValueError("unknown browsers: %r" % unknown_browsers)

  scraper_list = []

  for b in xrange(num_browsers):
    version = browser_versions[b]
    if not version: version = None

    scraper = scrapers.GetScraper((browsers[b], version))
    if not scraper:
      raise ValueError("could not find scraper for (%r, %r)" %
                       (browsers[b], version))
    scraper_list.append(scraper)

  if command["--url"]:
    url_list = [command["--url"]]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    url_list = [url.strip() for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  log_file = open(command["--logfile"], "w")

  log_file.write("URL")
  for b in xrange(num_browsers):
    log_file.write(",%s" % browsers[b])

    if browser_versions[b]:
      log_file.write(" %s" % browser_versions[b])
  log_file.write("\n")

  results = {}
  for url in url_list:
    results[url] = [None] * num_browsers

  for b in xrange(num_browsers):
    result = scraper_list[b].Time(url_list, command["--size"],
                                  command["--timeout"],
                                  path=browser_paths[b])

    for (url, load_time) in result:
      results[url][b] = load_time

  # Output the results, one comma-separated row per URL
  for url in url_list:
    log_file.write(url)
    for b in xrange(num_browsers):
      log_file.write(",%r" % results[url][b])
    log_file.write("\n")
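

# An illustrative helper (not part of the tool): ExecuteTimeLoad writes a CSV
# with a header row ("URL,<browser> [version],...") followed by one row of
# load times per URL. This sketch reads that file back into a dict; the
# function name is an assumption, and values written as "None" (no result)
# are preserved as None.
def _ExampleReadTimeLoadLog(logfile_path):
  """Returns {url: [load_time_or_None, ...]} parsed from a TimeLoad log."""
  lines = open(logfile_path, "r").readlines()
  results = {}
  for line in lines[1:]:  # skip the header row
    fields = line.strip().split(",")
    results[fields[0]] = [None if f == "None" else float(f)
                          for f in fields[1:]]
  return results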