def AttachToBrowser(path, timeout):
    """Invoke the browser process and connect to the socket.

    Args:
      path: browser location, handed to scraper.GetBrowser
      timeout: timeout set on the socket once it is connected

    Returns:
      None. On success the process, window and connected socket are stored
      as Iterate.proc, Iterate.wnd and Iterate.s.

    Raises:
      ValueError: if no browser window could be obtained, or if the socket
        could not be connected after ten attempts
    """
    (proc, frame, wnd) = scraper.GetBrowser(path)

    if not wnd:
        raise ValueError("Could not invoke browser.")

    # Try to connect the socket. If it fails, wait a second and try again,
    # for up to ten attempts.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP)

    for attempt in xrange(10):
        try:
            s.connect(("localhost", PORT))
        except socket.error:
            time.sleep(1)
            continue
        break

    # getpeername() raises socket.error on an unconnected socket, so it
    # tells us whether any of the attempts above succeeded.
    try:
        s.getpeername()
    except socket.error:
        # Close the socket rather than leaking it when we give up.
        s.close()
        raise ValueError("Could not connect to browser")

    # NOTE(review): "command" is read from module scope here; presumably the
    # parsed command-line options -- confirm against the rest of the file.
    if command["--size"]:
        # Resize and reposition the frame
        windowing.MoveAndSizeWindow(frame, (0, 0), command["--size"], wnd)

    s.settimeout(timeout)

    Iterate.proc = proc
    Iterate.wnd = wnd
    Iterate.s = s
def Scrape(urls, outdir, size, pos, timeout=20, **kwargs):
    """Invoke a browser, send it to a series of URLs, and save its output.

    Args:
      urls: a single URL string or a list of URLs to scrape
      outdir: directory to place output
      size: size of browser window to use
      pos: position of browser window
      timeout: amount of time to wait for page to load
      kwargs: miscellaneous keyword args. A truthy "path" overrides the
        built-in browser location. "filename" may be a callable taking the
        URL and returning the output filename, or a fixed filename.

    Returns:
      None if success, else an error string ("timeout")
    """
    path = r"c:\program files\internet explorer\iexplore.exe"

    if "path" in kwargs and kwargs["path"]:
        path = kwargs["path"]

    (iewnd, ieproc, address_bar, render_pane, tab_window) = (
        InvokeBrowser(path))

    timedout = False

    # Everything after the browser is launched runs under try/finally so the
    # browser process is ended even if scraping or saving raises.
    try:
        # Resize and reposition the frame
        windowing.MoveAndSizeWindow(iewnd, pos, size, render_pane)

        # Visit each URL we're given
        if isinstance(urls, types.StringTypes):
            urls = [urls]

        for url in urls:
            # Double-click in the address bar, type the name, and press Enter
            mouse.DoubleClickInWindow(address_bar)
            keyboard.TypeString(url)
            keyboard.TypeString("\n")

            # Wait for the page to finish loading; a negative result is
            # treated as a timeout.
            # NOTE(review): (6, 8, 22, 24) is presumably the throbber's
            # rectangle inside the tab window -- confirm against
            # windowing.WaitForThrobber.
            load_time = windowing.WaitForThrobber(
                tab_window, (6, 8, 22, 24), timeout)
            timedout = load_time < 0

            if timedout:
                break

            # Scrape the page
            image = windowing.ScrapeWindow(render_pane)

            # Save to disk
            if "filename" in kwargs:
                if callable(kwargs["filename"]):
                    filename = kwargs["filename"](url)
                else:
                    filename = kwargs["filename"]
            else:
                filename = windowing.URLtoFilename(url, outdir, ".bmp")
            image.save(filename)
    finally:
        windowing.EndProcess(ieproc)

    if timedout:
        return "timeout"
    return None
def Time(urls, size, timeout, **kwargs):
    """Measure how long the browser takes to load each of a series of URLs.

    The browser is launched lazily and reused for consecutive URLs; it is
    closed (and relaunched for the next URL) after a timeout or an error.

    Args:
      urls: a single URL string or a list of URLs to time
      size: size of browser window to use
      timeout: amount of time to wait for a page to load, and for the
        browser process to exit after it is asked to close
      kwargs: miscellaneous keyword args; a truthy "path" overrides
        DEFAULT_PATH as the browser location

    Returns:
      A list of tuples (url, time). "time" is the value returned by
      windowing.WaitForThrobber, or the string "crashed" or "timeout".
    """
    browser_path = kwargs.get("path") or DEFAULT_PATH
    proc = None

    # Accept a bare string as a one-element list
    if type(urls) in types.StringTypes:
        urls = [urls]

    results = []
    for url in urls:
        try:
            if not proc:
                # No live browser: start one, then size and place its frame
                (wnd, proc, address_bar, render_pane, tab_window) = (
                    InvokeBrowser(browser_path))
                windowing.MoveAndSizeWindow(wnd, (0, 0), size, render_pane)

            # Select the address bar contents, type the URL, press Enter
            mouse.DoubleClickInWindow(address_bar)
            keyboard.TypeString(url)
            keyboard.TypeString("\n")

            # A negative result from the throbber wait means the page did
            # not finish loading in time.
            load_time = windowing.WaitForThrobber(
                tab_window, (6, 8, 22, 24), timeout)

            if load_time < 0:
                load_time = "timeout"

                # Ask the browser to close with Alt-F4; if it does not exit
                # in time, kill it and report a crash instead.
                keyboard.TypeString(r"{\4}", use_modifiers=True)
                exited = windowing.WaitForProcessExit(proc, timeout)
                if not exited:
                    windowing.EndProcess(proc)
                    load_time = "crashed"

                proc = None
        except pywintypes.error:
            proc = None
            load_time = "crashed"

        results.append((url, load_time))

    # Shut down a browser that is still alive after the last URL
    if proc:
        keyboard.TypeString(r"{\4}", use_modifiers=True)
        if not windowing.WaitForProcessExit(proc, timeout):
            windowing.EndProcess(proc)

    return results
def Scrape(urls, outdir, size, pos, timeout=20, **kwargs):
    """Invoke a browser, send it to a series of URLs, and save its output.

    Args:
      urls: a single URL string or a list of URLs to scrape
      outdir: directory to place output
      size: size of browser window to use
      pos: position of browser window
      timeout: amount of time to wait for page to load
      kwargs: miscellaneous keyword args. A truthy "path" overrides
        DEFAULT_PATH. "filename" may be a callable taking the URL and
        returning the output filename, or a fixed filename.

    Returns:
      None if success, else an error string ("timeout")
    """
    # Upper bound on close keystrokes (one second apart) before the browser
    # process is ended forcibly instead of waiting forever.
    max_close_attempts = 30

    if "path" in kwargs and kwargs["path"]:
        path = kwargs["path"]
    else:
        path = DEFAULT_PATH

    (wnd, proc, render_pane) = InvokeBrowser(path)

    # Resize and reposition the frame
    windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

    time.sleep(3)

    # Firefox is a bit of a pain: it doesn't use standard edit controls,
    # and it doesn't display a throbber when there's no tab. Let's make
    # sure there's at least one tab, then select the first one
    # NOTE(review): "[t]" with modifiers is presumably the new-tab shortcut
    # and (30, 115) the first tab's position -- confirm against keyboard.py.
    mouse.ClickInWindow(wnd)
    keyboard.TypeString("[t]", True)
    mouse.ClickInWindow(wnd, (30, 115))
    time.sleep(2)

    timedout = False

    # Visit each URL we're given
    if isinstance(urls, types.StringTypes):
        urls = [urls]

    for url in urls:
        # Use keyboard shortcuts
        # NOTE(review): "{d}" with modifiers presumably focuses the location
        # bar -- confirm against keyboard.py.
        keyboard.TypeString("{d}", True)
        keyboard.TypeString(url)
        keyboard.TypeString("\n")

        # Wait for the page to finish loading; a negative result is treated
        # as a timeout.
        load_time = windowing.WaitForThrobber(wnd, (10, 96, 26, 112), timeout)
        timedout = load_time < 0

        if timedout:
            break

        # Scrape the page
        image = windowing.ScrapeWindow(render_pane)

        # Save to disk
        if "filename" in kwargs:
            if callable(kwargs["filename"]):
                filename = kwargs["filename"](url)
            else:
                filename = kwargs["filename"]
        else:
            filename = windowing.URLtoFilename(url, outdir, ".bmp")
        image.save(filename)

    # Close all the tabs, cheesily. The loop is bounded so that a window
    # which never goes away cannot hang the run.
    mouse.ClickInWindow(wnd)

    close_attempts = 0
    while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
           and close_attempts < max_close_attempts):
        keyboard.TypeString("[w]", True)
        time.sleep(1)
        close_attempts += 1

    # Still there after all attempts: end the process we launched.
    if len(windowing.FindChildWindows(0, "MozillaUIWindowClass")):
        windowing.EndProcess(proc)

    if timedout:
        return "timeout"
    return None
def Time(urls, size, timeout, **kwargs):
    """Measure how long the browser takes to load each of a series of URLs.

    The browser is launched lazily and reused for consecutive URLs; it is
    closed (and relaunched for the next URL) after a timeout or an error.

    Args:
      urls: a single URL string or a list of URLs to time
      size: size of browser window to use
      timeout: amount of time to wait for a page to load
      kwargs: miscellaneous keyword args; a truthy "path" overrides
        DEFAULT_PATH as the browser location

    Returns:
      A list of tuples (url, time). "time" is the value returned by
      windowing.WaitForThrobber, or the string "crashed" or "timeout".
    """
    browser_path = kwargs.get("path") or DEFAULT_PATH
    proc = None

    # Accept a bare string as a one-element list
    if type(urls) in types.StringTypes:
        urls = [urls]

    timings = []
    for url in urls:
        try:
            if not proc:
                # No live browser: start one, then size and place its frame
                (wnd, proc, render_pane) = InvokeBrowser(browser_path)
                windowing.MoveAndSizeWindow(wnd, (0, 0), size, render_pane)

                time.sleep(3)

                # Firefox has no standard edit controls and shows no
                # throbber without a tab, so make sure a tab exists and
                # then select the first one.
                mouse.ClickInWindow(wnd)
                keyboard.TypeString("[t]", True)
                mouse.ClickInWindow(wnd, (30, 115))
                time.sleep(2)

            # Drive navigation through keyboard shortcuts
            keyboard.TypeString("{d}", True)
            keyboard.TypeString(url)
            keyboard.TypeString("\n")

            # A negative result from the throbber wait means the page did
            # not finish loading in time.
            load_time = windowing.WaitForThrobber(
                wnd, (10, 96, 26, 112), timeout)

            if load_time < 0:
                load_time = "timeout"

                # Try to close the browser, one keystroke per second for at
                # most five tries; if windows remain it's probably a crash.
                mouse.ClickInWindow(wnd)

                tries = 0
                while (len(windowing.FindChildWindows(
                        0, "MozillaUIWindowClass")) and tries < 5):
                    keyboard.TypeString("[w]", True)
                    time.sleep(1)
                    tries += 1

                leftover = windowing.FindChildWindows(
                    0, "MozillaUIWindowClass")
                if len(leftover):
                    windowing.EndProcess(proc)
                    load_time = "crashed"

                proc = None
        except pywintypes.error:
            load_time = "crashed"
            proc = None

        timings.append((url, load_time))

    # Close a browser that is still alive after the last URL. Unlike the
    # timeout path above, this does not end the process if windows remain.
    if proc:
        tries = 0
        while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
               and tries < 5):
            keyboard.TypeString("[w]", True)
            time.sleep(1)
            tries += 1

    return timings
def Scrape(urls, outdir, size, pos, timeout, kwargs):
    """Drive a browser through a series of URLs and save an image of each.

    Args:
      urls: a single URL string or a list of URLs to scrape
      outdir: directory to place output
      size: size of browser window to use
      pos: position of browser window
      timeout: amount of time to wait for a page to load, and for the
        browser process to exit once it is asked to close
      kwargs: miscellaneous keyword args, passed as a single mapping.
        A truthy "path" overrides DEFAULT_PATH. "filename" may be a
        callable taking the URL and returning the output filename, or a
        fixed filename.

    Returns:
      None if success, else an error string ("crashed" or "timeout")
    """
    has_custom_path = "path" in kwargs and kwargs["path"]
    browser_path = kwargs["path"] if has_custom_path else DEFAULT_PATH

    (wnd, proc, address_bar, render_pane) = InvokeBrowser(browser_path)

    # Size and place the browser frame
    windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

    # Accept a bare string as a one-element list
    if type(urls) in types.StringTypes:
        urls = [urls]

    timedout = False

    for url in urls:
        # Click in the address bar, type the URL, and press Enter
        mouse.ClickInWindow(address_bar)
        keyboard.TypeString(url, 0.1)
        keyboard.TypeString("\n")

        # A negative result from the throbber wait means the page did not
        # finish loading in time; stop visiting further URLs.
        load_time = windowing.WaitForThrobber(wnd, (20, 16, 36, 32), timeout)
        timedout = load_time < 0
        if timedout:
            break

        # Grab the rendered page
        image = windowing.ScrapeWindow(render_pane)

        # Pick the output filename, then write the image to disk
        if "filename" not in kwargs:
            filename = windowing.URLtoFilename(url, outdir, ".bmp")
        elif callable(kwargs["filename"]):
            filename = kwargs["filename"](url)
        else:
            filename = kwargs["filename"]
        image.save(filename)

    if proc:
        windowing.SetForegroundWindow(wnd)

        # Send Alt-F4, then wait for the process to end; kill it and report
        # a crash if it does not exit in time.
        keyboard.TypeString(r"{\4}", use_modifiers=True)
        exited = windowing.WaitForProcessExit(proc, timeout)
        if not exited:
            windowing.EndProcess(proc)
            return "crashed"

    return "timeout" if timedout else None