def grab_url_screenshot(url):
    """
    Grab a URL by taking a screenshot of it.

    The screenshot file name is the SHA256 of the url.

    :param url: address to capture; "http://" is assumed when no scheme is given
    :return: public URL of the generated thumbnail, or None on failure
    """
    ret = None
    try:
        # Normalise the url (assume http:// when the scheme is missing)
        url_res = urlparse(url)
        if not url_res.scheme:
            url = "http://" + url
        # TODO: could Ghost be a singleton?
        ghost = Ghost()
        page, res = ghost.open(url)
        if page is not None and page.http_status == 200:
            # BUG FIX: hashlib requires bytes on Python 3; encoding is a
            # no-op for ASCII urls on Python 2, so this is compatible both ways.
            url_sha256 = hashlib.sha256(url.encode("utf-8")).hexdigest()
            image_path = os.path.join('url_previews', url_sha256 + ".png")
            full_path = os.path.join(settings.MEDIA_ROOT, image_path)
            ghost.capture_to(full_path)
            # Build the thumbnail next to the full capture.
            image_path = image_path.replace(".png", ".thumb.png")
            thumb_full_path = os.path.join(settings.MEDIA_ROOT, image_path)
            resize_and_crop(full_path, thumb_full_path, (550, 500))
            ret = urljoin(settings.BASE_URL, "uploads/" + image_path)
        else:
            logger.error("Failed to capture screenshot for {0}".format(url))
    except Exception as e:
        # BUG FIX: `except Exception, e` is Python-2-only syntax.
        logger.exception(e)
def main(argv):
    """Render argv[1] with Ghost, screenshot it, then feed every fetched
    resource URL to wget to build a WARC archive.

    :param argv: sys.argv-style list; argv[1] is the target URL.
    :return: 1 on usage error, otherwise None.
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <url>\n" % (argv[0], ))
        return 1
    target = argv[1]
    warc_name = "flashfrozen"
    print("Starting Ghost.py...")
    ghost = Ghost(viewport_size=(1280, 1024), wait_timeout=100)
    # ghost = Ghost(viewport_size=(1280, 1024), display=True)
    # ghost.webview.getSettings().setPluginsEnabled(true);
    print("Loading page:", target)
    page, resources = ghost.open(target)
    #time.sleep(2)
    print("Taking screenshot...")
    ghost.capture_to('original-screenshot.png')
    print("Shutting down Ghost.py...")
    ghost.exit()
    # Extract a list of resource URLs collected during the render.
    print("Extracting URLs...")
    urls = set()
    for r in resources:
        pprint.pprint(r.url)
        print(dir(r.url))
        urls.add(str(r.url))
    # Make sure the page itself is archived even if it redirected.
    if target not in urls:
        urls.add(target)
    # Open pipe to the wget process; "-i -" makes wget read URLs from stdin.
    print("Passing URLs to wget...")
    process = subprocess.Popen([
        "wget", "-q", "-i", "-", "-O", "-",
        "--warc-file={}".format(warc_name)
    ], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # Also open a file to hold a note of the URLs.
    urlf = open('original-urls.txt', 'w')
    # Pass in the URLs, via STDIN:
    for u in urls:
        print("GOT", u)
        urlf.write("{}\n".format(u))
        #process.stdin.write(bytes("{}\n".format(u),'UTF-8'))
        # NOTE(review): writes str to the pipe — fine on Python 2; Python 3
        # would need the bytes variant above (or universal_newlines=True).
        process.stdin.write("{}\n".format(u))
        #process.stdin.write(u)
    # Close the URLs file:
    urlf.close()
    # Close STDIN so wget knows there are no more URLs coming:
    process.stdin.flush()
    process.stdin.close()
    # This explicitly churns through and ignores STDOUT; draining keeps
    # wget from blocking on a full stdout pipe.
    print("Waiting for wget output...")
    for line in process.stdout:
        pass
    # Wait for the process to finish:
    print("Waiting for wget to finish...")
    process.wait()
    print("Done.")
def create_screenshot(url, filename, size):
    """Render *url* at the given viewport size and save the '#map' element.

    :param url: page to load
    :param filename: destination image path
    :param size: (width, height) tuple for the viewport
    """
    width, height = size
    browser = Ghost().start()
    browser.set_viewport_size(width, height)
    browser.open(url)
    # Give the page up to 40 s for any alert and for the load to finish.
    browser.wait_for_alert(timeout=40)
    browser.wait_for_page_loaded(timeout=40)
    browser.capture_to(filename, selector='#map')
def index(request):
    """Django view: screenshot the requested ``url`` (POST or GET parameter)
    full-page, thumbnail it to 128x128, and return it as a JPEG response.

    :param request: Django HttpRequest
    :return: HttpResponse containing the JPEG thumbnail
    """
    dir_name = os.path.dirname(__file__)
    img_name = os.path.join(dir_name, "pic.jpg")
    if request.method == "POST":
        url = request.POST.get("url", "")
    else:
        url = request.GET.get("url", "")
    if not url:
        url = "http://www.bbc.uk.com"
    display = Display()
    display.start()
    # BUG FIX: the virtual display leaked whenever anything below raised;
    # ensure it is always stopped.
    try:
        # First pass: measure the document so the second pass can use a
        # viewport that captures the whole page.
        ghost = Ghost()
        ghost.open(url)
        width = int(ghost.evaluate("document.body.clientWidth")[0])
        height = int(ghost.evaluate("document.body.clientHeight")[0])
        ghost = Ghost(viewport_size=(width, height))
        ghost.open(url)
        ghost.capture_to(img_name, selector="body")
        image = Image.open(img_name)
        image.thumbnail((128, 128), Image.ANTIALIAS)
        response = HttpResponse(mimetype="image/jpeg")
        image.save(response, "jpeg")
    finally:
        display.stop()
    return response
def main(argv):
    """Archive a page: render argv[1] with Ghost, capture a screenshot, then
    pipe all discovered resource URLs into wget to write a WARC file.

    :param argv: sys.argv-style list; argv[1] is the target URL.
    :return: 1 on usage error, otherwise None.
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <url>\n" % (argv[0],))
        return 1
    target = argv[1]
    warc_name = "flashfrozen"
    print("Starting Ghost.py...")
    ghost = Ghost(viewport_size=(1280, 1024), wait_timeout=100)
    # ghost = Ghost(viewport_size=(1280, 1024), display=True)
    # ghost.webview.getSettings().setPluginsEnabled(true);
    print("Loading page:", target)
    page, resources = ghost.open(target)
    #time.sleep(2)
    print("Taking screenshot...")
    ghost.capture_to('original-screenshot.png')
    print("Shutting down Ghost.py...")
    ghost.exit()
    # Collect every resource URL the render touched.
    print("Extracting URLs...")
    urls = set()
    for r in resources:
        pprint.pprint(r.url)
        print(dir(r.url))
        urls.add(str(r.url))
    if target not in urls:
        urls.add(target)
    # wget reads the URL list from stdin ("-i -") and records a WARC.
    print("Passing URLs to wget...")
    process = subprocess.Popen(
        ["wget", "-q", "-i", "-", "-O", "-",
         "--warc-file={}".format(warc_name)],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # Also open a file to hold a note of the URLs.
    urlf = open('original-urls.txt', 'w')
    # Pass in the URLs, via STDIN:
    for u in urls:
        print("GOT", u)
        urlf.write("{}\n".format(u))
        #process.stdin.write(bytes("{}\n".format(u),'UTF-8'))
        # NOTE(review): str written to the pipe — Python 2 semantics; the
        # commented bytes form above is the Python 3 equivalent.
        process.stdin.write("{}\n".format(u))
        #process.stdin.write(u)
    # Close the URLs file:
    urlf.close()
    # Close STDIN so wget knows there are no more URLs coming:
    process.stdin.flush()
    process.stdin.close()
    # Drain and discard wget's stdout so it cannot block on a full pipe.
    print("Waiting for wget output...")
    for line in process.stdout:
        pass
    # Wait for the process to finish:
    print("Waiting for wget to finish...")
    process.wait()
    print("Done.")
class Browser():
    """Small convenience wrapper that keeps one Ghost instance around for
    taking element screenshots."""

    def __init__(self, width=1024, height=800):
        # Size the shared viewport once, at construction time.
        ghost_instance = Ghost()
        ghost_instance.set_viewport_size(width, height)
        self.browser = ghost_instance

    def take_screenshot(self, url, path_to_picture):
        """Open *url* and save the '.noncollapsed' element to *path_to_picture*."""
        self.browser.open(url)
        self.browser.capture_to(path_to_picture, selector='.noncollapsed')
def get_disqus_comments_by_ghost(dictionary):
    '''Uses Ghost to trigger url for iframe, then requests to fetch that html,
    and finally get the json from failed-page from disqus..

    :param dictionary: mapping with at least a 'url' key for the article page
    :return: list of Disqus post dicts, or -9999 when extraction fails
    '''
    ghost = Ghost(
        user_agent=
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        viewport_size=(1349, 765), log_level=logging.ERROR)
    # user_agent='Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7', viewport_size = (320, 480)
    page, resources = ghost.open(dictionary['url'])
    assert page.http_status == 200  # make sure we get data back..
    ghost.wait_for_page_loaded()  # probably does no harm..
    # comment loaded on scroll hack cred goes to Hammer et al. (2013)
    secs = 0.50
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 700);")
    ghost.capture_to(
        'scroll_before.png'
    )  # do not get why this fails if i remove this image-capture function...
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 1400);")
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 2100);")
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 4000);")
    time.sleep(secs)
    ghost.wait_for_page_loaded()
    #ghost.capture_to('scroll_after.png')
    logger.info("waiting for selector IFRAME")
    ghost.wait_for_selector("iframe")  ##post-list
    # print ghost.content
    soup = BeautifulSoup(ghost.content)
    try:
        comments_iframe_url = soup.select("iframe#dsq-2")[0][
            'src']  # only one of these...
        # headers = {
        #     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        #     'X-UA-Compatible': 'IE=edge'
        # }
        comments_html = requests.get(comments_iframe_url)  # , headers=headers
        #print comments_html.text
        iframe_soup = BeautifulSoup(comments_html.text)
        posts = iframe_soup.select("#disqus-threadData")
        data = json.loads(posts[0].text)
        #print type(data)
        return data['response']['thread']['posts']
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt. The -9999 sentinel is kept for existing callers.
        return -9999
def main(argv):
    """Open argv[1] in Ghost, sanity-check the response, screenshot it, and
    list every resource URL the page pulled in."""
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <url>\n" % (argv[0],))
        return 1
    browser = Ghost(viewport_size=(1280, 1024))
    page, resources = browser.open(argv[1])
    # Guard: 200 response and the expected marker text in the page body.
    assert page.http_status == 200 and 'bbc' in browser.content
    browser.capture_to('screenshot2.png')
    for resource in resources:
        print(resource.url)
def screenshot(url,target): ghost = Ghost(wait_timeout=4) print "Do u want to provide any credentials \n" choice=raw_input() if choice.lower()=='y': print colored("[-] Enter Username and Password ",'green') username=raw_input() password=raw_input() ghost.open(url,auth=(username,password)) ghost.capture_to(str(time.time())+'.png') os.system('mv *.png ./screenshots') print colored("[-] Screenshot Succesfull catpured and Saved @ %s/screen.png"%(os.getcwd()),'green') else: pass
def site_capture():
    """Screenshot the page at sys.argv[1] into the file at sys.argv[2].

    Prints 'success' or 'fail'; never raises.
    """
    print(sys.argv[1])
    print(sys.argv[2])
    try:
        g = Ghost()
        g.open(sys.argv[1])
        g.capture_to(sys.argv[2])
        g.exit()
        print('success')
    except Exception:
        # BUG FIX: `except Exception, e` is Python-2-only syntax; the bound
        # exception and the trailing `pass` were unused anyway.
        print('fail')
def render_notext(url):
    """Render *url* and save a 1280x700 capture named after the site.

    :param url: page to render
    :return: the site name used for the screenshot file
    """
    sitename = get_sitename(url)
    ghost = Ghost()
    ghost.open(url, headers={
        "Accept": "image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
    })
    ghost.wait_for_page_loaded()
    #time.sleep(10)
    # Capture only the top 1280x700 region of the page.
    ghost.capture_to(os.path.join(path, sitename + '.png'), (0, 0, 1280, 700))
    return sitename
    # BUG FIX: removed an unreachable `print "success"` that followed the
    # return statement.
def ghostCapture(screen_url, web_timeout=20):
    """Capture *screen_url* into a temporary PNG and return its path.

    NOTE(review): relies on module-level x1, y1, x2, y2 and timeout —
    presumably the capture region and a settle delay; confirm at call sites.
    """
    import time
    browser = Ghost(display=":99", wait_timeout=int(web_timeout),
                    viewport_size=(x2, y2), ignore_ssl_errors=True,
                    log_level=logging.FATAL)
    ghost_page, resources = browser.open(screen_url, auth=('none', 'none'))
    shot_path = mkstemp(suffix=".png")[1]
    browser.set_viewport_size(x2, y2)
    # Let the page settle before grabbing pixels.
    time.sleep(timeout)
    browser.capture_to(shot_path, region=(x1, y1, x2, y2))
    return shot_path
def make_pages(pn, c):
    """Render the markup for page *pn* (copy *c*) into the app cache as a JPEG."""
    #global ghost
    renderer = Ghost(download_images=True)
    #time.sleep( 1 )
    cache_dir = app_root + '/cache'
    out_name = "page_" + pn + "_" + str(c) + ".jpg"
    markup = get_markup(pn, True)
    # Load the HTML string directly instead of fetching a URL.
    renderer.main_frame.setHtml(markup)
    renderer.wait_for_page_loaded()
    renderer.capture_to(cache_dir + '/' + out_name)
def main(argv):
    """CLI: render a URL with Ghost and optionally screenshot it, save the
    resource URL list, and/or archive everything to a WARC via wget.

    :param argv: sys.argv-style list.
    """
    # NOTE(review): the description string below is leftover argparse
    # boilerplate and does not match what this tool does.
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('url', help='URL to render and download')
    parser.add_argument('-d', action='store_true', help='Download WARC, save urls, and save screenshot to default names')
    parser.add_argument('--warc', help='Uses wget to save the page to a WARC file')
    parser.add_argument('--save-urls', type=argparse.FileType('w'), help='Save the urls to a file but do not run wget')
    parser.add_argument('--screenshot', help='Save a screenshot of the website')
    parser.add_argument('--display', action='store_true', help='Display the webkit window while downloading')
    args = parser.parse_args(argv[1:])
    target = args.url
    if args.d is True:
        # -d shorthand: enable all three outputs with default file names.
        args.screenshot = 'original-screenshot.png'
        args.warc = 'flashfrozen'
        # NOTE(review): binary mode + str writes — Python 2 semantics.
        args.save_urls = open('original-urls.txt', 'wb')
    print "Starting Ghost.py..."
    ghost = Ghost(viewport_size=(1280, 1024), display=args.display, wait_timeout=90)
    # ghost.webview.getSettings().setPluginsEnabled(true);
    print "Loading page:", target
    page, resources = ghost.open(target)
    #time.sleep(2)
    if args.screenshot is not None:
        print "Taking screenshot..."
        ghost.capture_to(args.screenshot)
    print "Shutting down Ghost.py..."
    ghost.exit()
    # Every resource the render touched, plus the page itself.
    urls = urls_from_ghost_resources(resources)
    urls.add(target)
    if args.save_urls is not None:
        for u in urls:
            args.save_urls.write("{}\n".format(u))
        args.save_urls.close()
    if args.warc is not None:
        print "Passing URLs to wget..."
        run_wget(urls, args.warc)
    print "Done."
class Shotter(object):
    """Takes website screenshots, preferring Ghost and falling back to the
    external ``cutycapt`` tool when Ghost is unavailable."""

    def __init__(self,):
        try:
            #self.ghost = Ghost(wait_timeout=10)
            self.ghost = Ghost()
        except (ImportError, NameError):
            # BUG FIX: a missing ghost package surfaces here as NameError
            # (the name never got bound at import time), not ImportError;
            # catch both so the cutycapt fallback actually engages.
            self.ghost = None

    def cuty_shot(self, url, filename, x11=True, width=1024, height=768, colorbits=24):
        """Screenshot *url* to *filename* with cutycapt; returns True on success.

        When x11 is False the command is wrapped in xvfb-run with a virtual
        screen of the given geometry.
        """
        #TODO: add check if cutycapt installed
        # SECURITY(review): url/filename are interpolated into a shell=True
        # command; a crafted url can inject shell commands. Needs escaping
        # or an argv-list Popen before use on untrusted input.
        if x11:
            cmd = 'cutycapt --url="%s" --out=%s' % (url, filename)
        else:
            cmd = 'xvfb-run --server-args="-screen 0, %ix%ix%i" cutycapt --url="%s" --out=%s' % \
                  (url, filename, width, height, colorbits)
        try:
            Popen(cmd, shell=True).wait()
            return True
        except Exception:
            # BUG FIX: narrowed from a bare `except:`.
            return False

    def ghost_shot(self, url, filename, ignore_errors=True):
        """Screenshot *url* via Ghost; returns True when a capture was written.

        With ignore_errors the capture happens regardless of HTTP status;
        otherwise only for a non-empty 200 response.
        """
        #print('ghost_shot(%s)' % url)
        try:
            page, resources = self.ghost.open(url)
            if ignore_errors:
                self.ghost.capture_to(filename)
                return True
            elif page.http_status == 200 and page.totalBytes() != 0:
                self.ghost.capture_to(filename)
                return True
            else:
                return False
        except Exception:
            # BUG FIX: narrowed from a bare `except:` so Ctrl-C still works.
            print(exc_info())
            return False

    def screenshot(self, url, filename, overwrite=False):
        """Screenshot *url* to *filename*, skipping existing files unless
        overwrite is set; dispatches to Ghost or the cutycapt fallback."""
        if path.exists(filename) and not overwrite:
            print('%s exists, skipping' % filename)
            return
        print('[SCREENSHOT] %s -> %s' % (url, filename))
        if self.ghost is not None:
            self.ghost_shot(url, filename)
        else:
            self.cuty_shot(url, filename)
def screenshot(url, target): ghost = Ghost(wait_timeout=4) print "Do u want to provide any credentials \n" choice = raw_input() if choice.lower() == 'y': print colored("[-] Enter Username and Password ", 'green') username = raw_input() password = raw_input() ghost.open(url, auth=(username, password)) ghost.capture_to(str(time.time()) + '.png') os.system('mv *.png ./screenshots') print colored( "[-] Screenshot Succesfull catpured and Saved @ %s/screen.png" % (os.getcwd()), 'green') else: pass
def snap(conn, url, cookie_name, cookie_value, width, height, loaded, hides, selector):
    """Handle all the work of taking the page snapshot.

    The first parameter is a connection object for a `multiprocessing.Pipe`
    that we use to send back the file name written to. The remaining
    parameters are `multiprocessing.Value`s.
    """
    ghost = None
    try:
        ghost = Ghost(viewport_size=(width.value, height.value))
        ghost.wait_timeout = 20
        headers = {}
        if cookie_name.value and cookie_value.value:
            headers = {
                'Cookie': str('%s=%s' % (cookie_name.value, cookie_value.value)),
            }
        ghost.open(url.value, headers=headers)
        if loaded.value:
            try:
                ghost.wait_for_selector('%s' % loaded.value)
            except Exception:
                # if the selector never appears, we don't care
                pass
        # BUG FIX: ''.split(',') == [''] is truthy, so the old
        # `if len(selectors):` check injected a `$('').hide()` script even
        # when no hide selectors were supplied. Only inject when asked to.
        if hides.value:
            selectors = hides.value.split(',')
            hide_js = r'''
                if (jQuery) {
                    $(document).ready(function() {
                        %s
                    });
                }
            ''' % '\n'.join([r"$('%s').hide();" % sel for sel in selectors])
            ghost.evaluate(hide_js)
        handle, file_path = mkstemp(prefix='ansel_snap', suffix='.png')
        ghost.capture_to(file_path, selector=selector.value)
        # Report the written file back to the parent process.
        conn.send(file_path)
    finally:
        del ghost
        conn.close()
def get_disqus_comments_by_ghost(dictionary):
    '''Uses Ghost to trigger url for iframe, then requests to fetch that html,
    and finally get the json from failed-page from disqus..

    :param dictionary: mapping with at least a 'url' key for the article page
    :return: list of Disqus post dicts, or -9999 when extraction fails
    '''
    ghost = Ghost(user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
                  viewport_size=(1349, 765), log_level=logging.ERROR)
    # user_agent='Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7', viewport_size = (320, 480)
    page, resources = ghost.open(dictionary['url'])
    assert page.http_status == 200  # make sure we get data back..
    ghost.wait_for_page_loaded()  # probably does no harm..
    # comment loaded on scroll hack cred goes to Hammer et al. (2013)
    secs = 0.50
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 700);")
    ghost.capture_to('scroll_before.png')  # do not get why this fails if i remove this image-capture function...
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 1400);")
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 2100);")
    time.sleep(secs)
    ghost.evaluate("window.scroll(0, 4000);")
    time.sleep(secs)
    ghost.wait_for_page_loaded()
    #ghost.capture_to('scroll_after.png')
    logger.info("waiting for selector IFRAME")
    ghost.wait_for_selector("iframe")  ##post-list
    # print ghost.content
    soup = BeautifulSoup(ghost.content)
    try:
        comments_iframe_url = soup.select("iframe#dsq-2")[0]['src']  # only one of these...
        # headers = {
        #     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        #     'X-UA-Compatible': 'IE=edge'
        # }
        comments_html = requests.get(comments_iframe_url)  # , headers=headers
        #print comments_html.text
        iframe_soup = BeautifulSoup(comments_html.text)
        posts = iframe_soup.select("#disqus-threadData")
        data = json.loads(posts[0].text)
        #print type(data)
        return data['response']['thread']['posts']
    except Exception:
        # BUG FIX: narrowed from a bare `except:` (which also trapped
        # KeyboardInterrupt/SystemExit); -9999 sentinel kept for callers.
        return -9999
def heroku(api_key, year, month):
    """Log in to Heroku with *api_key* and screenshot the invoice for
    *year*/*month*; returns the path to the PNG written under /tmp."""
    file_path = '/tmp/heroku_invoice_%d-%d.png' % (year, month)
    # Always regenerate: drop any stale capture first.
    if os.path.exists(file_path):
        os.remove(file_path)
    # TODO: make dimensions configurable? Automatic (is that possible?)?
    ghost = Ghost(viewport_size=(1000, 1600))
    ghost.wait_timeout = 20
    ghost.open('https://id.heroku.com/login')
    # Empty email + API key as password is the login form input used here.
    ghost.fill("form", dict(email="", password=api_key))
    ghost.fire_on("form", "submit", expect_loading=True)
    ghost.open('https://dashboard.heroku.com/invoices/%s/%s' % (year, month))
    ghost.capture_to(file_path)
    return file_path
def screenshot(url,creds=False): """ Grab Screenshot of Web Interface """ try: ghost = Ghost(wait_timeout=4) if creds: ghost.open(url,auth=(creds.split(':')[0],creds.split(':')[1])) ghost.capture_to(str(time.time())+'.png') os.system('mv *.png ./screenshots') print colored("\n[+] Screenshot Succesfull Catpured and Saved @ %s/screen.png"%(os.getcwd()),'green') else: ghost.open(url) ghost.capture_to('screen'+str(time.time())+'.png') os.system('mv *.png ./screenshots') print colored("\n[+] Screenshot Succesfull Catpured and Saved @ %s/screen.png"%(os.getcwd()),'green') except Exception as e: print colored("\n[-] Screenshot failed Error : "+str(e),'red')
class WxGhost(object):
    """Ghost wrapper for scraping WeChat article pages, with cookie
    persistence and manual-captcha handling when rate limited."""

    def __init__(self):
        self.ghost = Ghost(log_level=logging.CRITICAL).start()
        self.ghost.download_images = False
        try:
            # Reuse a previous session's cookies when available.
            self.ghost.load_cookies("cookie.txt")
            print 'load cookie'
        except IOError:
            print 'load cookie error'
        self.ghost.show()

    def handle_frequency(self):
        # The marker string means "your visits are too frequent" — the
        # rate-limit/captcha interstitial.
        if u"您的访问过于频繁" in self.ghost.content:
            print 'frequency'
            self.ghost.show()
            self.ghost.capture_to("seccode.png", selector="#seccodeImage")
            # Block (up to 30 min) until a human solves the captcha; the
            # marker text below appears once the real page is back.
            # (Original comment: "enter the captcha".)
            self.ghost.wait_for_text(u'以下内容来自微信公众号', timeout=1800)
            # Persist the now-validated session for next time.
            self.ghost.save_cookies("cookie.txt")

    def open(self, url):
        # Open *url*, transparently handling the rate-limit page.
        # Returns False on timeout, True otherwise.
        try:
            self.ghost.open(url)
            self.handle_frequency()
        except TimeoutError:
            print 'timeout when open'
            return False
        return True

    def evaluate(self, js, expect_loading=True):
        # Run *js* in the page, handling the rate-limit page afterwards.
        # Returns False on timeout, True otherwise.
        try:
            self.ghost.evaluate(js, expect_loading=expect_loading)
            self.handle_frequency()
        except TimeoutError:
            return False
        return True

    def sleep(self, value):
        self.ghost.sleep(value)

    def get_lxml(self):
        # Current page content parsed into an lxml document.
        return lxml.html.fromstring(self.ghost.content)
class IDirectBroker(object):
    """Automates login to the ICICI Direct trading site via Ghost, capturing
    screenshots of each step (l1/l2/l3.png) for debugging."""

    IDIRECTURL = 'https://secure.icicidirect.com/Trading/LBS/Logon.asp'

    def __init__(self, username, password, **kwargs):
        self.username = username
        self.password = password
        self.ghost = Ghost()
        try:
            self.page, self.resources = self.ghost.open(self.IDIRECTURL)
            self.ghost.wait_for_page_loaded()
            self.ghost.capture_to("./l1.png")
            # BUG FIX: credentials were hard-coded literals, silently
            # ignoring (and leaking) the constructor arguments.
            result, resources = self.ghost.fill("form", {
                "FML_USR_ID": self.username,
                "FML_USR_USR_PSSWRD": self.password,
                # TODO: date of birth is still hard-coded — parameterize.
                "FML_USR_DT_BRTH": "22101982"
            })
            self.ghost.capture_to("./l2.png")
            self.page, self.resources = self.ghost.fire("form", "submit", expect_loading=True)
            #self.ghost.wait_for_page_loaded()
            self.ghost.capture_to("./l3.png")
        except Exception:
            # Re-raise as-is (bare raise keeps the original traceback,
            # unlike the old Python-2 `raise e`).
            raise
def RunExport():
    # Scrape a PayPal account's transaction history with Ghost: log in, page
    # through the filtered history listing, and save a PNG of every
    # transaction's detail view into EXPORT_DIRECTORY.
    # Relies on module globals: CACHE_DIRECTORY, EXPORT_DIRECTORY,
    # PAYPAL_USERNAME / PAYPAL_PASSWORD / PAYPAL_NAME, START_AT_PAGE, and
    # helpers getHistoryListing, goToPage, safeFilename.
    ghost = Ghost(viewport_size=(1200, 2400), display=False, wait_timeout=30, cache_dir=CACHE_DIRECTORY)#, log_level=logging.ERROR
    # #login_password #submit.x #submit
    page, resources = ghost.open('https://www.paypal.com/ie/cgi-bin/webscr?cmd=_login-run')
    result, resources = ghost.fill("form[name=login_form]", {
        "login_email": PAYPAL_USERNAME,
        "login_password": PAYPAL_PASSWORD
    })
    page, resources = ghost.fire_on("form[name=login_form]", "submit", expect_loading=True)
    result, resources = ghost.wait_for_page_loaded()
    #wait for 10 seconds
    #time.sleep(10)
    page, resources = ghost.open('https://www.paypal.com/ie/cgi-bin/webscr?cmd=_account')
    # The welcome banner confirms the login actually succeeded.
    result, resources = ghost.wait_for_text("Welcome, %s" % PAYPAL_NAME)
    getHistoryListing(ghost)
    first_run = True
    #get the next url
    #print ghost.evaluate('document.querySelectorAll("#tableWrapperID .pagination:nth-child(1) a.btnLink");')[0]
    # Counts the pagination links still available; 0 means the last page.
    nav_links_eval = """
    var links = document.querySelectorAll(".pagination a.btnLink");
    links.length;
    """
    nav_links = ghost.evaluate(nav_links_eval)
    page_count = START_AT_PAGE
    transaction_count = 0
    if page_count > 0:
        # 20 transactions per listing page; keeps numbering stable on resume.
        transaction_count = page_count * 20
    goToPage(ghost,page_count)
    #transaction_list_url = resources[0].url
    #print transaction_list_url
    while nav_links[0] > 0 or first_run==True:
        first_run = False
        page_count = page_count + 1
        filteredlisting_export = os.path.join(EXPORT_DIRECTORY,'filteredhistory%d.png' % page_count)
        if not os.path.isfile(filteredlisting_export):
            # Screenshot the listing page itself (skipped when already saved).
            ghost.capture_to(filteredlisting_export, selector="body")
        # Collect every per-transaction detail link from the history table.
        transaction_urls = ghost.evaluate("""
        var links = document.querySelectorAll("#transactionTable tr.primary td.detailsNoPrint a");
        var listRet = [];
        for (var i=0; i<links.length; i++){
            listRet.push(links[i].href);
        }
        listRet;
        """)
        for transaction_href in transaction_urls[0]:
            transaction_count = transaction_count + 1
            #print urllib.unquote(transaction_href)
            page, resources = ghost.open(urllib.unquote(transaction_href))
            ghost.wait_for_page_loaded()
            payee_name = None
            date_string = None
            # Row 2 of the mini log holds date (col 0) and payee (col 1).
            date = ghost.evaluate("""
            document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[0].innerHTML;
            """)
            if date and date[0]:
                date_string = date[0].replace(' ','')
            payee = ghost.evaluate("""
            document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[1].innerHTML;
            """)
            if payee and payee[0]:
                payee_name = safeFilename(payee[0].replace(' ',''))
            if payee_name and date_string:
                # Normalise the date into YYYY-MM-DD for stable file names.
                date_object = datetime.strptime(date_string, '%d-%b-%Y')
                date_string=datetime.strftime(date_object,'%Y-%m-%d')
                print 'page %d transaction %d [%s - %s]' % (page_count, transaction_count, date_string, payee_name)
                purchasedetails_export = os.path.join(EXPORT_DIRECTORY,'%s_%s_%s.png' % (date_string,payee_name,transaction_count ))
                if not os.path.isfile(purchasedetails_export):
                    print '\t\tsaving to %s' % purchasedetails_export
                    ghost.capture_to(purchasedetails_export, selector="#xptContentMain")
                else:
                    print '\t\tAlready saved to %s' % purchasedetails_export
            else:
                # Fallback name when the date/payee cells could not be read.
                purchasedetails_export = os.path.join(EXPORT_DIRECTORY,'no date and payee - page-%d_ transaction %d.png' % (page_count,transaction_count ))
                print '\t\tsaving to %s' % purchasedetails_export
                if not os.path.isfile(purchasedetails_export):
                    ghost.capture_to(purchasedetails_export, selector="#xptContentMain")
                else:
                    print '\t\tAlready saved to %s' % purchasedetails_export
                # NOTE(review): the two prints below look like leftover debug
                # output — the path is printed a second time.
                print 'could not get payee_name and date_string'
                print '\t\tsaving to %s' % purchasedetails_export
        # Back to the listing, advance one page, and re-count the nav links.
        getHistoryListing(ghost)
        goToPage(ghost,page_count)
        #transaction_list_url = resources[0].url
        nav_links = ghost.evaluate(nav_links_eval)
def save_screenshot(self, url, path):
    """Open *url* (allowing up to 120 s for the load) and capture it to *path*."""
    session = Ghost(wait_timeout=120)
    page, extra = session.open(url)
    session.capture_to(path)
def create_png(input, output):
    """Open the page at *input* with Ghost and write a PNG capture to *output*.

    NOTE: the parameter names shadow builtins but are kept so keyword callers
    keep working.
    """
    browser = Ghost()
    page, extra_resources = browser.open(input)
    browser.capture_to(output)
import SimpleHTTPServer
import SocketServer
import threading

from ghost import Ghost


def serve_one_request():
    """
    Start up a simple web server serving files in the current directory,
    handle 1 request, and shut down again.
    """
    request_handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    server = SocketServer.TCPServer(("", 8010), request_handler)
    server.handle_request()


# Run the one-shot web server on a background daemon thread.
server_thread = threading.Thread(target=serve_one_request)
server_thread.daemon = True
server_thread.start()

# Use Ghost to open HTML page
ghost = Ghost()
page, extra_resources = ghost.open("http://127.0.0.1:8010/simple_form.html")

# Capture a screenshot once the form button exists (page is ready).
ghost.wait_for_selector('button')
ghost.capture_to('simple_form.png', zoom_factor=3.0)
def fetch_website(url, user_agent, results_location_dir):
    """function to use for website fetch

    :param url: url to fetch information from
    :param user_agent: user agent string that is used by the minion in making the fetch
    :param results_location_dir: the location to where the results are stored
    :return: results_data - a dictionary of metadata on the fetch

    This method uses a different library than the basic fetch method,
    Ghost.py (documentation at http://ghost-py.readthedocs.io/en/latest/#).
    After cleaning the url, a session is opened with the user agent string
    passed in. Then the specific web page is opened and all the resources of
    the web page are collected. After that, a screen-shot of the web page is
    collected. Then, the page data is written to a file that is named from
    the session id. Then each resource gathered during the fetch is written
    to a file, and these are placed in the same directory as the page data.
    Beyond that, miscellaneous metadata is written to the results_data
    dictionary.
    """
    log_debug("fetch_website", "Entering fetch_website")
    # clean the url
    url_clean = url.lstrip()
    log_debug("fetch_website", "Starting Fetch of: " + url_clean)
    # start a Ghost.py session
    session = Ghost().start(user_agent=user_agent)
    results_data = {'requested_url': url,
                    'actual_url': url_clean,
                    'remote_job_id': str(session.id)}
    try:
        # open the web page and gather all the page's resources
        page, resources = session.open(address=url_clean, user_agent=user_agent)
    # catch a TimeoutError
    except (ghost.TimeoutError, ghost.Error):
        results_data['connection_success'] = False
        log_debug("fetch_website", "Connection Failed for Fetch: " + url_clean)
        return results_data
    except Exception as e:
        # NOTE(review): unexpected errors return without setting
        # 'connection_success'; callers should treat a missing key as failure.
        print type(e)
        print str(e)
        return results_data
    # if page is None and there are no resources, that means that a connection to the page failed
    if page is None and len(resources) == 0:
        log_debug("fetch_website", "")
        results_data['connection_success'] = False
    else:
        netloc = urlparse(url_clean).netloc
        log_debug("fetch_website", "Attempting to capture screenshot of {}".format(netloc))
        try:
            # capture a screen-shot of the web page
            session.capture_to("{}/{}.png".format(results_location_dir, netloc))
            log_debug("fetch_website", "Successful capture of screenshot of {}".format(netloc))
        except Exception as e:
            log_debug("fetch_website", "Failed to capture screenshot of {}".format(netloc))
            print type(e)
            print str(e)
        try:
            log_debug("fetch_website", "Opening: {}/{} for: {}".format(results_location_dir, session.id, url_clean))
            fetch_file = open("{}/{}".format(results_location_dir, session.id), 'w')
            log_debug("fetch_website", "writing page content to file")
            # write page content to file
            fetch_file.write(page.content)
            log_debug("fetch_website", "closing {}".format(session.id))
            fetch_file.close()
            # write the data of each resource to different files
            for resource in resources:
                log_debug("fetch_website", "opening {}/resource{} for: {}".format(results_location_dir, resources.index(resource), url_clean))
                data_file = open("{}/resource{}".format(results_location_dir, resources.index(resource)), "w")
                log_debug("fetch_website", "writing content to {}".format(resources.index(resource)))
                data_file.write(resource.content)
                log_debug("fetch_website", "closing {}".format(resources.index(resource)))
                data_file.close()
            results_data['fetch_object_success'] = True
        except:
            results_data['fetch_object_success'] = False
        finally:
            # collect more metadata (runs whether the writes succeeded or not)
            results_data['connection_success'] = True
            results_data['server_info'] = dict(page.headers)
            results_data['response_code'] = page.http_status
            if page.http_status in [400, 404, 403, 401]:
                results_data["fetch_success"] = False
            if len(session.cookies) > 0:
                results_data['cookies'] = [x.value().data() for x in session.cookies]
    return results_data
Usage: python django_admin_login.py <username> <password> (where the username and password are entered into the corresponding boxes) """
import sys
from ghost import Ghost
import Image

# Credentials come from the command line (see usage above).
username = sys.argv[1]
password = sys.argv[2]
django_admin_url = "http://127.0.0.1:8000/admin/"
image_file = "django_admin_login.png"

ghost = Ghost()
page, extra_resources = ghost.open(django_admin_url)
# Wait until both login fields exist before filling the form.
ghost.wait_for_selector('#id_username')
ghost.wait_for_selector('#id_password')
ghost.fill("#login-form", {'username': username, 'password': password})
ghost.capture_to(image_file, zoom_factor=3.0)
print "Captured django_admin_login.png"

# Crop bad space at the top due to the odd page layout
im = Image.open(image_file)
box = (0, 300, 1014, 1062)
region = im.crop(box)
region.save(image_file)
from ghost import Ghost

url = "http://www.ebay.com/"
gh = Ghost()

# Load ebay's landing page, waiting for the onload event to fire.
page, resources = gh.open(url, wait_onload_event=True)

# Type the query into the search box and click the search button.
gh.set_field_value("#gh-ac", "plane")
gh.click("#gh-btn")

# Block until the results page has rendered its results pane.
gh.wait_for_selector("#e1-15")

# Persist a screenshot of the results.
gh.capture_to("plane.png")
# NOTE(review): `secrets` here is a local credentials module (username /
# password attributes) that shadows the Python 3 stdlib `secrets` module.
import secrets
import time
from ghost import Ghost

ghost = Ghost()
page, resources = ghost.open('https://my.ecofactor.com/mobile/login.html')
# Fill the login form from the local secrets module and submit it.
result, resources = ghost.fill('form', {
    'j_username': secrets.username,
    'j_password': secrets.password
})
page, resources = ghost.fire_on('form', 'submit')
#page, resources = ghost.fire_on('form', 'submit', expect_loading=True)
page, resources = ghost.wait_for_page_loaded()
# Capture the post-login page for inspection.
ghost.capture_to("/tmp/foo.png")
print page.http_status
#print resources
#result, resources = ghost.evaluate("document.getElementById('apDiv2').style.display = 'none';") #result, resources = ghost.evaluate( # "$(document).ready(function() { var target = $('#apDiv2').show(); target = target.add(target.parentsUntil('body')).siblings().hide(); }); ") #ghost.evaluate("window.document.getElementById('apDiv2').style.visibility = 'hidden'; ") #ghost.evaluate("window.document.getElementById('apDiv2').style.visibility = 'hidden'; ") #ghost.evaluate("$('div:not(#myDiv)').show();") #ghost.evaluate("v$('div:not(#myDiv)').show();") #### ghost.evaluate("window.scroll(0, 2000);") #result, resources = ghost.evaluate( # "alert('Okk');") #result, resources = ghost.wait_for_alert() #$(document).ready(function() { # alert("document ready occurred!"); #}); ghost.capture_to(imageName+'.png') #ghost.capture_to('header.png') # assert page.http_status==200 and 'jeanphix' in ghost.content
def take_screenshot_of_url(url, target_filename):
    """Load *url* in a fresh Ghost session and save the render as *target_filename*."""
    session = Ghost()
    page, resources = session.open(url)
    session.capture_to(target_filename)
def main():
    """Open google.com and capture the page body to header.png."""
    browser = Ghost()
    page, resources = browser.open('http://google.com')
    browser.capture_to('header.png', selector="body")
def RunExport():
    # PayPal history exporter: sign in with Ghost, walk the paginated
    # transaction listing, and write a PNG of each transaction detail page
    # into EXPORT_DIRECTORY. Uses module globals CACHE_DIRECTORY,
    # EXPORT_DIRECTORY, PAYPAL_USERNAME / PAYPAL_PASSWORD / PAYPAL_NAME,
    # START_AT_PAGE and helpers getHistoryListing / goToPage / safeFilename.
    ghost = Ghost(viewport_size=(1200, 2400),
                  display=False,
                  wait_timeout=30,
                  cache_dir=CACHE_DIRECTORY)  #, log_level=logging.ERROR
    # #login_password #submit.x #submit
    page, resources = ghost.open(
        'https://www.paypal.com/ie/cgi-bin/webscr?cmd=_login-run')
    result, resources = ghost.fill("form[name=login_form]", {
        "login_email": PAYPAL_USERNAME,
        "login_password": PAYPAL_PASSWORD
    })
    page, resources = ghost.fire_on("form[name=login_form]", "submit",
                                    expect_loading=True)
    result, resources = ghost.wait_for_page_loaded()
    #wait for 10 seconds
    #time.sleep(10)
    page, resources = ghost.open(
        'https://www.paypal.com/ie/cgi-bin/webscr?cmd=_account')
    # Seeing the welcome banner confirms the login worked.
    result, resources = ghost.wait_for_text("Welcome, %s" % PAYPAL_NAME)
    getHistoryListing(ghost)
    first_run = True
    #get the next url
    #print ghost.evaluate('document.querySelectorAll("#tableWrapperID .pagination:nth-child(1) a.btnLink");')[0]
    # JS snippet that counts remaining pagination links (0 == last page).
    nav_links_eval = """
    var links = document.querySelectorAll(".pagination a.btnLink");
    links.length;
    """
    nav_links = ghost.evaluate(nav_links_eval)
    page_count = START_AT_PAGE
    transaction_count = 0
    if page_count > 0:
        # 20 transactions per page keeps numbering consistent when resuming.
        transaction_count = page_count * 20
    goToPage(ghost, page_count)
    #transaction_list_url = resources[0].url
    #print transaction_list_url
    while nav_links[0] > 0 or first_run == True:
        first_run = False
        page_count = page_count + 1
        filteredlisting_export = os.path.join(
            EXPORT_DIRECTORY, 'filteredhistory%d.png' % page_count)
        if not os.path.isfile(filteredlisting_export):
            # Snapshot of the listing page itself, skipped when present.
            ghost.capture_to(filteredlisting_export, selector="body")
        # Gather the per-transaction detail links from the history table.
        transaction_urls = ghost.evaluate("""
        var links = document.querySelectorAll("#transactionTable tr.primary td.detailsNoPrint a");
        var listRet = [];
        for (var i=0; i<links.length; i++){
            listRet.push(links[i].href);
        }
        listRet;
        """)
        for transaction_href in transaction_urls[0]:
            transaction_count = transaction_count + 1
            #print urllib.unquote(transaction_href)
            page, resources = ghost.open(urllib.unquote(transaction_href))
            ghost.wait_for_page_loaded()
            payee_name = None
            date_string = None
            # Row 2 of #historyMiniLog: column 0 = date, column 1 = payee.
            date = ghost.evaluate("""
            document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[0].innerHTML;
            """)
            if date and date[0]:
                date_string = date[0].replace(' ', '')
            payee = ghost.evaluate("""
            document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[1].innerHTML;
            """)
            if payee and payee[0]:
                payee_name = safeFilename(payee[0].replace(' ', ''))
            if payee_name and date_string:
                # Reformat the date as YYYY-MM-DD so file names sort well.
                date_object = datetime.strptime(date_string, '%d-%b-%Y')
                date_string = datetime.strftime(date_object, '%Y-%m-%d')
                print 'page %d transaction %d [%s - %s]' % (
                    page_count, transaction_count, date_string, payee_name)
                purchasedetails_export = os.path.join(
                    EXPORT_DIRECTORY, '%s_%s_%s.png' %
                    (date_string, payee_name, transaction_count))
                if not os.path.isfile(purchasedetails_export):
                    print '\t\tsaving to %s' % purchasedetails_export
                    ghost.capture_to(purchasedetails_export,
                                     selector="#xptContentMain")
                else:
                    print '\t\tAlready saved to %s' % purchasedetails_export
            else:
                # Fallback file name when date/payee could not be extracted.
                purchasedetails_export = os.path.join(
                    EXPORT_DIRECTORY,
                    'no date and payee - page-%d_ transaction %d.png' %
                    (page_count, transaction_count))
                print '\t\tsaving to %s' % purchasedetails_export
                if not os.path.isfile(purchasedetails_export):
                    ghost.capture_to(purchasedetails_export,
                                     selector="#xptContentMain")
                else:
                    print '\t\tAlready saved to %s' % purchasedetails_export
                # NOTE(review): duplicated debug prints — the path was
                # already printed above.
                print 'could not get payee_name and date_string'
                print '\t\tsaving to %s' % purchasedetails_export
        # Return to the listing, move to the next page, re-count nav links.
        getHistoryListing(ghost)
        goToPage(ghost, page_count)
        #transaction_list_url = resources[0].url
        nav_links = ghost.evaluate(nav_links_eval)
import SimpleHTTPServer
import SocketServer
import threading

from ghost import Ghost


def serve_one_request():
    """
    Start up a simple web server serving files in the current directory,
    handle 1 request, and shut down again.
    """
    handler_cls = SimpleHTTPServer.SimpleHTTPRequestHandler
    one_shot_server = SocketServer.TCPServer(("", 8010), handler_cls)
    one_shot_server.handle_request()


# The server runs on a daemon thread so the script can exit freely.
worker = threading.Thread(target=serve_one_request)
worker.daemon = True
worker.start()

# Use Ghost to open HTML page
ghost = Ghost()
page, extra_resources = ghost.open("http://127.0.0.1:8010/simple_form.html")

# Wait for the form button, then capture the rendered page.
ghost.wait_for_selector("button")
ghost.capture_to("simple_form.png", zoom_factor=3.0)
from pyvirtualdisplay import Display
from ghost import Ghost
from PIL import Image

# Run the browser inside a small virtual X display (headless capture).
display = Display(visible=0, size=(320, 240))
display.start()

browser = Ghost()
browser.open('http://bbc.uk.com')

# picture name must end with file type
picture = 'imgname.jpg'
browser.capture_to(picture, selector='body')
img = Image.open(picture)

display.stop()