def grab_url_screenshot(url):
    """
    Capture a screenshot of ``url`` and create a cropped thumbnail of it.

    The full-size PNG is written to ``settings.MEDIA_ROOT/url_previews`` and
    is named after the SHA256 hex digest of the (scheme-normalised) url; the
    thumbnail is stored next to it with a ``.thumb.png`` suffix.

    :param url: address to capture; ``http://`` is prepended when the url
        carries no scheme
    :return: URL of the thumbnail (joined onto ``settings.BASE_URL``), or
        ``None`` when the page could not be captured
    """

    ret = None

    try:
        # Sanitise the url: when the scheme is missing assume http://
        url_res = urlparse(url)
        if not url_res.scheme:
            url = "http://" + url

        # TODO: could Ghost be a singleton?
        ghost = Ghost()
        page, res = ghost.open(url)
        if page is not None and page.http_status == 200:
            url_sha256 = hashlib.sha256(url).hexdigest()
            image_path = os.path.join('url_previews', url_sha256 + ".png")
            full_path = os.path.join(settings.MEDIA_ROOT, image_path)

            ghost.capture_to(full_path)

            image_path = image_path.replace(".png", ".thumb.png")
            thumb_full_path = os.path.join(settings.MEDIA_ROOT, image_path)
            resize_and_crop(full_path, thumb_full_path, (550, 500))
            ret = urljoin(settings.BASE_URL, "uploads/" + image_path)
        else:
            logger.error("Failed to capture screenshot for {0}".format(url))
    except Exception as e:
        # Best effort: any failure is logged and reported as None.
        logger.exception(e)

    # The result was computed but never handed back to the caller before.
    return ret
Beispiel #2
0
def main(argv):
    """Render a page with Ghost.py, screenshot it and archive its resources.

    The page named by ``argv[1]`` is loaded, a screenshot is written to
    ``original-screenshot.png``, every resource URL reported by Ghost (plus
    the page URL itself) is noted in ``original-urls.txt`` and piped to
    ``wget`` with ``--warc-file=flashfrozen``.

    :param argv: command-line vector; ``argv[0]`` is the program name and
        ``argv[1]`` the URL to capture
    :return: ``1`` when the URL argument is missing, otherwise ``None``
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <url>\n" % (argv[0], ))
        return 1

    target = argv[1]
    warc_name = "flashfrozen"

    print("Starting Ghost.py...")
    ghost = Ghost(viewport_size=(1280, 1024), wait_timeout=100)
    try:
        print("Loading page:", target)
        _page, resources = ghost.open(target)
        print("Taking screenshot...")
        ghost.capture_to('original-screenshot.png')
    finally:
        # Shut the browser down even when loading or capturing fails.
        print("Shutting down Ghost.py...")
        ghost.exit()

    # Extract the set of resource URLs; the page itself is always included.
    print("Extracting URLs...")
    urls = set()
    for r in resources:
        pprint.pprint(r.url)
        print(dir(r.url))
        urls.add(str(r.url))
    urls.add(target)

    # Open pipe to the wget process
    print("Passing URLs to wget...")
    process = subprocess.Popen(
        ["wget", "-q", "-i", "-", "-O", "-",
         "--warc-file={}".format(warc_name)],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)

    # Note each URL in a file and pass it to wget via STDIN. The context
    # manager closes the note file even if a write fails.
    with open('original-urls.txt', 'w') as urlf:
        for u in urls:
            print("GOT", u)
            line = "{}\n".format(u)
            urlf.write(line)
            process.stdin.write(line)

    # Close STDIN so wget knows there are no more URLs coming:
    process.stdin.flush()
    process.stdin.close()

    # Drain and ignore STDOUT (wget was asked to write documents to "-"):
    print("Waiting for wget output...")
    for _ in process.stdout:
        pass

    # Wait for the process to finish:
    print("Waiting for wget to finish...")
    process.wait()

    print("Done.")
Beispiel #3
0
def create_screenshot(url, filename, size):
    """Capture the ``#map`` element of ``url`` into ``filename``.

    :param url: page to open
    :param filename: destination of the captured image
    :param size: indexable pair; ``size[0]`` and ``size[1]`` are passed to
        ``set_viewport_size`` in that order
    """
    session = Ghost().start()
    viewport_w, viewport_h = size[0], size[1]
    session.set_viewport_size(viewport_w, viewport_h)
    session.open(url)

    # Wait (up to 40 each) for an alert and then for the page load to finish
    # before capturing.
    wait_limit = 40
    session.wait_for_alert(timeout=wait_limit)
    session.wait_for_page_loaded(timeout=wait_limit)

    session.capture_to(filename, selector='#map')
Beispiel #4
0
def index(request):
    """Render a page with Ghost and answer with a JPEG thumbnail of it.

    The page address is read from the ``url`` POST or GET parameter
    (depending on the request method) and defaults to
    ``http://www.bbc.uk.com``. The page is opened once to measure
    ``document.body`` and a second time with a viewport of that size, then
    captured to ``pic.jpg`` next to this module and shrunk to fit 128x128.

    :param request: incoming HTTP request
    :return: ``HttpResponse`` carrying the thumbnail as ``image/jpeg``
    """
    dir_name = os.path.dirname(__file__)
    img_name = os.path.join(dir_name, "pic.jpg")

    if request.method == "POST":
        url = request.POST.get("url", "")
    else:
        url = request.GET.get("url", "")

    if not url:
        url = "http://www.bbc.uk.com"

    display = Display()
    display.start()
    try:
        # First pass: only used to measure the body of the document.
        ghost = Ghost()
        ghost.open(url)
        width = int(ghost.evaluate("document.body.clientWidth")[0])
        height = int(ghost.evaluate("document.body.clientHeight")[0])

        # Second pass: viewport sized to the measured body.
        ghost = Ghost(viewport_size=(width, height))
        ghost.open(url)
        ghost.capture_to(img_name, selector="body")

        image = Image.open(img_name)
        image.thumbnail((128, 128), Image.ANTIALIAS)

        # ``content_type`` is the keyword accepted by every Django release;
        # the old ``mimetype`` alias was removed.
        response = HttpResponse(content_type="image/jpeg")
        image.save(response, "jpeg")
    finally:
        # Always stop the display, even when rendering raised.
        display.stop()

    return response
Beispiel #5
0
def main(argv):
    """Screenshot a page with Ghost.py and hand its resource URLs to wget.

    ``argv[1]`` is loaded in Ghost and captured to
    ``original-screenshot.png``. The URLs of all resources Ghost reports,
    together with the page URL, are written to ``original-urls.txt`` and fed
    on STDIN to ``wget --warc-file=flashfrozen``.

    :param argv: command-line vector (program name followed by the URL)
    :return: ``1`` if no URL was given, ``None`` otherwise
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <url>\n" % (argv[0],))
        return 1

    target = argv[1]
    warc_name = "flashfrozen"

    print("Starting Ghost.py...")
    ghost = Ghost(viewport_size=(1280, 1024), wait_timeout=100)
    try:
        print("Loading page:", target)
        _page, resources = ghost.open(target)
        print("Taking screenshot...")
        ghost.capture_to('original-screenshot.png')
    finally:
        # Ghost is shut down on the failure path as well.
        print("Shutting down Ghost.py...")
        ghost.exit()

    # Collect the resource URLs; adding the target to a set is idempotent,
    # so no membership test is needed first.
    print("Extracting URLs...")
    urls = set()
    for r in resources:
        pprint.pprint(r.url)
        print(dir(r.url))
        urls.add(str(r.url))
    urls.add(target)

    # Open pipe to the wget process
    print("Passing URLs to wget...")
    wget_cmd = [
        "wget", "-q",
        "-i", "-", "-O", "-",
        "--warc-file={}".format(warc_name),
    ]
    process = subprocess.Popen(wget_cmd,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)

    # Keep a note of the URLs in a file while passing them in via STDIN;
    # ``with`` guarantees the file is closed if a write raises.
    with open('original-urls.txt', 'w') as urlf:
        for u in urls:
            print("GOT", u)
            entry = "{}\n".format(u)
            urlf.write(entry)
            process.stdin.write(entry)

    # Close STDIN so wget knows there are no more URLs coming:
    process.stdin.flush()
    process.stdin.close()

    # This explicitly churns through and ignores STDOUT:
    print("Waiting for wget output...")
    for _ in process.stdout:
        pass

    # Wait for the process to finish:
    print("Waiting for wget to finish...")
    process.wait()

    print("Done.")
Beispiel #6
0
class Browser:
    """Holds one Ghost instance and uses it to screenshot pages."""

    def __init__(self, width=1024, height=800):
        """Create the Ghost instance and set its viewport.

        :param width: viewport width passed to ``set_viewport_size``
        :param height: viewport height passed to ``set_viewport_size``
        """
        self.browser = Ghost()
        viewport = (width, height)
        self.browser.set_viewport_size(*viewport)

    def take_screenshot(self, url, path_to_picture):
        """Open ``url`` and capture its ``.noncollapsed`` element.

        :param url: page to load
        :param path_to_picture: file the capture is written to
        """
        ghost = self.browser
        ghost.open(url)
        ghost.capture_to(path_to_picture, selector='.noncollapsed')
def get_disqus_comments_by_ghost(dictionary):
    '''Fetch the Disqus posts of the page at ``dictionary['url']``.

    Ghost loads the page and scrolls it step by step so that the Disqus
    iframe is inserted; the iframe document is then fetched with
    ``requests`` and the JSON inside its ``#disqus-threadData`` element is
    decoded.

    :param dictionary: mapping with a ``'url'`` key naming the page to load
    :return: ``data['response']['thread']['posts']`` from the embedded JSON,
        or ``-9999`` when fetching the comments failed
    :raises AssertionError: when the page does not answer with HTTP 200
    '''
    ghost = Ghost(
        user_agent=
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        viewport_size=(1349, 765),
        log_level=logging.ERROR)
    page, resources = ghost.open(dictionary['url'])
    assert page.http_status == 200  # make sure we get data back..
    ghost.wait_for_page_loaded()  # probably does no harm..

    # comment loaded on scroll hack cred goes to Hammer et al. (2013)
    secs = 0.50
    for offset in (700, 1400, 2100, 4000):
        time.sleep(secs)
        ghost.evaluate("window.scroll(0, %d);" % offset)
        if offset == 700:
            # The original author found the sequence fails without this
            # capture after the first scroll; the reason is unknown.
            ghost.capture_to('scroll_before.png')
    time.sleep(secs)
    ghost.wait_for_page_loaded()
    logger.info("waiting for selector IFRAME")
    ghost.wait_for_selector("iframe")

    soup = BeautifulSoup(ghost.content)
    try:
        # only one of these...
        comments_iframe_url = soup.select("iframe#dsq-2")[0]['src']
        comments_html = requests.get(comments_iframe_url)
        iframe_soup = BeautifulSoup(comments_html.text)
        posts = iframe_soup.select("#disqus-threadData")
        data = json.loads(posts[0].text)

        return data['response']['thread']['posts']
    except Exception:
        # Fetching comments failed. Keep the sentinel callers expect, but
        # record why, and let KeyboardInterrupt/SystemExit propagate.
        logger.exception("fetching disqus comments failed")
        return -9999
Beispiel #8
0
def main(argv):
    """Screenshot ``argv[1]`` to ``screenshot2.png`` and list its resources.

    :param argv: command-line vector; ``argv[1]`` is the URL to open
    :return: ``1`` when the URL argument is missing, otherwise ``None``
    :raises AssertionError: when the page status is not 200 or the loaded
        content does not contain ``'bbc'``
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <url>\n" % (argv[0],))
        return 1

    ghost = Ghost(viewport_size=(1280, 1024))
    page, resources = ghost.open(argv[1])
    assert page.http_status == 200 and 'bbc' in ghost.content
    ghost.capture_to('screenshot2.png')

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for r in resources:
        print(r.url)
def screenshot(url,target):
    """Interactively capture an authenticated screenshot of ``url``.

    The operator is asked on stdin whether credentials should be supplied.
    On ``y`` a username and a password are read, the page is opened with
    them and a PNG named after the current timestamp is written into the
    ``screenshots`` directory below the current working directory. Any
    other answer does nothing.

    :param url: address of the page to capture
    :param target: not used by this function; kept for existing callers
    """
    ghost = Ghost(wait_timeout=4)
    print("Do u want to provide any credentials \n")
    choice = raw_input()
    if choice.lower() != 'y':
        return

    print(colored("[-] Enter Username and Password ", 'green'))
    username = raw_input()
    password = raw_input()
    ghost.open(url, auth=(username, password))

    # Capture straight into ./screenshots instead of running
    # `mv *.png ./screenshots`, which also moved every unrelated PNG.
    shot_dir = os.path.join(os.getcwd(), 'screenshots')
    if not os.path.isdir(shot_dir):
        os.makedirs(shot_dir)
    shot_path = os.path.join(shot_dir, str(time.time()) + '.png')
    ghost.capture_to(shot_path)

    # Report the file that was really written.
    print(colored("[-] Screenshot Succesfull catpured and Saved @ %s" % (shot_path), 'green'))
def site_capture():
    """Screenshot the page ``sys.argv[1]`` into the file ``sys.argv[2]``.

    Both arguments are echoed first. Afterwards ``success`` is printed when
    Ghost opened and captured the page, or ``fail`` when anything in that
    sequence raised; the exception itself is deliberately not propagated.
    """
    print(sys.argv[1])
    print(sys.argv[2])

    try:
        g = Ghost()
        g.open(sys.argv[1])
        g.capture_to(sys.argv[2])

        g.exit()
        print('success')
    except Exception:
        print('fail')
Beispiel #11
0
def render_notext(url):
  """Capture the top 1280x700 region of ``url`` as ``<sitename>.png``.

  The file is written below the module-level ``path`` directory and is
  named after ``get_sitename(url)``.

  :param url: page to render
  :return: the site name used for the PNG file
  """
  sitename = get_sitename(url)
  ghost = Ghost()
  ghost.open(url, headers={
    "Accept": "image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    })

  ghost.wait_for_page_loaded()
  ghost.capture_to(os.path.join(path, sitename + '.png'), (0, 0, 1280, 700))
  # The former `print "success"` after this return could never run and has
  # been dropped.
  return sitename
Beispiel #12
0
def site_capture():
    """Capture the page given in ``sys.argv[1]`` to the path ``sys.argv[2]``.

    The two arguments are printed, then Ghost opens the page and writes the
    capture. ``success`` is printed when that worked and ``fail`` when any
    step raised; errors are swallowed on purpose and never propagate.
    """
    print(sys.argv[1])
    print(sys.argv[2])

    try:
        g = Ghost()
        g.open(sys.argv[1])
        g.capture_to(sys.argv[2])

        g.exit()
        print('success')
    except Exception:
        # `except Exception, e` was Python-2-only and `e` was never used.
        print('fail')
Beispiel #13
0
def ghostCapture(screen_url,web_timeout=20):
    """Capture a region of ``screen_url`` into a temporary PNG file.

    The region and viewport come from the module-level names ``x1``,
    ``y1``, ``x2`` and ``y2``.

    :param screen_url: page to open (opened with auth ``('none', 'none')``)
    :param web_timeout: value converted with ``int`` and passed to Ghost as
        ``wait_timeout``
    :return: path of the temporary ``.png`` file holding the capture
    """
    import os
    import time
    gh = Ghost(display=":99", wait_timeout=int(web_timeout),
               viewport_size=(x2, y2), ignore_ssl_errors=True,
               log_level=logging.FATAL)

    ghost_page, resources = gh.open(screen_url,
                                    auth=('none', 'none'))

    # mkstemp hands back an already-open descriptor; close it so it is not
    # leaked on every call. Only the path is needed here.
    fd, img_path = mkstemp(suffix=".png")
    os.close(fd)

    gh.set_viewport_size(x2, y2)
    # NOTE(review): `timeout` is not a parameter of this function; it is
    # presumably a module-level setting (perhaps `web_timeout` was meant).
    # Confirm before changing.
    time.sleep(timeout)
    gh.capture_to(img_path, region=(x1, y1, x2, y2))

    return img_path
Beispiel #14
0
def make_pages(pn, c):
    """Render the markup of page ``pn`` in Ghost and save it to the cache.

    The image is written to ``<app_root>/cache/page_<pn>_<c>.jpg``.

    :param pn: page identifier (string); also passed to ``get_markup``
    :param c: counter appended to the file name after ``str()`` conversion
    """
    renderer = Ghost(download_images = True)

    cache_dir = app_root + '/cache'
    image_name = "page_" + pn + "_" + str(c) + ".jpg"

    html = get_markup(pn, True)

    # Feed the markup straight into the main frame instead of opening a URL.
    renderer.main_frame.setHtml(html)
    renderer.wait_for_page_loaded()

    destination = cache_dir + '/' + image_name
    renderer.capture_to(destination)
Beispiel #15
0
def main(argv):
    """Render a URL with Ghost.py and optionally screenshot / archive it.

    Depending on the options a screenshot is saved, the URLs of the page
    and its resources are written to a file, and the URLs are handed to
    ``run_wget`` to build a WARC file. ``-d`` enables all three with the
    default names ``original-screenshot.png``, ``original-urls.txt`` and
    ``flashfrozen``.

    :param argv: command-line vector; ``argv[0]`` is skipped when parsing
    :return: ``None``
    :raises SystemExit: raised by ``argparse`` on invalid arguments
    """
    parser = argparse.ArgumentParser(
        description='Render a URL with Ghost.py and optionally save a '
                    'screenshot, its resource URLs and a WARC file.')
    parser.add_argument('url', help='URL to render and download')
    parser.add_argument('-d', action='store_true',
                        help='Download WARC, save urls, and save screenshot to default names')
    parser.add_argument('--warc',
                        help='Uses wget to save the page to a WARC file')
    parser.add_argument('--save-urls',
                        type=argparse.FileType('w'),
                        help='Save the urls to a file but do not run wget')
    parser.add_argument('--screenshot',
                        help='Save a screenshot of the website')
    parser.add_argument('--display', action='store_true',
                        help='Display the webkit window while downloading')

    args = parser.parse_args(argv[1:])

    target = args.url
    if args.d:
        args.screenshot = 'original-screenshot.png'
        args.warc = 'flashfrozen'
        # Text mode, matching the FileType('w') used for --save-urls; the
        # file receives formatted strings below.
        args.save_urls = open('original-urls.txt', 'w')

    print("Starting Ghost.py...")
    ghost = Ghost(viewport_size=(1280, 1024), display=args.display, wait_timeout=90)
    print("Loading page: %s" % (target,))
    page, resources = ghost.open(target)
    if args.screenshot is not None:
        print("Taking screenshot...")
        ghost.capture_to(args.screenshot)
    print("Shutting down Ghost.py...")
    ghost.exit()

    urls = urls_from_ghost_resources(resources)
    urls.add(target)

    if args.save_urls is not None:
        for u in urls:
            args.save_urls.write("{}\n".format(u))
        args.save_urls.close()

    if args.warc is not None:
        print("Passing URLs to wget...")
        run_wget(urls, args.warc)

    print("Done.")
Beispiel #16
0
class Shotter(object):
    """Take web-page screenshots with Ghost.py, falling back to cutycapt."""

    def __init__(self,):
        """Create the Ghost instance; ``self.ghost`` is ``None`` on ImportError."""
        try:
            self.ghost = Ghost()
        except ImportError:
            self.ghost = None

    def cuty_shot(self, url, filename, x11=True, width=1024, height=768, colorbits=24):
        """Screenshot ``url`` into ``filename`` with the ``cutycapt`` tool.

        :param url: page to capture
        :param filename: output file handed to ``cutycapt --out``
        :param x11: when false, ``cutycapt`` is run under ``xvfb-run``
        :param width: screen width used for the xvfb server arguments
        :param height: screen height used for the xvfb server arguments
        :param colorbits: colour depth used for the xvfb server arguments
        :return: ``True`` when the command ran and exited with status 0,
            ``False`` otherwise (including when it could not be started)
        """
        # An argument list (no shell) keeps characters in `url`/`filename`
        # from being interpreted by a shell.
        cmd = ['cutycapt', '--url=%s' % url, '--out=%s' % filename]
        if not x11:
            # The numeric placeholders must receive the geometry; they were
            # previously given url/filename, which raised TypeError.
            server_args = '-screen 0, %ix%ix%i' % (width, height, colorbits)
            cmd = ['xvfb-run', '--server-args=%s' % server_args] + cmd
        try:
            return Popen(cmd).wait() == 0
        except Exception:
            # e.g. OSError when cutycapt / xvfb-run is not installed
            return False

    def ghost_shot(self, url, filename, ignore_errors=True):
        """Screenshot ``url`` into ``filename`` with Ghost.

        :param url: page to open
        :param filename: file the capture is written to
        :param ignore_errors: when false, capture only if the page status
            is 200 and ``page.totalBytes()`` is non-zero
        :return: ``True`` when a capture was written, ``False`` otherwise
        """
        try:
            page, resources = self.ghost.open(url)
            if ignore_errors or (page.http_status == 200
                                 and page.totalBytes() != 0):
                self.ghost.capture_to(filename)
                return True
            return False
        except Exception:
            print(exc_info())
            return False

    def screenshot(self, url, filename, overwrite=False):
        """Screenshot ``url`` unless ``filename`` exists and ``overwrite`` is false.

        Ghost is used when available, otherwise ``cuty_shot``.
        """
        if path.exists(filename) and not overwrite:
            print('%s exists, skipping' % filename)
            return

        print('[SCREENSHOT] %s -> %s' % (url, filename))
        if self.ghost is not None:
            self.ghost_shot(url, filename)
        else:
            self.cuty_shot(url, filename)
Beispiel #17
0
def screenshot(url, target):
    """Ask for credentials and, if given, capture ``url`` with them.

    The operator answers on stdin. After ``y`` a username and a password
    are read, the page is opened with that auth pair and the capture is
    written to ``screenshots/<timestamp>.png`` below the current working
    directory. Any other answer leaves the function without capturing.

    :param url: address of the page to capture
    :param target: not used by this function; kept for existing callers
    """
    ghost = Ghost(wait_timeout=4)
    print("Do u want to provide any credentials \n")
    choice = raw_input()
    if choice.lower() == 'y':
        print(colored("[-] Enter Username and Password ", 'green'))
        username = raw_input()
        password = raw_input()
        ghost.open(url, auth=(username, password))

        # Write directly into ./screenshots; the previous
        # `mv *.png ./screenshots` also swept up unrelated PNG files.
        shot_dir = os.path.join(os.getcwd(), 'screenshots')
        if not os.path.isdir(shot_dir):
            os.makedirs(shot_dir)
        shot_path = os.path.join(shot_dir, str(time.time()) + '.png')
        ghost.capture_to(shot_path)

        # The message now names the file that was actually saved.
        print(colored(
            "[-] Screenshot Succesfull catpured and Saved @ %s" %
            (shot_path), 'green'))
Beispiel #18
0
def snap(conn, url, cookie_name, cookie_value, width, height, loaded, hides, selector):
    """Handle all the work of taking the page snapshot.

    The first parameter is a connection object for a `multiprocessing.Pipe`
    that we use to send back the file name written to. The remaining
    parameters are `multiprocessing.Value`s (each is read through
    ``.value``).

    :param conn: pipe end; receives the snapshot path and is always closed
    :param url: page to open
    :param cookie_name: with ``cookie_value``, sent as a ``Cookie`` header
        when both are non-empty
    :param cookie_value: see ``cookie_name``
    :param width: viewport width
    :param height: viewport height
    :param loaded: optional selector to wait for before capturing
    :param hides: comma-separated selectors hidden via jQuery before capture
    :param selector: selector of the element to capture
    """
    import os

    ghost = None
    try:
        ghost = Ghost(viewport_size=(width.value, height.value))
        ghost.wait_timeout = 20

        headers = {}
        if cookie_name.value and cookie_value.value:
            headers = {
                'Cookie': str('%s=%s' % (cookie_name.value, cookie_value.value)),
            }

        ghost.open(url.value, headers=headers)

        if loaded.value:
            try:
                ghost.wait_for_selector('%s' % loaded.value)
            except Exception:
                # if the selector never appears, we don't care
                pass

        # ''.split(',') yields [''], so drop blank entries instead of
        # evaluating a script that hides $('').
        selectors = [sel for sel in hides.value.split(',') if sel.strip()]
        if selectors:
            hide_js = r'''
                if (jQuery) {
                    $(document).ready(function() {
                        %s
                    });
                }
            ''' % '\n'.join([r"$('%s').hide();" % sel for sel in selectors])
            ghost.evaluate(hide_js)

        # mkstemp returns an open descriptor; only the path is needed, so
        # close it right away so it is not leaked.
        handle, file_path = mkstemp(prefix='ansel_snap', suffix='.png')
        os.close(handle)
        ghost.capture_to(file_path, selector=selector.value)

        conn.send(file_path)
    finally:
        del ghost
        conn.close()
Beispiel #19
0
def get_disqus_comments_by_ghost(dictionary):
    '''Return the Disqus posts embedded in the page ``dictionary['url']``.

    Ghost opens the page and scrolls down in steps so the Disqus iframe
    gets inserted. The iframe source is then downloaded with ``requests``
    and the JSON found in its ``#disqus-threadData`` element is parsed.

    :param dictionary: mapping whose ``'url'`` entry is the page to load
    :return: the ``['response']['thread']['posts']`` part of that JSON, or
        ``-9999`` when the comments could not be fetched
    :raises AssertionError: when the page status is not 200
    '''
    ghost = Ghost(user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0', viewport_size = (1349, 765), log_level=logging.ERROR)
    page, resources = ghost.open(dictionary['url'])
    assert page.http_status == 200      # make sure we get data back..
    ghost.wait_for_page_loaded()        # probably does no harm..

    # comment loaded on scroll hack cred goes to Hammer et al. (2013)
    secs = 0.50
    scroll_offsets = (700, 1400, 2100, 4000)
    for position, offset in enumerate(scroll_offsets):
        time.sleep(secs)
        ghost.evaluate("window.scroll(0, %d);" % offset)
        if position == 0:
            # Per the original author the sequence fails when this capture
            # after the first scroll is removed; the cause is unknown.
            ghost.capture_to('scroll_before.png')
    time.sleep(secs)
    ghost.wait_for_page_loaded()
    logger.info("waiting for selector IFRAME")
    ghost.wait_for_selector("iframe")

    soup = BeautifulSoup(ghost.content)
    try:
        comments_iframe_url = soup.select("iframe#dsq-2")[0]['src'] # only one of these...
        comments_html = requests.get(comments_iframe_url)
        iframe_soup = BeautifulSoup(comments_html.text)
        posts = iframe_soup.select("#disqus-threadData")
        data = json.loads(posts[0].text)

        return data['response']['thread']['posts']
    except Exception:
        # Fetching comments failed: log the cause and keep the sentinel.
        # A bare `except:` would also have swallowed KeyboardInterrupt.
        logger.exception("fetching disqus comments failed")
        return -9999
Beispiel #20
0
def heroku(api_key, year, month):
    """Save a screenshot of the Heroku invoice page for ``year``/``month``.

    Any previous capture at the target path is removed first. Ghost logs in
    through the Heroku login form (blank email, ``api_key`` as password)
    and then opens the invoice page.

    :param api_key: value entered into the form's ``password`` field
    :param year: invoice year (integer, used in file name and URL)
    :param month: invoice month (integer, used in file name and URL)
    :return: path of the PNG, ``/tmp/heroku_invoice_<year>-<month>.png``
    """
    file_path = '/tmp/heroku_invoice_%d-%d.png' % (year, month)
    if os.path.exists(file_path):
        os.remove(file_path)

    # TODO: make dimensions configurable? Automatic (is that possible?)?
    browser = Ghost(viewport_size=(1000, 1600))
    browser.wait_timeout = 20

    # Log in, then submit the form and wait for the resulting page load.
    browser.open('https://id.heroku.com/login')
    credentials = {"email": "", "password": api_key}
    browser.fill("form", credentials)
    browser.fire_on("form", "submit", expect_loading=True)

    invoice_url = 'https://dashboard.heroku.com/invoices/%s/%s' % (year, month)
    browser.open(invoice_url)

    browser.capture_to(file_path)

    return file_path
Beispiel #21
0
def screenshot(url,creds=False):
    """
    Grab Screenshot of Web Interface

    The capture is written into the ``screenshots`` directory below the
    current working directory. Failures are reported on stdout and never
    raised.

    :param url: address of the page to capture
    :param creds: optional ``"username:password"`` string; when given the
        page is opened with that auth pair
    """
    try:
        ghost = Ghost(wait_timeout=4)
        if creds:
            # Split on the first ':' only so passwords may contain colons.
            # A value without ':' still fails here, as before.
            username, password = creds.split(':', 1)
            ghost.open(url, auth=(username, password))
            shot_name = str(time.time()) + '.png'
        else:
            ghost.open(url)
            shot_name = 'screen' + str(time.time()) + '.png'

        # Capture straight into ./screenshots; `mv *.png ./screenshots`
        # used to move every PNG in the working directory.
        shot_dir = os.path.join(os.getcwd(), 'screenshots')
        if not os.path.isdir(shot_dir):
            os.makedirs(shot_dir)
        shot_path = os.path.join(shot_dir, shot_name)
        ghost.capture_to(shot_path)

        # Name the file that was actually written.
        print(colored("\n[+] Screenshot Succesfull Catpured and Saved @ %s" % (shot_path), 'green'))
    except Exception as e:
        print(colored("\n[-] Screenshot failed Error : " + str(e), 'red'))
Beispiel #22
0
class WxGhost(object):
    """Ghost session wrapper that persists cookies and handles the
    "visiting too frequently" captcha page."""

    def __init__(self):
        """Start a Ghost session, load ``cookie.txt`` if readable, show it."""
        self.ghost = Ghost(log_level=logging.CRITICAL).start()
        self.ghost.download_images = False
        try:
            self.ghost.load_cookies("cookie.txt")
            print('load cookie')
        except IOError:
            print('load cookie error')
        self.ghost.show()

    def handle_frequency(self):
        """Deal with the rate-limit page if the current content shows it.

        The captcha image is captured to ``seccode.png`` and the session
        waits (up to 1800) for the normal page text to appear, then saves
        the cookies to ``cookie.txt``.
        """
        if u"您的访问过于频繁" in self.ghost.content:
            print('frequency')
            self.ghost.show()
            self.ghost.capture_to("seccode.png", selector="#seccodeImage")
            # wait while the captcha is being entered
            self.ghost.wait_for_text(u'以下内容来自微信公众号', timeout=1800)
            self.ghost.save_cookies("cookie.txt")

    def open(self, url):
        """Open ``url``; return ``False`` on ``TimeoutError``, else ``True``."""
        try:
            self.ghost.open(url)
            self.handle_frequency()
        except TimeoutError:
            print('timeout when open')
            return False
        return True

    def evaluate(self, js, expect_loading=True):
        """Evaluate ``js``; return ``False`` on ``TimeoutError``, else ``True``."""
        try:
            self.ghost.evaluate(js, expect_loading=expect_loading)
            self.handle_frequency()
        except TimeoutError:
            return False
        return True

    def sleep(self, value):
        """Delegate to the Ghost session's ``sleep``."""
        self.ghost.sleep(value)

    def get_lxml(self):
        """Return the current page content parsed with ``lxml.html``."""
        return lxml.html.fromstring(self.ghost.content)
Beispiel #23
0
class IDirectBroker(object):
    """Logs into the ICICI Direct trading site through Ghost on creation."""

    IDIRECTURL = 'https://secure.icicidirect.com/Trading/LBS/Logon.asp'

    def __init__(self,username, password, **kwargs):
        """Open the logon page, fill in the form and submit it.

        Screenshots of the three stages are written to ``./l1.png``,
        ``./l2.png`` and ``./l3.png``.

        :param username: value for the ``FML_USR_ID`` form field
        :param password: value for the ``FML_USR_USR_PSSWRD`` form field
        :param kwargs: ``date_of_birth`` may be given for the
            ``FML_USR_DT_BRTH`` field
        """
        self.username = username
        self.password = password
        self.ghost = Ghost()

        self.page, self.resources = self.ghost.open(self.IDIRECTURL)

        self.ghost.wait_for_page_loaded()
        self.ghost.capture_to("./l1.png")

        # Submit the credentials the caller passed in; a fixed literal
        # account used to be sent regardless of the arguments.
        # NOTE(review): the date-of-birth default is the literal the code
        # used before, kept only for backward compatibility. It should come
        # from the caller and be removed from the source.
        result, resources = self.ghost.fill("form", {
            "FML_USR_ID": self.username,
            "FML_USR_USR_PSSWRD": self.password,
            "FML_USR_DT_BRTH": kwargs.get("date_of_birth", "22101982"),
        })
        self.ghost.capture_to("./l2.png")
        self.page, self.resources = self.ghost.fire("form", "submit", expect_loading=True)
        self.ghost.capture_to("./l3.png")
def RunExport():
    """Log into PayPal and save screenshots of the transaction history.

    After logging in with PAYPAL_USERNAME / PAYPAL_PASSWORD the function
    walks the history pages starting at START_AT_PAGE. For every page it
    captures the listing (``filteredhistory<N>.png``) and then opens each
    transaction link and captures its ``#xptContentMain`` element. All
    files go to EXPORT_DIRECTORY and existing files are not overwritten.

    NOTE(review): ``getHistoryListing`` and ``goToPage`` are defined
    elsewhere; presumably they open the history listing and jump to the
    given page -- confirm against their definitions.
    """
    ghost = Ghost(viewport_size=(1200, 2400), display=False, wait_timeout=30, cache_dir=CACHE_DIRECTORY)#, log_level=logging.ERROR

    # Field names seen on the login form:
    #login_password
    #submit.x
    #submit
    page, resources = ghost.open('https://www.paypal.com/ie/cgi-bin/webscr?cmd=_login-run')

    # Fill in and submit the login form, then wait for the next page.
    result, resources = ghost.fill("form[name=login_form]", {
            "login_email": PAYPAL_USERNAME,
            "login_password": PAYPAL_PASSWORD
        })
    page, resources = ghost.fire_on("form[name=login_form]", "submit", expect_loading=True)
    result, resources = ghost.wait_for_page_loaded()
    #wait for 10 seconds
    #time.sleep(10)

    page, resources = ghost.open('https://www.paypal.com/ie/cgi-bin/webscr?cmd=_account')


    # Block until the account page greets the expected user.
    result, resources = ghost.wait_for_text("Welcome, %s" % PAYPAL_NAME)


    getHistoryListing(ghost)

    # Forces at least one pass through the loop below even when no
    # pagination links are found.
    first_run = True
    #get the next url
    #print ghost.evaluate('document.querySelectorAll("#tableWrapperID .pagination:nth-child(1) a.btnLink");')[0]
    # JS snippet whose value is the number of pagination links on the page.
    nav_links_eval = """
                      var links = document.querySelectorAll(".pagination a.btnLink");
                        links.length;
                    """
    # evaluate() results are indexed with [0] throughout to get the JS value.
    nav_links = ghost.evaluate(nav_links_eval)
    page_count = START_AT_PAGE
    transaction_count = 0
    if page_count > 0:
        # presumably 20 transactions are listed per page -- TODO confirm
        transaction_count = page_count * 20

    goToPage(ghost,page_count)

    #transaction_list_url = resources[0].url
    #print transaction_list_url
    # Continue while the current page still has pagination links.
    while nav_links[0] > 0 or first_run==True:
        first_run = False

        page_count = page_count + 1

        # Capture the listing page itself unless it was saved earlier.
        filteredlisting_export = os.path.join(EXPORT_DIRECTORY,'filteredhistory%d.png' % page_count)
        if not os.path.isfile(filteredlisting_export):
            ghost.capture_to(filteredlisting_export, selector="body")

        # Collect the href of every transaction-details link in the table.
        transaction_urls = ghost.evaluate("""
                            var links = document.querySelectorAll("#transactionTable tr.primary td.detailsNoPrint a");
                            var listRet = [];
                            for (var i=0; i<links.length; i++){
                                listRet.push(links[i].href);
                            }
                            listRet;
                            """)


        for transaction_href in transaction_urls[0]:
            transaction_count = transaction_count + 1
            #print urllib.unquote(transaction_href)

            page, resources = ghost.open(urllib.unquote(transaction_href))
            ghost.wait_for_page_loaded()
            payee_name = None
            date_string = None
            # Date: first cell of the third row of #historyMiniLog.
            date = ghost.evaluate("""
                           document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[0].innerHTML;
                        """)
            if date and date[0]:
                date_string = date[0].replace('&nbsp;','')

            # Payee: second cell of the same row, made safe for a file name.
            payee = ghost.evaluate("""
                           document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[1].innerHTML;
                        """)
            if payee and payee[0]:
                payee_name = safeFilename(payee[0].replace('&nbsp;',''))

            if payee_name and date_string:

                # Re-format e.g. 01-Jan-2014 as 2014-01-01 for the file name.
                date_object = datetime.strptime(date_string, '%d-%b-%Y')
                date_string=datetime.strftime(date_object,'%Y-%m-%d')
                print 'page %d transaction %d [%s - %s]' % (page_count, transaction_count, date_string, payee_name)

                purchasedetails_export = os.path.join(EXPORT_DIRECTORY,'%s_%s_%s.png' % (date_string,payee_name,transaction_count ))

                if not os.path.isfile(purchasedetails_export):
                    print '\t\tsaving to %s' % purchasedetails_export
                    ghost.capture_to(purchasedetails_export, selector="#xptContentMain")
                else:
                    print '\t\tAlready saved to %s' % purchasedetails_export

            else:
                # Fallback file name when date or payee could not be read.
                # NOTE(review): the 'saving to' line is printed before the
                # existence check and once more after it, so it also
                # appears when the file already exists.
                purchasedetails_export = os.path.join(EXPORT_DIRECTORY,'no date and payee - page-%d_ transaction %d.png' % (page_count,transaction_count ))
                print '\t\tsaving to %s' % purchasedetails_export
                if not os.path.isfile(purchasedetails_export):
                    ghost.capture_to(purchasedetails_export, selector="#xptContentMain")
                else:
                    print '\t\tAlready saved to %s' % purchasedetails_export

                print 'could not get payee_name and date_string'
                print '\t\tsaving to %s' % purchasedetails_export



        # Return to the listing and move to the page that follows the one
        # just processed, then re-count the pagination links there.
        getHistoryListing(ghost)

        goToPage(ghost,page_count)
        #transaction_list_url = resources[0].url
        nav_links = ghost.evaluate(nav_links_eval)
Beispiel #25
0
 def save_screenshot(self, url, path):
     """Open ``url`` in a fresh Ghost (wait timeout 120) and capture it to ``path``."""
     browser = Ghost(wait_timeout=120)
     _page, _extra = browser.open(url)
     browser.capture_to(path)
def create_png(input, output):
    """Open the address ``input`` in Ghost and capture the page to ``output``.

    :param input: address handed to ``Ghost.open``
    :param output: file the capture is written to
    """
    renderer = Ghost()
    _page, _extra_resources = renderer.open(input)
    renderer.capture_to(output)
import SimpleHTTPServer
import SocketServer
import threading

from ghost import Ghost


def serve_one_request():
    """
    Start up a simple web server serving files in the current directory
    on port 8010, handle 1 request, and shut down again.
    """
    handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 8010), handler)
    try:
        httpd.handle_request()
    finally:
        # Actually shut down: release the listening socket, which was
        # previously left open after the single request.
        httpd.server_close()


# Start webserver in new thread; it is marked as a daemon so the one-shot
# server does not keep the interpreter alive once the script is done.
t = threading.Thread(target=serve_one_request)
t.daemon = True
t.start()

# Use Ghost to open the HTML page through the local server (port 8010,
# the port serve_one_request binds to)
ghost = Ghost()
page, extra_resources = ghost.open("http://127.0.0.1:8010/simple_form.html")

# Capture a screenshot once the page's <button> element exists, i.e. when
# the HTML page is ready; zoom_factor=3.0 is passed through to capture_to
ghost.wait_for_selector('button')
ghost.capture_to('simple_form.png', zoom_factor=3.0)
Beispiel #28
0
def fetch_website(url, user_agent, results_location_dir):
    """function to use for website fetch

    :param url: url to fetch information from
    :param user_agent: user agent string that is used by the minion in making the fetch
    :param results_location_dir: the location to where the results are stored
    :return: results_data - a dictionary of metadata on the fetch

    This method uses a different library than the basic fetch method, Ghost.py (documentation at
    http://ghost-py.readthedocs.io/en/latest/#). After cleaning the url, a session is opened with the user agent string
    passed in. Then the specific web page is opened and all the resources of the web page are collected. After that, a
    screen-shot of the web page is collected. Then, the page data is written to a file that is named from
    the session id. Then each resource gathered during the fetch is written to a file (``resource<index>``), and these
    are placed in the same directory as the page data. Beyond that, miscellaneous metadata is written to the
    results_data dictionary.
    """
    log_debug("fetch_website", "Entering fetch_website")

    # clean the url
    url_clean = url.lstrip()

    log_debug("fetch_website", "Starting Fetch of: " + url_clean)

    # start a Ghost.py session
    session = Ghost().start(user_agent=user_agent)

    results_data = {'requested_url': url,
                    'actual_url': url_clean,
                    'remote_job_id': str(session.id)}
    try:
        # open the web page and gather all the page's resources
        page, resources = session.open(address=url_clean, user_agent=user_agent)

    # catch a TimeoutError
    except (ghost.TimeoutError, ghost.Error):
        results_data['connection_success'] = False
        log_debug("fetch_website", "Connection Failed for Fetch: " + url_clean)
        return results_data

    except Exception as e:
        print(type(e))
        print(str(e))
        return results_data

    # if page is None and there are no resources, that means that a connection to the page failed
    if page is None and len(resources) == 0:
        log_debug("fetch_website", "")
        results_data['connection_success'] = False
        # This branch used to fall off the end and return None.
        return results_data

    netloc = urlparse(url_clean).netloc
    log_debug("fetch_website", "Attempting to capture screenshot of {}".format(netloc))

    try:
        # capture a screen-shot of the web page
        session.capture_to("{}/{}.png".format(results_location_dir, netloc))

        log_debug("fetch_website", "Successful capture of screenshot of {}".format(netloc))

    except Exception as e:
        log_debug("fetch_website", "Failed to capture screenshot of {}".format(netloc))

        print(type(e))
        print(str(e))

    try:
        log_debug("fetch_website", "Opening: {}/{} for: {}".format(results_location_dir, session.id, url_clean))
        # ``with`` closes the file even when writing the content fails
        with open("{}/{}".format(results_location_dir, session.id), 'w') as fetch_file:
            log_debug("fetch_website", "writing page content to file")

            # write page content to file
            fetch_file.write(page.content)

            log_debug("fetch_website", "closing {}".format(session.id))

        # write the data of each resource to different files; enumerate
        # gives every resource its own index (``resources.index`` returned
        # the first match, so equal resources overwrote one file)
        for index, resource in enumerate(resources):
            log_debug("fetch_website", "opening {}/resource{} for: {}".format(results_location_dir,
                                                                              index,
                                                                              url_clean))
            with open("{}/resource{}".format(results_location_dir, index), "w") as data_file:
                log_debug("fetch_website", "writing content to {}".format(index))
                data_file.write(resource.content)

                log_debug("fetch_website", "closing {}".format(index))

        results_data['fetch_object_success'] = True

    except Exception:
        results_data['fetch_object_success'] = False

    # collect more metadata
    results_data['connection_success'] = True
    if page is not None:
        # page can be None while resources exist; skip page metadata then
        # instead of raising AttributeError
        results_data['server_info'] = dict(page.headers)
        results_data['response_code'] = page.http_status

        if page.http_status in [400, 404, 403, 401]:
            results_data["fetch_success"] = False

    if len(session.cookies) > 0:
        results_data['cookies'] = [x.value().data() for x in session.cookies]

    return results_data
Usage:
python django_admin_login.py <username> <password>
(where the username and password are entered into the corresponding boxes)
"""
import sys

from ghost import Ghost
import Image

# Both credentials are mandatory positional arguments; exit with a usage
# message instead of an IndexError traceback when either one is missing.
if len(sys.argv) < 3:
    sys.stderr.write(
        "Usage: python django_admin_login.py <username> <password>\n")
    sys.exit(1)

username = sys.argv[1]
password = sys.argv[2]
django_admin_url = "http://127.0.0.1:8000/admin/"
image_file = "django_admin_login.png"

ghost = Ghost()
page, extra_resources = ghost.open(django_admin_url)

# Wait until both login inputs are present before filling in the form.
ghost.wait_for_selector('#id_username')
ghost.wait_for_selector('#id_password')
ghost.fill("#login-form", {'username': username, 'password': password})

# The form is only filled in, not submitted: the capture shows the login
# page with the credentials entered into the boxes.
ghost.capture_to(image_file, zoom_factor=3.0)
# Report the file that was actually written rather than a hard-coded name.
print("Captured %s" % image_file)

# Crop bad space at the top due to the odd page layout
im = Image.open(image_file)
box = (0, 300, 1014, 1062)
region = im.crop(box)
# Overwrite the full capture with the cropped region.
region.save(image_file)
Beispiel #30
0
from ghost import Ghost

# Search eBay for "plane" through the home page form and save a capture of
# the page that follows.
url = "http://www.ebay.com/"
gh = Ghost()

# Load the main page of eBay
page, resources = gh.open(url, wait_onload_event=True)

# Fill in the main search bar and click on the search button
gh.set_field_value("#gh-ac", "plane")
gh.click("#gh-btn")

# Wait for the next page: block until the "#e1-15" element exists
gh.wait_for_selector("#e1-15")

# Save an image of the current screen
gh.capture_to("plane.png")
Beispiel #31
0
import secrets
import time
from ghost import Ghost

# Log in to the EcoFactor mobile site and capture the page that follows.
ghost = Ghost()

page, resources = ghost.open('https://my.ecofactor.com/mobile/login.html')

# Credentials are read from the `secrets` module imported by this script.
# NOTE(review): presumably a project-local secrets.py that defines
# `username` and `password` -- confirm.
result, resources = ghost.fill('form', {
    'j_username': secrets.username,
    'j_password': secrets.password
})

# Submit by firing the form's "submit" event; the variant that passes
# expect_loading=True is kept below, disabled.
page, resources = ghost.fire_on('form', 'submit')
#page, resources = ghost.fire_on('form', 'submit', expect_loading=True)

# Wait explicitly for the page load triggered by the submit.
page, resources = ghost.wait_for_page_loaded()

ghost.capture_to("/tmp/foo.png")


# Print the HTTP status of the page returned by the wait above.
print page.http_status
#print resources

#result, resources = ghost.evaluate("document.getElementById('apDiv2').style.display = 'none';")


#result, resources = ghost.evaluate(
#    "$(document).ready(function() { var target = $('#apDiv2').show(); target = target.add(target.parentsUntil('body')).siblings().hide(); }); ")

#ghost.evaluate("window.document.getElementById('apDiv2').style.visibility = 'hidden'; ")
#ghost.evaluate("window.document.getElementById('apDiv2').style.visibility = 'hidden'; ")
#ghost.evaluate("$('div:not(#myDiv)').show();")
#ghost.evaluate("v$('div:not(#myDiv)').show();")


#### ghost.evaluate("window.scroll(0, 2000);")



#result, resources = ghost.evaluate(
#    "alert('Okk');")

#result, resources = ghost.wait_for_alert()

#$(document).ready(function() {
#      alert("document ready occurred!");
#});

# NOTE(review): `imageName` is not assigned anywhere in the visible code;
# confirm it is defined before this line runs, otherwise this raises
# NameError.
ghost.capture_to(imageName+'.png')
#ghost.capture_to('header.png')
# assert page.http_status==200 and 'jeanphix' in ghost.content
def take_screenshot_of_url(url, target_filename):
    """
    Open ``url`` in a new Ghost instance and capture it to a file.

    :param url: address passed to ``Ghost.open``
    :param target_filename: path passed to ``Ghost.capture_to``
    :return: None
    """
    browser = Ghost()
    # The page/resources pair returned by open() is not used afterwards.
    _page, _resources = browser.open(url)
    browser.capture_to(target_filename)
Beispiel #34
0
Usage:
python django_admin_login.py <username> <password>
(where the username and password are entered into the corresponding boxes)
"""
import sys

from ghost import Ghost
import Image

# Both credentials are mandatory positional arguments; exit with a usage
# message instead of an IndexError traceback when either one is missing.
if len(sys.argv) < 3:
    sys.stderr.write(
        "Usage: python django_admin_login.py <username> <password>\n")
    sys.exit(1)

username = sys.argv[1]
password = sys.argv[2]
django_admin_url = "http://127.0.0.1:8000/admin/"
image_file = "django_admin_login.png"

ghost = Ghost()
page, extra_resources = ghost.open(django_admin_url)

# Wait until both login inputs are present before filling in the form.
ghost.wait_for_selector('#id_username')
ghost.wait_for_selector('#id_password')
ghost.fill("#login-form", {'username': username, 'password': password})

# The form is only filled in, not submitted: the capture shows the login
# page with the credentials entered into the boxes.
ghost.capture_to(image_file, zoom_factor=3.0)
# Report the file that was actually written rather than a hard-coded name.
print("Captured %s" % image_file)

# Crop bad space at the top due to the odd page layout
im = Image.open(image_file)
box = (0, 300, 1014, 1062)
region = im.crop(box)
# Overwrite the full capture with the cropped region.
region.save(image_file)
Beispiel #35
0
def main():
    """Open http://google.com and capture its ``body`` element to header.png."""
    browser = Ghost()
    # The page/resources pair returned by open() is not used afterwards.
    _page, _resources = browser.open('http://google.com')
    browser.capture_to('header.png', selector="body")
def create_png(input, output):
    """
    Open ``input`` with a new Ghost instance and capture it to ``output``.

    :param input: address passed to ``Ghost.open``
    :param output: destination path passed to ``Ghost.capture_to``
    :return: None
    """
    browser = Ghost()
    # The page/resources pair returned by open() is not used afterwards.
    _page, _extras = browser.open(input)
    browser.capture_to(output)
def _capture_if_missing(ghost, export_path, selector):
    """Capture ``selector`` to ``export_path`` unless that file already exists."""
    if os.path.isfile(export_path):
        print('\t\tAlready saved to %s' % export_path)
    else:
        print('\t\tsaving to %s' % export_path)
        ghost.capture_to(export_path, selector=selector)


def RunExport():
    """
    Log in to PayPal and save screenshots of the transaction history.

    Steps, in order:

    1. Open the login page, fill ``login_form`` with PAYPAL_USERNAME and
       PAYPAL_PASSWORD and submit it.
    2. Open the account page and wait for the "Welcome, <PAYPAL_NAME>" text.
    3. Call getHistoryListing() and goToPage() to reach the listing page
       numbered START_AT_PAGE.
    4. For every listing page: capture the page body to
       ``filteredhistory<N>.png``, then open each transaction link found in
       ``#transactionTable`` and capture ``#xptContentMain`` to a file named
       after the transaction date, payee and running transaction number.

    All files are written to EXPORT_DIRECTORY; captures whose target file
    already exists are skipped.

    :return: None
    """
    ghost = Ghost(viewport_size=(1200, 2400),
                  display=False,
                  wait_timeout=30,
                  cache_dir=CACHE_DIRECTORY)  # , log_level=logging.ERROR

    # Other fields noted on the login form: login_password, submit.x, submit
    ghost.open('https://www.paypal.com/ie/cgi-bin/webscr?cmd=_login-run')

    ghost.fill("form[name=login_form]", {
        "login_email": PAYPAL_USERNAME,
        "login_password": PAYPAL_PASSWORD
    })
    ghost.fire_on("form[name=login_form]", "submit", expect_loading=True)
    ghost.wait_for_page_loaded()

    ghost.open('https://www.paypal.com/ie/cgi-bin/webscr?cmd=_account')

    # The welcome text is what confirms that the login went through.
    ghost.wait_for_text("Welcome, %s" % PAYPAL_NAME)

    getHistoryListing(ghost)

    # Evaluates to the number of pagination links on the current page.
    nav_links_eval = """
                      var links = document.querySelectorAll(".pagination a.btnLink");
                        links.length;
                    """
    nav_links = ghost.evaluate(nav_links_eval)
    page_count = START_AT_PAGE
    transaction_count = 0
    if page_count > 0:
        # Keep transaction numbering continuous when resuming part-way.
        # NOTE(review): assumes 20 transactions per listing page -- confirm.
        transaction_count = page_count * 20

    goToPage(ghost, page_count)

    # The first page is always processed, even without pagination links.
    first_run = True
    while nav_links[0] > 0 or first_run:
        first_run = False

        page_count = page_count + 1

        filteredlisting_export = os.path.join(
            EXPORT_DIRECTORY, 'filteredhistory%d.png' % page_count)
        if not os.path.isfile(filteredlisting_export):
            ghost.capture_to(filteredlisting_export, selector="body")

        transaction_urls = ghost.evaluate("""
                            var links = document.querySelectorAll("#transactionTable tr.primary td.detailsNoPrint a");
                            var listRet = [];
                            for (var i=0; i<links.length; i++){
                                listRet.push(links[i].href);
                            }
                            listRet;
                            """)

        # Treat a missing evaluation result as "no transactions on this page"
        # instead of failing on iteration over None.
        for transaction_href in transaction_urls[0] or []:
            transaction_count = transaction_count + 1

            ghost.open(urllib.unquote(transaction_href))
            ghost.wait_for_page_loaded()
            payee_name = None
            date_string = None

            # Date and payee are read from the first two cells of the third
            # row of the #historyMiniLog table.
            date = ghost.evaluate("""
                           document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[0].innerHTML;
                        """)
            if date and date[0]:
                date_string = date[0].replace('&nbsp;', '')

            payee = ghost.evaluate("""
                           document.querySelectorAll("#historyMiniLog tbody tr")[2].querySelectorAll('td')[1].innerHTML;
                        """)
            if payee and payee[0]:
                payee_name = safeFilename(payee[0].replace('&nbsp;', ''))

            if payee_name and date_string:
                # Re-format the date as YYYY-MM-DD for the file name.
                date_object = datetime.strptime(date_string, '%d-%b-%Y')
                date_string = date_object.strftime('%Y-%m-%d')
                print('page %d transaction %d [%s - %s]' % (
                    page_count, transaction_count, date_string, payee_name))

                purchasedetails_export = os.path.join(
                    EXPORT_DIRECTORY, '%s_%s_%s.png' %
                    (date_string, payee_name, transaction_count))
            else:
                # Fall back to a name built from the page/transaction numbers.
                print('could not get payee_name and date_string')
                purchasedetails_export = os.path.join(
                    EXPORT_DIRECTORY,
                    'no date and payee - page-%d_ transaction %d.png' %
                    (page_count, transaction_count))

            # Report once whether the capture is written or skipped.
            _capture_if_missing(ghost, purchasedetails_export,
                                "#xptContentMain")

        # Go back to the history listing and move on to the next page.
        getHistoryListing(ghost)

        goToPage(ghost, page_count)
        nav_links = ghost.evaluate(nav_links_eval)
import SimpleHTTPServer
import SocketServer
import threading

from ghost import Ghost


def serve_one_request():
    """
    Start up a simple web server serving files in the current directory,
    handle 1 request, and shut down again.

    The server binds to port 8010 on all interfaces.
    """
    handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 8010), handler)
    try:
        httpd.handle_request()
    finally:
        # Release the listening socket even if handling the request fails;
        # otherwise port 8010 stays bound until the process exits.
        httpd.server_close()


# Start the web server in a new daemon thread, so it does not keep the
# process alive once the main script has finished.
# NOTE(review): nothing waits for the server to be listening before Ghost
# connects below -- confirm this start-up race is acceptable.
t = threading.Thread(target=serve_one_request)
t.daemon = True
t.start()

# Use Ghost to open the HTML page served by the thread above
ghost = Ghost()
page, extra_resources = ghost.open("http://127.0.0.1:8010/simple_form.html")

# Capture a screenshot once a "button" element exists on the page
ghost.wait_for_selector("button")
ghost.capture_to("simple_form.png", zoom_factor=3.0)
Beispiel #39
0
from pyvirtualdisplay import Display
from ghost import Ghost
from PIL import Image

# Start a non-visible 320x240 display before creating the Ghost instance.
display = Display(visible=0, size=(320,240))
display.start()

ghost = Ghost()
ghost.open('http://bbc.uk.com')

# The picture name must end with a file type extension
picture = 'imgname.jpg'


# Capture only the page's "body" element to the picture file.
ghost.capture_to(picture, selector='body')

# Open the captured file with PIL; `img` is not used further in the
# visible code.
img = Image.open(picture)

# Shut the display down again.
display.stop()