Example #1
    def process_request(self, request, spider):
        # Render with WebKit only for spiders listed in WEBKIT_DOWNLOADER;
        # other spiders fall through to Scrapy's default downloader.
        if spider.name in WEBKIT_DOWNLOADER:
            gh = Ghost()
            se = Session(gh, download_images=False)
            se.open(request.url)
            # evaluate() returns (result, resources); the rendered HTML is the result
            result, resources = se.evaluate(
                'document.documentElement.innerHTML')
            spider.webkit_se = se
            renderedBody = result.encode('utf8')
            return HtmlResponse(request.url, body=renderedBody)
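
On its own the method above is only a fragment; the sketch below fills in the surrounding downloader middleware with the imports it needs, assuming WEBKIT_DOWNLOADER is a collection of spider names (the class name and spider name are illustrative, not from the source):

from ghost import Ghost, Session
from scrapy.http import HtmlResponse

WEBKIT_DOWNLOADER = ['js_spider']  # illustrative: spiders that need WebKit rendering


class WebkitDownloaderMiddleware(object):
    def process_request(self, request, spider):
        if spider.name in WEBKIT_DOWNLOADER:
            se = Session(Ghost(), download_images=False)
            se.open(request.url)
            result, resources = se.evaluate('document.documentElement.innerHTML')
            spider.webkit_se = se  # keep the session around for later interaction
            return HtmlResponse(request.url, body=result.encode('utf8'))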
Example #2
def run3():
    gh = Ghost()
    ss = Session(gh, display=True)

    count = 0
    location = 0
    ss.open('https://edition.cnn.com/election/2016/results/exit-polls/arizona/president')
    ss.wait_for_page_loaded()  # wait_timeout is a constructor setting, not a method

    html3 = ss.content.encode('utf-8')
    pattern = re.compile(r'<td class="exit-poll__cell">', re.M)
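    # The snippet breaks off here; a hypothetical continuation (an assumption, not
    # from the source) would apply the compiled pattern to the rendered markup and
    # fill in the count / location variables initialised above.
    for match in pattern.finditer(ss.content):
        count += 1
        location = match.start()
    print('cells: %s, last offset: %s' % (count, location))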
Example #3
def round_trip(DepartCity, ReturnCity, departDate, returnDate, debug=0):
    #global se
    start_time = datetime.now()
    url = 'http://flights.ctrip.com/international/round-%s-%s-%s-%s?%s&%s&y_s' % (
        DepartCity, ReturnCity, code(DepartCity), code(ReturnCity), departDate,
        returnDate)
    #print(url)
    ctrip_access = False
    while not ctrip_access:
        se = Session(Ghost(),
                     wait_timeout=30,
                     wait_callback=None,
                     display=True,
                     viewport_size=(800, 680),
                     download_images=False)
        se.delete_cookies()
        proxy = choice(proxypool)
        se.set_proxy(proxy[0], proxy[1], int(proxy[2]))
        try:
            se.open(url, user_agent=choice(ua_list))
            #print('Opened %s' % url)
        except Exception:
            se.exit()
            del se
            proxypool.remove(proxy)
            print("blacklist %s" % proxy[1])
            continue
        ctrip_access = se.exists('li:nth-child(5) > span')
        if not ctrip_access:
            se.exit()
            del se
            proxypool.remove(proxy)
            print("blacklist %s" % proxy[1])
    se.click('#sortControls > ul > li:nth-child(5) > span')
    if se.exists('i.icon-reverse'):
        se.click('#sortControls > ul > li:nth-child(5) > span')
    se.wait_while_selector('#FI_progBar', timeout=20)
    #print('Loading finished!')
    se.sleep(0.2)
    html = se.content
    soup = BeautifulSoup(html, "html.parser")
    source = soup.select('#flightList > div')
    if debug == 1:
        return source
    lowest = source[0].select('span.price2')[0].text
    end_time = datetime.now()
    elapsed = (end_time - start_time).seconds
    print('%s-%s round trip, depart %s, return %s, lowest price %s, search took %ss' %
          (DepartCity, ReturnCity, departDate, returnDate, lowest, elapsed))
    se.exit()
    del se
    price = lowest[1:]
    insert_price(DepartCity, ReturnCity, departDate, returnDate, price)
Example #4
def login_qq():
    global se
    ua_m = 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_1 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 Mobile/15B150 Safari/604.1'
    se = Session(Ghost(),
                 user_agent=ua_m,
                 wait_timeout=30,
                 wait_callback=None,
                 display=True,
                 viewport_size=(375, 553),
                 download_images=True)
    url = 'https://ui.ptlogin2.qq.com/cgi-bin/login?style=38&appid=728041403&s_url=https%3A%2F%2Finfoapp.3g.qq.com%2Fg%2Flogin%2Fproxy.jsp%3FsourceUrl%3Dhttps%25253A%25252F%25252Fportal.3g.qq.com%25252F%25253F_r%25253D0.2646472700205946%252526aid%25253Dindex%252526g_f%25253D1283&target=self&low_login=1&low_login_hour=4321&daid=261&islogin=false&uid=-8794356048489038000'
    se.open(url)
    se.set_field_value('#u', '2873723285')
    se.set_field_value('#p', 'tz1006')
    se.click('#go', expect_loading=True)
Example #5
class YoukuGhostDriver(object):
    def __init__(self, host, port, timeout):
        #url = 'http://111.161.35.198:12210/youku_ghost.html'
        url = 'http://%s:%s/youku_ghost.html' % (host, port)
        self.ghost = Ghost()
        self.session = Session(self.ghost,
                               wait_timeout=timeout,
                               plugins_enabled=True)
        self.session.open(url)

    def parse(self, vid):
        try:
            res = []
            self.session.evaluate('window.getPlayUrl("%s")' % vid)
            success, resources = self.session.wait_for_selector('div[id="ck"]')
            if success:
                ck = self.session.evaluate(
                    'document.getElementById("ck").innerHTML')
                res = ck[0]

        except Exception:
            # log the full traceback; res stays empty on failure
            log.app_log.error(traceback.format_exc())

        finally:
            return res
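
A brief usage sketch, assuming the youku_ghost.html helper page exposes window.getPlayUrl and writes its result into the element with id "ck"; the host, port, and video id below are placeholders, not values from the source:

driver = YoukuGhostDriver('127.0.0.1', 12210, timeout=30)
play_url = driver.parse('XMTIzNDU2Nzg5Ng==')  # placeholder vid
print(play_url)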
Example #6
#!/usr/bin/env python
#coding:utf-8

from ghost import Ghost, Session
import time
gh = Ghost()
se = Session(gh, display=True)
se.open("https://www.baidu.com/")
time.sleep(10)
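
# A hedged extension (not in the source): save a screenshot of the rendered page,
# using the same capture_to call that appears in the later examples.
se.capture_to("baidu.png")  # illustrative filename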

Example #7
def login(username, password):
    url = 'https://trade.cgws.com/cgi-bin/user/Login'
    se.open(url)
    # username
    se.set_field_value('#fundAccount', username)
    # the password is typed on an on-screen keypad, so focus the field first
    se.fire('#normalpassword', 'focus')
    se.show()
    html = se.content
    soup = BeautifulSoup(html, "html.parser")
    keys = soup.select('tbody > tr > td')
    key_list = []
    for key in keys:
        key_list.append(key.text)

    # click the keypad cell (row m, column n) for each password character
    for i in password:
        m = (key_list.index(i) // 4) + 1
        n = (key_list.index(i) % 4) + 1
        se.click('tbody > tr:nth-child(%s) > td:nth-child(%s)' % (m, n))
    # vcode: capture the captcha image, OCR it, and submit the form
    se.capture_to('s/vcode.png', selector='#ticketImg')
    image = Image.open('s/vcode.png')
    vcode = pytesseract.image_to_string(image)
    se.set_field_value('#ticket', vcode)
    se.sleep(0.1)
    se.click('#submit', expect_loading=True)


login(username, password)
Example #8
class GhostMiddleware(object):
    def __init__(self):
        ua = random.choice(self.user_agent_list)  # pick a random User-Agent
        self.ghost = Ghost()
        self.se = Session(self.ghost,
                          user_agent=ua,
                          display=False,
                          wait_timeout=60,
                          download_images=False)
        super(GhostMiddleware, self).__init__()

    # Fetch dynamic pages through Ghost instead of Scrapy's downloader
    def process_request(self, request, spider):
        # self.se.set_proxy(type_='https', host='127.0.0.1', port=1083)  # adjust type_ to match the target URL
        self.se.open(request.url)

        print("Visiting: {0}".format(request.url))

        # Return the rendered response directly to the spider instead of passing it on to the downloader
        return HtmlResponse(url=request.url,
                            body=self.se.content,
                            encoding="utf-8",
                            request=request)

    def __del__(self):
        self.ghost.exit()

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
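
To route requests through this middleware it still has to be enabled in the Scrapy project settings; a minimal sketch, assuming the class lives in a module such as myproject.middlewares (the module path and priority value are illustrative):

# settings.py (illustrative module path and priority)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.GhostMiddleware': 543,
}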
Example #9
#-*- coding:utf-8 -*-
from ghost import Ghost
from ghost import Session
import time
gh = Ghost()
session = Session(gh)
while True:
    try:
        page, resource = session.open("http://abcabc.gq")
        session.wait_for_page_loaded(10000)
    except Exception:
        pass
    #print session.content
    try:
        page, resource = session.open("http://abcabc.gq/test.php")
        session.wait_for_page_loaded(10000)
    except Exception:
        pass
    #print session.content
    try:
        page, resource = session.open("http://mxqabc.gq")
        session.wait_for_page_loaded(10000)
    except Exception:
        pass
    #print session.content
    try:
        page, resource = session.open("http://mxqabc.gq/test.php")
        session.wait_for_page_loaded(10000)
    except Exception:
        pass
    #print session.content
Example #10
item_url = 'http://www.supremenewyork.com/shop/accessories/oi6nqp83m/hsyw4g52m'
checkout_url = 'https://www.supremenewyork.com/checkout'
##############################
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
header = {'User-Agent': ua}
gh = Ghost()
se = Session(gh,
             user_agent=ua,
             wait_timeout=20,
             wait_callback=None,
             display=True,
             viewport_size=(1080, 1680),
             download_images=True)
##############################

se.open(item_url)
se.evaluate("""document.querySelector('input[name="commit"]').click();""")
se.sleep(0.5)
se.open(checkout_url)

ISOFORMAT = '%Y%m%d'
today = datetime.today()
filename = today.strftime(ISOFORMAT)
with open('supreme/' + filename + '.html', 'w') as f:
    f.write(se.content)

import code
code.interact(banner="", local=locals())
Example #11
#coding=utf-8
from ghost import Ghost,Session
import urllib

ghost = Ghost()

#url = "http://index.baidu.com/?tpl=trend&word=%B1%E4%D0%CE%BD%F0%B8%D5"
url = "http://piaofang.maoyan.com/movie/246083?_v_=yes"

###### urllib ######
#def getHtml(url):
#    page = urllib.urlopen(url)
#    html = page.read()
#    return html
#
#html = getHtml(url)
#print html

#print page
#print "---" * 30
#print extra_resources

###### Ghost.py ######
with ghost.start():
    session = Session(ghost)
    session.wait_timeout = 999
    page,resource = session.open(url)
    print session.content
    print page.headers, page.url, page.http_status

Example #12
from ghost import Ghost, Session

ghost = Ghost()
USERAGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0"

with ghost.start():
    session = Session(ghost, download_images=False, display=True, user_agent=USERAGENT, viewport_size=(800, 600))
    page, rs = session.open("https://m.facebook.com/login.php", timeout=120)
    assert page.http_status == 200

    session.evaluate("""
    document.querySelector('input[name="email"]').value = '*****@*****.**'
    document.querySelector('input[name="pass"]').value = 'wikipedia150101facebook';
    """)

    session.evaluate("""document.querySelector('input[name="login"]').click();""",
                     expect_loading=True)

    """
    import codecs

    with codecs.open('fb.html', encoding='utf-8', mode='w') as f:
       f.write(session.content)
    """

    # session.save_cookies('fbookie')
    session.capture_to(path='fbookie.png')

    # gracefully clean up to avoid shutdown errors
    session.webview.setHtml('')
    session.exit()
Example #13
def save_story(series_name, lower_bound, upper_bound, content):
    # build the chapter filename from the series name and chapter range
    file_name = './chap/{0}_{1}-{2}.txt'.format(series_name, lower_bound,
                                                upper_bound)
    content = ''.join([i if ord(i) < 128 else ' ' for i in content])
    print(file_name)
    with open(file_name, 'wt', encoding='utf-8') as file:
        file.write(content)


searching = True
story_buffer = ''

next_url = initial_url

while searching:
    try:
        session.open(next_url, timeout=300)
        lower_bound += 1
        session.wait_for_selector(next_button, 60)
    except Exception:
        # stop when the next page cannot be opened or the next button never appears
        break

    story_data = session.evaluate(
        'document.querySelector("{0}").innerText;'.format(wrapping_div))
    text = story_data[0]
    story_buffer += str(text)

    if lower_bound > upper_bound:
        save_story(series_name, lower_bound - chunks, upper_bound,
                   story_buffer)
        story_buffer = ''
        lower_bound = upper_bound