Example 1
def crawlerMain(start, over, IPaddress, path, path1, goubanjiaP, max_page):

    # Configure the proxy
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = IPaddress
    # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    # NOTE: "driver" is assumed to be a PhantomJS instance created at module level
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)

    print("Pass " + str(start) + " started: " +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    # Start the crawl loop
    for i in range(start, over):

        print("Loop iteration:", i)
        # Target URL
        url = "http://hd.chinatax.gov.cn/guoshui/action/GetArticleView1.do?id=" + str(
            i) + "&flag=1"
        changeIP = saveWebSitePage(driver, url, i)
        while changeIP:
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            # Implicit wait (seconds); tune as needed
            driver.implicitly_wait(1)
            # Declare the global IP pool
            global IPList
            # Take the next proxy IP if any are left
            if len(IPList):
                # the list is not empty
                IPaddress = IPList[0]
                del IPList[0]
            else:
                # the list is empty: refill it
                IPList.extend(getIPList(path, path1, goubanjiaP, max_page))
                # no IPs found; retry the fetch
                if not len(IPList):
                    continue
                IPaddress = IPList[0]
                del IPList[0]
            # Switch to the new proxy IP
            proxy.http_proxy = IPaddress
            print(str(IPaddress) + " retrying " + url)
            # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
            proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
            changeIP = saveWebSitePage(driver, url, i)
    print("Pass " + str(start) + " finished: " +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
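
Nearly every snippet on this page repeats the same PhantomJS idiom: build a webdriver.Proxy, write it into a capability dict with add_to_capabilities, then call start_session so the proxy takes effect. A minimal self-contained sketch of that idiom, assuming Selenium 2/3 with a PhantomJS binary on the PATH and a placeholder proxy address:

from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

PROXY = "127.0.0.1:8080"  # placeholder; substitute a live proxy

driver = webdriver.PhantomJS()
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = PROXY
caps = webdriver.DesiredCapabilities.PHANTOMJS.copy()
proxy.add_to_capabilities(caps)
# Reopen the session so the new proxy takes effect
driver.start_session(caps)
driver.get("http://httpbin.org/ip")  # the response should echo the proxy address
print(driver.page_source)
driver.quit()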
Example 2
	def set_proxy(self):
		# Route both HTTP and SOCKS traffic through a local proxy on port 1080 (the default SOCKS port)
		proxy = webdriver.Proxy()
		proxy.proxy_type = ProxyType.MANUAL
		proxy.http_proxy = "127.0.0.1:1080"
		proxy.socks_proxy = "127.0.0.1:1080"
		# Note: this stores (and mutates) the shared CHROME capability dict rather than a copy
		self.capabilities = webdriver.DesiredCapabilities.CHROME
		proxy.add_to_capabilities(self.capabilities)
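
set_proxy above only prepares self.capabilities; a plausible follow-up (the attribute name self.driver is an assumption) would hand that dict to the Chrome constructor:

		self.set_proxy()
		self.driver = webdriver.Chrome(desired_capabilities=self.capabilities)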
Example 3
def get_web(ip_list, url_str):
    if ip_list != []:
        for ip_one in ip_list:
            # Visit the page through this proxy
            driver = webdriver.PhantomJS()
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            proxy.http_proxy = ip_one
            proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
            try:
                driver.get(url_str)
            except Exception as e:
                print 'bad ip: ' + str(e)
                driver.quit()
                continue
            # Scroll down in small random steps to mimic a human reader
            web_scrolltop_number = 0
            for auto_read_times in xrange(0, 5):
                web_scrolltop_number += random.uniform(300, 600)
                time.sleep(random.uniform(2, 3))
                js = "var q=document.documentElement.scrollTop=" + str(
                    web_scrolltop_number)
                driver.execute_script(js)
            time.sleep(random.uniform(2, 4))
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            time.sleep(random.uniform(2, 4))
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)
            time.sleep(random.uniform(3, 4))
            driver.quit()
            print "success!"
    else:
        print 'ip_list is null!'
Example 4
    def set_proxy(self):
        from selenium.webdriver.common.proxy import ProxyType
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

        # Fetch candidate proxies and check which are alive
        pObj = ProxyIPCheck()
        proxyinfo = pObj.proxyIPGet_XiciDaili()
        if len(proxyinfo) < 1:
            print("proxyIPGet failed!!")
            # fall back to a hard-coded pair of proxies
            myproxies = ["220.189.249.80:80", "124.248.32.43:80"]
        else:
            myproxies = pObj.validIPGet(proxyinfo)

        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
            ]
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        desired_capabilities["phantomjs.page.settings.userAgent"] = (random.choice(user_agents))
        desired_capabilities["phantomjs.page.settings.loadImages"] = False
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        proxy.http_proxy = random.choice(myproxies)
        print("Proxy IP: ", proxy.http_proxy)
        proxy.add_to_capabilities(desired_capabilities)
        return desired_capabilities
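
set_proxy here returns the capability dict rather than a driver; a plausible call site, assuming a PhantomJS binary at ./phantomjs:

        caps = self.set_proxy()
        driver = webdriver.PhantomJS(executable_path='./phantomjs',
                                     desired_capabilities=caps)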
Example 5
 def crawlData(self, url):
     # Configure PhantomJS
     desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
     desired_capabilities["phantomjs.page.settings.userAgent"] = (
         config.get_header())
     # Skipping image loading makes page crawls much faster
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     # Reopening a session with new DesiredCapabilities (proxy settings) is
     # roughly equivalent to clearing the browser cache and revisiting the
     # url through the proxy
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     # proxy.http_proxy = random.choice(ips)
     # proxy.add_to_capabilities(desired_capabilities)
     # Open a PhantomJS browser with the prepared configuration
     # driver = webdriver.PhantomJS(executable_path=phantomjs_driver,desired_capabilities=desired_capabilities)
     driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
     driver.start_session(desired_capabilities)
     # Implicit wait (seconds); tune as needed
     driver.implicitly_wait(5)
     # Page-load timeout, similar to requests.get()'s timeout option;
     # driver.get() has no timeout option of its own and can otherwise hang
     # forever without raising, so this avoids the stall.
     driver.set_page_load_timeout(20)
     # Script timeout
     driver.set_script_timeout(20)
     #browser = webdriver.Chrome('/home/caidong/developProgram/selenium/chromedriver')
     driver.get(url)
     driver.implicitly_wait(1)
     driver.find_element_by_xpath(
         '//div[@class="house-chat-phone"]').click()
     html = driver.page_source
     return html
Example 6
def getdaxiangdailiIP():
    driver = webdriver.PhantomJS(
        executable_path=r'C:\Users\wangquan\phantomjs\bin\phantomjs.exe')
    # API endpoint that returns one http proxy per request
    daxiangurl = "http://tvp.daxiangdaili.com/ip/?tid=556249540865397&num=1&protocol=http"
    # Set the page-load timeout
    driver.set_page_load_timeout(5)
    # Restore a direct (non-proxied) connection
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.DIRECT
    # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    time.sleep(1.5)
    return_ip = ""
    while True:
        driver.get(daxiangurl)
        return_ip = filter_tags(driver.page_source)
        if ip_exist(return_ip):
            print("Got a proxy IP, starting the crawl: " + return_ip)
            break
        else:
            print("Failed to get a proxy IP, retrying")
        time.sleep(1.5)
    # Return the proxy IP address
    return return_ip
Example 7
 def send_mail(self, data, proxie):
     username = data[1]
     email = data[2].split('@')[0] + '@027168.com'
     password = data[3]
     # Create a browser (WebKit engine)
     browser1 = webdriver.PhantomJS(
         executable_path=r"/usr/local/bin/phantomjs", )
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     proxy.http_proxy = proxie
     # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
     proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
     browser1.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
     browser1.get('https://pngtree.com/')
     # Click the register button
     browser1.find_element_by_id('base-public-register-button').click()
     # print browser.page_source
     browser1.find_element_by_id(
         'base-public-login-username-regiser-text').send_keys(username)
     browser1.find_element_by_id(
         'base-public-login-email-regiser-text').send_keys(email)
     browser1.find_element_by_id(
         'base-public-login-password-regiser-text').send_keys(password)
     browser1.find_element_by_id('base-sub-regist-Btn').click()
     time.sleep(10)
     browser1.save_screenshot('regist.png')
     browser1.quit()
Example 8
def goubanjiaIP(Pagesize, IPaddress):
    # Free proxy list pages on goubanjia.com
    goubanjia = ['http://www.goubanjia.com/free/gngn/', 'http://www.goubanjia.com/free/gnpt/']
    print("Fetching free proxy IPs from goubanjia.com")

    # Drive the pages with chromedriver
    chromedriver = r"C:\Users\wangquan\chromedriver\chromedriver.exe"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver1 = webdriver.Chrome(chromedriver)
    # Pool for the parsed ip:port strings
    p_pool = []
    # Shrink the window and move it off-screen
    driver1.set_window_size(0, 0)
    driver1.set_window_position(-200, -200)
    # Configure the proxy
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    # Proxy ip address
    proxy.http_proxy = IPaddress
    # driver1 is Chrome, so add the proxy to Chrome capabilities
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    proxy.add_to_capabilities(capabilities)
    driver1.start_session(capabilities)
    for urlItem in goubanjia:
        Haserror = 1
        opt = 1
        while opt <= Pagesize:
            # Page URL
            url = urlItem + "index" + str(opt) + ".shtml"
            try:
                # Page-load timeout for the crawl
                driver1.set_page_load_timeout(5)
                # Visit the target page
                driver1.get(url)
                time.sleep(0.5)
                # Grab the page source
                pageSearch = driver1.page_source
                # Parse the page
                bobj_2 = BeautifulSoup(pageSearch, "lxml")
                # Pick out the ip table cells
                sibs = bobj_2.findAll("td", {"class": "ip"})
                for child in sibs:
                    # Strip html tags and check the text looks like ip:port
                    if ('.' in filter_tags(replicFinsh(str(child), 'none'))) and (':' in filter_tags(replicFinsh(str(child), 'none'))):
                        # Add the address to the pool
                        p_pool.append(filter_tags(replicFinsh(str(child), 'none')))
                        #print(filter_tags(replicFinsh(str(child), 'none')))
                opt = opt + 1
            except Exception as e:
                Haserror = Haserror + 1
                # Wait a little and try the page again
                time.sleep(random.randint(1, 6) * 0.1)
                # Give up on this listing after repeated failures
                if Haserror == 3:
                    opt = 1000000

    driver1.close()
    driver1.quit()
    return p_pool
Example 9
    def get_desired_capabilities(self, spider=None):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        # Pick a random User-Agent from the list to disguise the browser
        desired_capabilities["phantomjs.page.settings.userAgent"] = (random.choice(self.user_agent_list))
        # Skipping image loading makes page crawls much faster
        desired_capabilities["phantomjs.page.settings.loadImages"] = False
        # desired_capabilities["phantomjs.page.settings.resourceTimeout"] = 15000
        # Reopening a session with new DesiredCapabilities (proxy settings) is
        # roughly equivalent to clearing the cache and revisiting the url through the proxy
        # headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
        #            'Accept-Encoding': 'gzip, deflate, sdch',
        #            'Accept-Language': 'zh-CN,zh;q=0.8',  # or en-US,en;q=0.8
        #            'Cache-Control': 'max-age=0',
        #            'Connection': 'keep-alive'}

        # for key, value in headers.iteritems():
        #     desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
        if not spider:
            return desired_capabilities
        if not hasattr(spider, "taskJob"):
            return desired_capabilities
        taskJob = spider.taskJob
        if taskJob and taskJob.proxyId:
            if taskJob.proxyType and taskJob.proxyType == ProxyType.DYNAMIC_FOR_URL:
                data = ProxyDao.contentDetail(taskJob.proxyId)
                proxyInfo = data['result']['proxy']
                time.sleep(4)
                proxy_ips = getProxyList(proxyInfo.authUrl)
                proxy = webdriver.Proxy()
                # "Type" here is assumed to be selenium's ProxyType, imported under
                # another name because this module defines its own ProxyType enum
                proxy.proxy_type = Type.MANUAL
                proxy.http_proxy = random.choice(proxy_ips)
                proxy.add_to_capabilities(desired_capabilities)
        return desired_capabilities
Example 10
 def process_request(self, request, spider):
     agent = choice(AGENTS)
     request.headers['User-Agent'] = agent
     if agent:
         # Order number from the data5u proxy service (see the user dashboard)
         order = "d168f83eca5a334b2e30fa051bf424f0"
         # API endpoint that returns proxy IPs
         apiUrl = "http://api.ip.data5u.com/dynamic/get.html?order=" + str(order) + '&sep=3'
         # Fetch the IP list
         res = urllib.urlopen(apiUrl).read().strip("\n")
         # Split the response on newlines
         ips = res.split("\n")
         print('proxy is working ip:' + str(ips[0]))
         # driver = webdriver.PhantomJS(executable_path="D:\Python27\Tools\phantomjs-2.1.1-windows/bin\phantomjs.exe",service_args=['--load-images=false','--disk-cache=true','--proxy={}'.format(ips[0]), '--proxy-type=socks5'])  # choose the browser to use
         # #wait = WebDriverWait(driver, 10)
         # # driver = webdriver.Firefox()
         # driver.get(request.url)
         # time.sleep(70)
         # #wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#j-product-desc > div.ui-box.product-property-main > div.ui-box-title')))
         # js = "var q=document.documentElement.scrollTop=10000"
         # driver.execute_script(js)  # run js to mimic a user; here, scroll to the bottom of the page
         # body = driver.page_source
         # print(body)
         # Configure the Chrome proxy via a command-line switch
         chromeOptions = webdriver.ChromeOptions()
         chromeOptions.add_argument("--proxy-server=http://{}".format(ips[0]))
         browser = webdriver.Chrome(executable_path='D:\Python27\Tools\chromedriver_win32/chromedriver.exe', chrome_options=chromeOptions)
         #wait = WebDriverWait(browser, 10)
         browser.get(request.url)
         body = browser.page_source
         print(body)
         browser.set_window_size(1400, 900)
         return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)
Example 11
 def set_proxy(self):
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     proxy.http_proxy = random.choice(HTTP_IPS)
     # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
     proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
     self.driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
Example 12
def dynamic_change_proxy():
    browser = webdriver.Chrome(
        executable_path=r'C:\Users\chenjinwei\source\chromedriver.exe')
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxys = getproxy()
    print(proxys)
    proxy.http_proxy = proxys

    # The browser is Chrome, so attach the proxy to Chrome capabilities
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    proxy.add_to_capabilities(capabilities)
    browser.start_session(capabilities)
    browser.get('http://httpbin.org/ip')
    print('1: ', browser.session_id)
    print('2: ', browser.page_source)
    print('3: ', browser.get_cookies())
    time.sleep(20)
    proxys = getproxy()
    print(proxys)
    proxy.http_proxy = proxys
    print('second time working')
    # Reopen the session so the new proxy takes effect
    proxy.add_to_capabilities(capabilities)
    browser.start_session(capabilities)
    browser.get('http://httpbin.org/ip')
    print('1: ', browser.session_id)
    print('2: ', browser.page_source)
    print('3: ', browser.get_cookies())
Example 13
def dynamic_load(url):
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # Use a fixed User-Agent to disguise the browser
    desired_capabilities[
        "phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0"
    # Skipping image loading makes page crawls much faster
    desired_capabilities["phantomjs.page.settings.loadImages"] = False
    # Reopening a session with new DesiredCapabilities (proxy settings) is
    # roughly equivalent to clearing the cache and revisiting the url through the proxy
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = random.choice(redis_conn1())
    proxy.add_to_capabilities(desired_capabilities)
    # Open a PhantomJS browser with the prepared configuration
    driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
    # driver = webdriver.PhantomJS()
    # driver.start_session(desired_capabilities)
    # Implicit wait (seconds); tune as needed
    driver.implicitly_wait(5)
    # Page-load timeout, similar to requests.get()'s timeout option;
    # driver.get() can otherwise hang forever without raising.
    driver.set_page_load_timeout(100)
    # Script timeout
    driver.set_script_timeout(100)

    driver.get(url)
    # next_page=driver.find_element_by_id (idd)#.get_attribute('href')
    # driver.get(next_page)
    # html = BeautifulSoup(driver.page_source, 'xml').prettify()
    print driver.page_source
Example 14
def open_browser(proxy=None, download=None):
    #profile = webdriver.FirefoxProfile(r'/home/xling/.mozilla/firefox/nw3oghgt.auto/')
    profile = webdriver.FirefoxProfile(
        r'/home/xling/.mozilla/firefox/j0sto346.auto/')
    profile.native_events_enabled = True
    if download:
        profile.set_preference("browser.download.dir", download)
        #profile.set_preference("browser.download.useDownloadDir", "true")
        #profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain, application/vnd.ms-excel, text/csv, text/comma-separated-values, application/octet-stream")
    if proxy is not None:
        # Build the Proxy from a raw capability dict: manual proxying for both http and ssl
        raw = {
            'proxyType': {
                'ff_value': 1,
                'string': 'manual'
            },
            'httpProxy': proxy,
            'sslProxy': proxy
        }
        proxy = webdriver.Proxy(raw)
        profile.set_proxy(proxy)

    #browser = webdriver.Remote("http://localhost:4444/wd/hub", webdriver.DesiredCapabilities.HTMLUNIT.copy())
    browser = webdriver.Firefox(profile,
                                executable_path="/usr/bin/geckodriver")
    #browser.implicitly_wait(60)
    return browser
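
A hypothetical call, assuming a proxy listening on 127.0.0.1:8080 and a writable download directory:

browser = open_browser(proxy="127.0.0.1:8080", download="/tmp/downloads")
browser.get("http://httpbin.org/ip")  # should report the proxy's address
print(browser.page_source)
browser.quit()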
Example 15
def get_phantomjs_driver(strategy):
    # Note: "cap" aliases the shared PHANTOMJS capability dict (no copy); the
    # PhantomJS() constructor below picks these settings up as its defaults
    cap = webdriver.DesiredCapabilities.PHANTOMJS
    cap['phantomjs.page.settings.resourceTimeout'] = '60000'
    cap['phantomjs.page.settings.loadImages'] = True
    cap['phantomjs.page.settings.userAgent'] = \
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/538.1 "\
        "(KHTML, like Gecko) Safari/538.1"
    if strategy.get('user-agent'):
        cap['phantomjs.page.settings.userAgent'] = strategy.get('user-agent')
    if strategy.get('referer'):
        # logger.debug('add referer: %s' % strategy.get('referer'))
        cap["phantomjs.page.customHeaders.Referer"] = strategy.get('referer')
    if strategy.get('cookie'):
        # logger.debug('use cookie {}'.format(strategy.get('cookie')))
        cap["phantomjs.page.customHeaders.Cookie"] = strategy.get('cookie')
    proxy = webdriver.Proxy()
    if strategy.get('proxy'):
        # logger.info('get proxy from {}'.format(cf.proxy_url))
        proxy_url = requests.get(cf.proxy_url).text
        # logger.debug('use proxy {}'.format(proxy_url))
        if not proxy_url:
            raise Exception('fetch proxy empty, sleep 3s to retry')
        proxy.proxy_type = webdriver.common.proxy.ProxyType.MANUAL
        proxy.http_proxy = proxy_url
    else:
        proxy.proxy_type = webdriver.common.proxy.ProxyType.SYSTEM
    proxy.add_to_capabilities(cap)
    driver = webdriver.PhantomJS(executable_path='./phantomjs')
    driver.implicitly_wait(60)
    driver.set_page_load_timeout(60)
    #driver.set_window_size(4096, 2160)
    return driver
Example 16
def browser_get(url, http_proxy):
    count = len(headers_list)
    index = random.randint(0, count - 1)
    headers = headers_list[index]
    for key, value in headers.items():
        if key != 'User-Agent':
            webdriver.DesiredCapabilities.PHANTOMJS[
                'phantomjs.page.customHeaders.{}'.format(key)] = value

    webdriver.DesiredCapabilities.PHANTOMJS[
        'phantomjs.page.settings.userAgent'] = headers['User-Agent']

    driver = webdriver.PhantomJS(executable_path=browser_path)
    driver.implicitly_wait(10)
    driver.set_page_load_timeout(10)

    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = http_proxy
    # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    log.info('starting open {}'.format(url))
    driver.get(url)
    # log.info('0: {}'.format(url))
    log.info('1: {}'.format(driver.session_id))
    # log.info('2: {}'.format(driver.page_source))
    print('2:', driver.page_source)
Example 17
 def selenium_proxy(self):
     """
     Returns a Selenium WebDriver Proxy class with details of the HTTP Proxy
     """
     return webdriver.Proxy({
         "httpProxy": self.proxy(),
         "sslProxy": self.proxy(),
     })
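
The returned Proxy plugs straight into a driver constructor; a sketch, assuming Selenium 2/3 where Firefox accepts a proxy keyword:

 def make_browser(self):
     # Route a new Firefox instance through the HTTP proxy described above
     return webdriver.Firefox(proxy=self.selenium_proxy())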
Example 18
 def setProxy(self, proxyStr):
     # Reopen a new sessionId using DesiredCapabilities (proxy settings)
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     proxy.http_proxy = proxyStr
     # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
     proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
     self.driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
Example 19
 def use_proxy(self, url):
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     proxy.http_proxy = url
     proxy.ssl_proxy = url
     # Add the proxy settings to the stored capability dict (self.dcap)
     proxy.add_to_capabilities(self.dcap)
     return self
Example 20
def getXiCiIP(PageSize, IPaddress):

    driver = webdriver.PhantomJS(
        executable_path=r'C:\Users\wangquan\phantomjs\bin\phantomjs.exe')
    # Configure the proxy
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    # Proxy ip address
    proxy.http_proxy = IPaddress
    # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    # Page-load timeout for the crawl
    driver.set_page_load_timeout(5)
    # Crawl proxies from xicidaili.com
    p_pool = []
    xici_page = 1
    while xici_page <= PageSize:

        new_count = 0
        xici_url = 'http://www.xicidaili.com//wt/' + str(xici_page)
        try:
            max_wait = 5  # seconds
            driver.set_page_load_timeout(max_wait)
            driver.set_script_timeout(max_wait)
            driver.get(xici_url)
            bobj_2 = BeautifulSoup(driver.page_source, "lxml")
            sibs = bobj_2.findAll('table',
                                  {'id': 'ip_list'})[0].tr.next_siblings
        except Exception as e:
            try:
                print('error 1:', e)
                max_wait = 5  # seconds
                driver.set_page_load_timeout(max_wait)
                driver.set_script_timeout(max_wait)
                driver.get(xici_url)
                # NOTE: WebDriverWait(...) alone does not wait; it needs an
                # .until(...) condition to actually block
                WebDriverWait(driver, 6)
                bobj_2 = BeautifulSoup(driver.page_source, "lxml")
                sibs = bobj_2.findAll('table',
                                      {'id': 'ip_list'})[0].tr.next_siblings
            except Exception as e:
                print('error 2', e)
                break
        for sib in sibs:
            try:
                # Join ip and port into ip:port
                get_proxy = sib.findAll('td')[1].get_text(
                ) + ':' + sib.findAll('td')[2].get_text()
                p_pool.append(get_proxy)
                new_count += 1
            except Exception as e:
                print('error 2', e)
                break
        xici_page += 1
        # Move on to the next listing page
    return p_pool
Example 21
    def __init__(self, proxy=None):
        """init the webdriver by setting the proxy and user-agent

        Args:
            proxy (str): proxy in the form of ip:port
        """
        self.amazon_index = r'https://www.amazon.com/'
        self.libPath = os.path.abspath("./lib/geckodriver64.exe")
        if proxy is None:
            self.driver = webdriver.Firefox(executable_path=self.libPath)
            # self.driver = webdriver.Chrome(executable_path=self.libPath)
            # self.driver = webdriver.PhantomJS(executable_path=self.libPath)
            self.proxy = ""
        else:
            self.proxy = proxy
            ip, port = proxy.split(':')
            profile = webdriver.FirefoxProfile()
            profile.set_preference("network.proxy.type", 1)
            profile.set_preference("network.proxy.http", ip)
            profile.set_preference("network.proxy.http_port", int(port))
            profile.set_preference("network.proxy.ssl", ip)
            profile.set_preference("network.proxy.ssl_port", int(port))
            profile.set_preference("browser.tabs.remote.autostart.2", False)

            profile.set_preference('permissions.default.image', 2)
            profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')

            # for auth
            # profile.add_extension(self.auth_proxy_path)
            # credentials = 'User-002:fcg1994'
            # credentials = b64encode(credentials.encode('ascii')).decode('utf-8')
            # profile.set_preference('extensions.closeproxyauth.authtoken', credentials)
            # profile.set_preference('permissions.default.image', 2)
            # profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
            # # save identify code
            # profile.set_preference('browser.download.folderList', 2)
            # profile.set_preference('browser.download.manager.showWhenStarting', False)
            # profile.set_preference('browser.download.dir','./verifyCode/images')
            # profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'image/jpeg')

            # set user_agent
            # profile.set_preference("general.useragent.override", generate_user_agent())
            profile.update_preferences()
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--proxy-server=%s' % proxy)

            proxy_driver = webdriver.Proxy()
            proxy_driver.proxy_type = ProxyType.MANUAL
            proxy_driver.http_proxy = proxy
            # self.driver = webdriver.PhantomJS(self.libPath)
            # proxy_driver.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            # self.driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
            self.driver = webdriver.Firefox(executable_path=self.libPath, firefox_profile=profile)
            # self.driver = webdriver.Chrome(executable_path=self.libPath,chrome_options=chrome_options)

            log_info('current proxy: %s' % proxy)
Example 22
def build_browser(agent, http_proxy):
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    desired_capabilities["phantomjs.page.settings.userAgent"] = agent
    desired_capabilities["phantomjs.page.settings.loadImages"] = False

    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL

    proxy.http_proxy = http_proxy
    proxy.add_to_capabilities(desired_capabilities)
    # Create the browser from the prepared capabilities
    driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
    return driver, desired_capabilities
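
A hypothetical call site, unpacking the returned pair (the user-agent string and proxy address are placeholders):

agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/538.1 (KHTML, like Gecko) Safari/538.1"
driver, caps = build_browser(agent, "127.0.0.1:8080")
driver.get("http://httpbin.org/ip")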
Example 23
    def parse(self, response):
        service_args = []
        service_args.append('--load-images=no')  ## disable image loading
        service_args.append('--disk-cache=yes')  ## enable the disk cache
        service_args.append('--ignore-ssl-errors=true')  ## ignore https errors
        browser = webdriver.PhantomJS(service_args=service_args)

        count = 0
        while True:
            print(count)
            proxyIp = requests.get(
                "http://tvp.daxiangdaili.com/ip/?tid=557895172920514&num=1&filter=on"
            ).content
            # thisip = str(IPPools.get_proxy(), encoding="utf-8")
            thisip = str(proxyIp, encoding="utf-8")

            try:
                #telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
                # Probe the proxy with a quick request before using it
                requests.get(
                    'http://vote1.qblife.com.cn/vote24/survey/7?from=timeline&isappinstalled=0',
                    proxies={"http": "http://" + thisip},
                    timeout=1)
            except:
                print('connect failed')
            else:
                print('success')
                try:
                    proxy = webdriver.Proxy()
                    proxy.proxy_type = ProxyType.MANUAL
                    proxy.http_proxy = thisip
                    proxy.add_to_capabilities(
                        webdriver.DesiredCapabilities.PHANTOMJS)
                    browser.start_session(
                        webdriver.DesiredCapabilities.PHANTOMJS)
                    browser.get(
                        "http://vote1.qblife.com.cn/vote24/survey/7?from=timeline&isappinstalled=0"
                    )
                    browser.set_page_load_timeout(3)

                    #print(browser.page_source)
                    elem = browser.find_element_by_id("loadmore")
                    elem.click()
                    elem = browser.find_element_by_id("vote-btn-178")
                    elem.click()

                except:
                    print('element not found')
                else:
                    num = browser.find_element_by_id("vote-num-178").text

                    print('vote succeeded: ' + num)

                    count = count + 1
Example 24
 def _create_driver(self):
     driver = new_driver(user_agent=USER_AGENT,
                         js_re_ignore='/PicCheckCode1/g')
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.DIRECT
     proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
     driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
     # driver.get(url) can hang forever without raising; the timeouts below avoid the stall
     driver.set_page_load_timeout(13)
     # Script timeout
     driver.set_script_timeout(13)
     return driver
Example 25
 def check(self):
     driver = webdriver.PhantomJS()
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     # Pick a random proxy from the pool
     ips = self.allip()
     endport = random.choice(ips)
     proxy.http_proxy = endport.ip + ':' + endport.port
     # Apply the proxy before visiting the page
     proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
     driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
     driver.get("http://www.baidu.com")
     soup = BeautifulSoup(driver.page_source, 'lxml')
     title = soup.find_all('title')
     print(title)
Example 26
 def driver(self):
     """create a browser"""
     if self.headless == True:
         options = webdriver.FirefoxOptions()
         options.set_headless()
         # options=None
         options.add_argument('headless')
         options.add_argument('--disable-gpu')
         if self.proxies:
             proxy = Proxy({
                 'proxyType': ProxyType.MANUAL,
                 'httpProxy': self.proxy()  # proxy ip and port
             })
             browser_driver = webdriver.Firefox(firefox_options=options,
                                                proxy=proxy)
         else:
             browser_driver = webdriver.Firefox(firefox_options=options)
     elif self.headless == "PhantomJS":
         desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
         desired_capabilities["phantomjs.page.settings.userAgent"] = ua()
         desired_capabilities["phantomjs.page.settings.loadImages"] = False
         if self.proxies:
             proxy = webdriver.Proxy()
             proxy.proxy_type = ProxyType.MANUAL
             proxy.http_proxy = self.proxy()
             proxy.add_to_capabilities(desired_capabilities)
             browser_driver = webdriver.PhantomJS(
                 executable_path=self.phantomjs_driver_path,
                 desired_capabilities=desired_capabilities,
                 service_args=[
                     '--ignore-ssl-errors=true', "--cookies-file=cookie.txt"
                 ])
         else:
             browser_driver = webdriver.PhantomJS(
                 executable_path=self.phantomjs_driver_path,
                 desired_capabilities=desired_capabilities,
                 service_args=[
                     '--ignore-ssl-errors=true', "--cookies-file=cookie.txt"
                 ])
     else:
         if self.proxies:
             proxy = Proxy({
                 'proxyType': ProxyType.MANUAL,
                 'httpProxy': self.proxy()  # proxy ip and port
             })
             browser_driver = webdriver.Firefox(proxy=proxy)
         else:
             browser_driver = webdriver.Firefox()
     browser_driver.set_page_load_timeout(self.timeout)
     browser_driver.set_script_timeout(self.timeout)
     return browser_driver
Example 27
    def login_with_cookies(self,
                           login_url,
                           cookies_data,
                           domain,
                           browser='foxfire'):
        if browser == 'foxfire':  # use Firefox
            profile = webdriver.FirefoxProfile()  # Firefox profile class
            if self.proxy is not None:  # if a proxy is configured, split it into ip and port
                ip = self.proxy.split(':')[0]
                port = self.proxy.split(':')[1]
                profile.set_preference("network.proxy.type", 1)
                profile.set_preference("network.proxy.http",
                                       ip)  # http proxying by default; adjust if needed
                # the port preference must be an int, not a string
                profile.set_preference("network.proxy.http_port", int(port))
            driver = webdriver.Firefox(executable_path=self.firefox_path,
                                       firefox_profile=profile)

        elif browser == 'chrome':  # use Chrome
            options = webdriver.ChromeOptions()  # Chrome options class
            if self.proxy is not None:  # use the proxy if configured
                options.add_argument('--proxy-server=http://' + self.proxy)
            driver = webdriver.Chrome(executable_path=self.chrome_path,
                                      chrome_options=options)

        elif browser == 'phantomjs':  # use PhantomJS
            desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
            if self.proxy is not None:
                proxy = webdriver.Proxy()
                proxy.proxy_type = ProxyType.MANUAL
                proxy.http_proxy = self.proxy
                proxy.add_to_capabilities(desired_capabilities)
            desired_capabilities[
                "phantomjs.page.settings.loadImages"] = False  # skip image loading for speed
            driver = webdriver.PhantomJS(
                executable_path=self.phantomjs_path,
                desired_capabilities=desired_capabilities)
        else:
            print u'unknown browser type'
            return None
        driver.get(login_url)
        # Add the cookies
        driver.delete_all_cookies()
        for cookie in cookies_data.items():
            driver.add_cookie({
                'domain': domain,
                'name': cookie[0],
                'value': cookie[1],
                'path': '/',
                'expires': None
            })
        return driver
Example 28
def get_data(shop_id):
    url = 'http://www.dianping.com/shop/6232395'
    # load PhantomJS
    driver = webdriver.PhantomJS()
    proxy_list = redis_conn1()
    if proxy_list:
        print proxy_list
        desired_capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
        desired_capabilities[
            "phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0"
        # desired_capabilities[
        #     "phantomjs.page.settings.cookies"] = ''' _hc.v="\"0f6e7827-bc24-4e37-a02d-22712123f3b9.1487061681\""; cy=2; cye=beijing; __utma=1.2016944627.1503913333.1508380646.1508410598.4; __utmz=1.1508380646.3.3.utmcsr=dianping.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __mta=218390182.1508410622795.1508410622795.1508411087614.2; s_ViewType=10; aburl=1; _lxsdk_cuid=15f34478accc8-0036528b7bf6dd-49576f-13c680-15f34478accc8; _lxsdk=15f34478accc8-0036528b7bf6dd-49576f-13c680-15f34478accc8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; JSESSIONID=B6B683D3F115418041ED0AFCE0971147; _lxsdk_s=15f77681817-db4-bf6-14e%7C%7C13'''

        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        proxy.http_proxy = random.choice(proxy_list)
        # Add the proxy settings to the PhantomJS desired capabilities
        proxy.add_to_capabilities(desired_capabilities)
        driver.start_session(desired_capabilities)
        driver.add_cookie({
            'path': '/',
            'name': 'JSESSIONID',
            'value': 'B6B683D3F115418041ED0AFCE0971147;',
            'domain': '.dianping.com'
        })
        # for key,value in cookies.items():
        #     driver.add_cookie({
        #         'name': key,
        #         'value': value,
        #         'path': '/',
        #         'domain': '.dianping.com'
        #     })
        # print driver.get_cookies()
    print driver.desired_capabilities
    driver.set_page_load_timeout(5)
    driver.get(url)
    print driver.get_cookies()
    # Scroll down until the page height stops growing
    js1 = 'return document.body.scrollHeight'
    js2 = 'window.scrollTo(0, document.body.scrollHeight)'
    old_scroll_height = 0
    while (driver.execute_script(js1) > old_scroll_height):
        old_scroll_height = driver.execute_script(js1)
        driver.execute_script(js2)
        time.sleep(3)
    # Collect the elements by xpath
    print driver.page_source
    list1 = driver.find_elements_by_xpath(
        '//div[@class="comment-condition J-comment-condition Fix"]/div/span/a')
    for l in list1:
        print l.text
Example 29
def main():
    # browser = webdriver.PhantomJS()   # Be OK in command line, but not in PyCharm.
    # browser = webdriver.PhantomJS(r"/home/lxw/Downloads/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs")
    browser = webdriver.Chrome(
        r"/home/lxw/Software/chromedirver_selenium/chromedriver")  # OK
    browser.get("http://ipecho.net/plain")
    print('session_id: ', browser.session_id)
    print('page_source: ', browser.page_source)
    print('cookie: ', browser.get_cookies())
    print("----" * 10, "\n")

    # Reopen a sessionId with new DesiredCapabilities (proxy settings); roughly like clearing the browser cache and revisiting the url through the proxy
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    # req = requests.get("http://datazhiyuan.com:60001/plain", timeout=10)
    req = requests.get("http://localhost:60001/plain", timeout=10)
    print("Get an IP proxy:", req.text)
    if req.text:
        proxy.http_proxy = req.text  # '1.9.171.51:800'
    # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    browser.get("http://ipecho.net/plain")
    print('session_id: ', browser.session_id)
    print('page_source: ', browser.page_source)
    print('cookie: ', browser.get_cookies())
    print("----" * 10, "\n")

    # Restore a direct (non-proxied) connection
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.DIRECT
    proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
    browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
    browser.get("http://ipecho.net/plain")
    print('session_id: ', browser.session_id)
    print('page_source: ', browser.page_source)
    print('cookie: ', browser.get_cookies())
    print("----" * 10, "\n")
Example 30
 def _create_driver(self):
     driver = new_driver(user_agent=USER_AGENT,
                         js_re_ignore='/cdwsjb\/CaptchaImg.png/g')
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.DIRECT
     proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
     driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
     # driver.get(url) can hang forever without raising; the timeouts below avoid the stall
     driver.set_page_load_timeout(13)
     # Script timeout
     driver.set_script_timeout(13)
     # Visit any page on the same host first, so cookies can be set afterwards
     driver.get('https://gr.cdhrss.gov.cn:442/xxxx')
     return driver