def getProxy(renew=False):
    """Return a proxy mapping built from the currently selected proxy entry.

    State lives in the module globals ``pIPs`` (proxy pool), ``pipObj``
    (currently selected entry) and ``globalProxyCount`` (call counter).

    Every call increments the counter.  A new entry is drawn at random from
    the pool, removed from it, and the counter reset to 0 when the counter
    reaches a multiple of 100 or when ``renew`` is truthy.  The pool is
    refilled through ``getAvailableIPs()`` whenever it holds fewer than
    ``minPIPCount`` entries.

    :param renew: force selection of a new proxy entry on this call.
    :return: ``{'http': ..., 'https': ...}`` where both values are the same
        ``http://<ip>:<port>`` URL, or ``None`` when any exception was
        raised (the exception is printed, not propagated).
    """
    global pIPs, globalProxyCount, pipObj
    try:
        # Too few proxy IPs left: fetch a fresh list.
        if len(pIPs) < minPIPCount:
            pIPs = getAvailableIPs()

        globalProxyCount = globalProxyCount + 1
        must_rotate = renew or globalProxyCount % 100 == 0
        if must_rotate:
            pipObj = random.choice(pIPs)
            # Single-argument print(): same text as the comma-separated
            # print statement (items joined by one space).
            print(' '.join(['globalProxyCount:', str(globalProxyCount),
                            ' change proxyIp to ', str(pipObj)]))
            pIPs.remove(pipObj)
            globalProxyCount = 0

        host = pipObj[0]
        port = pipObj[1]
        endpoint = 'http://%s:%s' % (host, port)
        return {'http': endpoint, 'https': endpoint}
    except Exception as e:
        print(' '.join(['get proxy exception: ', str(e)]))
        return None
def getNewBrowserArgs():
    """Build PhantomJS start-up arguments using a random proxy and user agent.

    A fresh proxy list is fetched with ``getAvailableIPs()`` and one entry
    (``entry[0]`` = ip, ``entry[1]`` = port) is picked at random; a user
    agent is picked at random from ``USER_AGENTS``.

    :return: ``(service_args, caps)`` where ``service_args`` is the list of
        PhantomJS command-line proxy switches and ``caps`` is a capabilities
        dict carrying the chosen user agent.
    :raises IndexError: if the proxy list or ``USER_AGENTS`` is empty
        (raised by ``random.choice``).
    """
    pIPs = getAvailableIPs()
    pipObj = random.choice(pIPs)
    pIp = pipObj[0]
    pPort = pipObj[1]

    # DesiredCapabilities.PHANTOMJS is a class-level dict shared by every
    # user of the library; work on a copy so the user agent set here does
    # not leak into unrelated sessions.
    caps = webdriver.DesiredCapabilities.PHANTOMJS.copy()
    caps["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)

    service_args = [
        '--proxy=' + pIp + ':' + str(pPort),
        '--proxy-type=http',
    ]
    return service_args, caps
def reflashProxy(caps, driver, pIPs):
    """Point ``driver`` at a randomly chosen proxy and start a new session.

    :param caps: capabilities dict; it is updated in place with a random
        user agent and with the proxy settings.
    :param driver: webdriver instance whose ``start_session`` is called with
        the updated capabilities.
    :param pIPs: list of proxy entries (``entry[0]`` = ip, ``entry[1]`` =
        port).  When it holds fewer than ``minPIPCount`` entries it is
        replaced by the result of ``getAvailableIPs()``.
    :return: ``(pIPs, pIp, randomPIpIndex)`` - the (possibly refreshed)
        list, the chosen ip and the index of the chosen entry in that list.
    :raises ValueError: if the proxy list is empty.
    """
    # Too few proxy IPs left: fetch a fresh list.
    if len(pIPs) < minPIPCount:
        pIPs = getAvailableIPs()

    # randint(0, len(pIPs)) includes len(pIPs) itself and could index one
    # past the end of the list; randrange excludes the upper bound.
    randomPIpIndex = random.randrange(len(pIPs))
    pipObj = pIPs[randomPIpIndex]
    pIp = pipObj[0]
    pPort = pipObj[1]

    ua = random.choice(USER_AGENTS)
    caps["phantomjs.page.settings.userAgent"] = ua

    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = pIp + ':' + str(pPort)
    # Write the proxy settings into the capabilities dict passed in.
    proxy.add_to_capabilities(caps)
    driver.start_session(caps)
    return pIPs, pIp, randomPIpIndex
# NOTE(review): truncated top-level script fragment.  In this copy of the
# file the whole span is one comment line, and the final `try:` below has no
# except/finally clause in view, so everything is kept commented out rather
# than guessed at.  Confirm against the complete file before re-enabling.
#
# t = int(sys.argv[2])
# qichachaFromIndustry(f,t)
# start from the investment interface
# fromInvestInt()
# search page
# while 1:
#     for length in range(10,11):
#         try:
#             startFromSearch(length)
#         except Exception as e:
#             print 'job fail, e:',traceback.format_exc()
# page-recommendation entry point
# pIPs = getAvailableIPs()
# while 1:
#     try:
#         count = 100
#         if len(pIPs) < minPIPCount:  # too few proxy IPs left, fetch a fresh list
#             pIPs = getAvailableIPs()
#         pipObj = random.choice(pIPs)
#         # randomPIpIndex = random.randint(0, len(pIPs) - 1)
#         # pipObj = pIPs[randomPIpIndex]
#         pIp = pipObj[0]
#         pPort = pipObj[1]
#         # del pIPs[randomPIpIndex]
#         pIPs.remove(pipObj)
def _emitLine(*parts):
    """Print *parts* on one line separated by single spaces.

    Produces the same text as a comma-separated ``print`` statement while
    remaining valid syntax for the ``print`` function as well.
    """
    print(' '.join(['%s' % (part,) for part in parts]))


def _shutdownDriver(driver, closeWindow=False):
    """Best-effort teardown of a webdriver; a ``None`` driver is ignored."""
    if driver is None:
        return
    if closeWindow:
        try:
            driver.close()
        except Exception as er:
            print(er)
    try:
        driver.quit()
    except Exception as er:
        print(er)


def tradMarkTestById(f, t):
    """Run the search form once per value in ``range(f, t + 1)`` via PhantomJS.

    For each value the search page at ``baseUrl`` is loaded (or refreshed),
    the value is typed into the ``.inputbox input`` field and the form is
    submitted.  The results window is then inspected, the links of the first
    data row are opened one by one, and throughput statistics are printed
    every 10 detail pages.

    A new proxy / user agent / PhantomJS process is used whenever the
    previous attempt failed or the current proxy has served more than
    ``maxCountPerProxy`` searches.  A proxy whose attempt raised is removed
    from the local pool.

    NOTE(review): the original indentation was lost; this implementation
    assumes the trailing ``break`` sat in the row loop, i.e. only the first
    data row of each result table is walked.  Confirm against the complete
    file.

    :param f: first value to search (inclusive).
    :param t: last value to search (inclusive).
    :return: ``None``.
    :raises IndexError: if the proxy pool is empty when a proxy is drawn
        (raised by ``random.choice``; not caught here).
    """
    pIPs = getAvailableIPs()
    print('start with ' + str(len(pIPs)) + ' proxy ips')
    baseUrl = 'http://202.108.90.73/txnS03.do'
    count = 0
    startTime = time.time()
    lastCountTime = time.time()
    pipOk = False
    maxCountPerProxy = 5
    nowCount = 0
    driver = None
    noNeedReStart = False

    try:
        for category in range(f, t + 1):
            if not pipOk or nowCount > maxCountPerProxy:
                # Too few proxy IPs left: fetch a fresh list.
                if len(pIPs) < minPIPCount:
                    pIPs = getAvailableIPs()
                pipObj = random.choice(pIPs)
                pIp = pipObj[0]
                pPort = pipObj[1]

                # Drop the browser bound to the previous proxy.
                _shutdownDriver(driver)
                driver = None

                # Copy: DesiredCapabilities.PHANTOMJS is a shared
                # class-level dict and must not be mutated in place.
                caps = webdriver.DesiredCapabilities.PHANTOMJS.copy()
                caps["phantomjs.page.settings.userAgent"] = random.choice(
                    USER_AGENTS)
                service_args = [
                    '--proxy=' + pIp + ':' + str(pPort),
                    '--proxy-type=http',
                ]
                noNeedReStart = False
                nowCount = 0
            else:
                nowCount = nowCount + 1

            try:
                if not noNeedReStart:
                    driver = webdriver.PhantomJS(
                        executable_path=phantomPath,
                        service_args=service_args,
                        desired_capabilities=caps)
                    driver.set_page_load_timeout(30)
                    driver.get(baseUrl)
                else:
                    driver.refresh()
                pipOk = True

                catInputTag = driver.find_element_by_css_selector(
                    '.inputbox input')
                catInputTag.send_keys(category)
                submitTag = driver.find_element_by_css_selector(
                    '#_searchButton')
                submitTag.click()
                time.sleep(0.1)

                # The code expects the submit to open the results in a
                # second window.
                windowsHandler = driver.window_handles
                if len(windowsHandler) < 2:
                    print('not open result page, skip')
                    noNeedReStart = True
                    continue
                driver.switch_to.window(windowsHandler[1])

                list_box = driver.find_element_by_css_selector('.list_box')
                if not list_box:
                    _emitLine('no result list, content: ', driver.page_source)
                    noNeedReStart = True
                    continue

                resList = list_box.find_elements_by_css_selector('tr')
                resultLength = len(resList)
                # Index 0 is skipped when walking rows, so it is not counted.
                _emitLine('result count:', resultLength - 1)

                if resultLength > 1:
                    resTr = resList[1]
                    for link in resTr.find_elements_by_css_selector('a'):
                        link.click()
                        windowsHandler = driver.window_handles
                        # The detail page is addressed as the third window,
                        # so at least three handles are required before
                        # indexing [2].
                        if len(windowsHandler) < 3:
                            print('not open result page, skip')
                            noNeedReStart = True
                            continue
                        driver.switch_to.window(windowsHandler[2])
                        _emitLine('walking result urls , now :',
                                  driver.current_url)

                        count = count + 1
                        if count % 10 == 0:
                            spentTime = time.time() - startTime
                            _emitLine(
                                'count: ', count, ' took ', spentTime,
                                ' seconds, avg_per_sec: ', count / spentTime,
                                ' last 10 spent ',
                                (time.time() - lastCountTime), ' secs')
                            lastCountTime = time.time()

                        # Close the detail window, then go back to results.
                        try:
                            driver.close()
                        except Exception as er:
                            print(er)
                        print('switch to results page')
                        driver.switch_to.window(windowsHandler[1])

                # Always close the results window (focus is on it here) so
                # windows do not pile up across searches.
                try:
                    driver.close()
                except Exception as er:
                    print(er)

                driver.switch_to.window(windowsHandler[0])
                noNeedReStart = True
                _emitLine('finish categoty:', category,
                          ' reflash search page')
            except Exception as e:
                _emitLine('id: ', category, ' error:', e)
                _emitLine('proxy ip:', pIp, ' port:', pPort)
                _shutdownDriver(driver, closeWindow=True)
                driver = None
                noNeedReStart = False
                # The proxy is treated as bad once an attempt through it
                # raised; guard against it already being gone.
                if pipObj in pIPs:
                    pIPs.remove(pipObj)
                pipOk = False
    finally:
        # Do not leave the last PhantomJS process running.
        _shutdownDriver(driver)