def get_page(self, url, pre_url, mutex, arg_list, min_len=40): while True: try: # print u'开始加载网页...' if len(pre_url) > 0: for each_url in pre_url: self.driver.get(each_url) time.sleep(0.5) #print 'current_url',url self.driver.get(url) time.sleep(randint(0, 15) * 0.1) #print 'current_url',self.driver.current_url current_url = self.driver.current_url status = True for each_kw in arg_list: if each_kw in current_url: status = False break if len(self.driver.page_source) > min_len and status: # print u'完成网页加载...' self.goto_init() #mutex.release() return self.driver.page_source else: print 'length:', len( self.driver.page_source ), 'current_url:', self.driver.current_url[: 20], 'origon_url:', url[: 20] #self.driver.get('https://www.baidu.com/') #if len(self.driver.page_source) < 50: #print u'跳转到登陆页...' mutex.acquire() #print 'origion url',url #print 'current_url',self.driver.current_url print u'重新拨号...' aa = Adsl() if aa.reconnect(url): #time.sleep(2) self.temp += 1 self.goto_home() #time.sleep(1) print 'temp:', self.temp # time.sleep(self.temp*60) mutex.release() if self.temp == self.temp_num: print u'尝试%s次均失败,作保存处理...' % (self.temp_num - 1) self.goto_init() #mutex.release() time.sleep(randint(9, 11) * 60) return False break #else: # print u'already connected...' except Exception, e: print 'get_page err: ', e #mutex.release() return False break
def process_post_url(url, data, headers, line, position): try: res = requests.post(url, data=data, headers=headers) obj = res.json() if obj.has_key(u'message'): if not obj['success']: with open('result.txt', 'a') as f: f.write(line) write_file_positon(position) elif obj.has_key(u'error_description') and obj['error_description'] == u'您注册过于频繁,请稍后再试': print u'您注册过于频繁,请稍后再试,需换ip...' # 重连adsl ad = Adsl() ad.reconnect() while True: # 判断网络是否通 if sys.platform == 'win32': ret = os.system('ping -n 2 www.baidu.com') else: ret = os.system('ping -c 2 www.baidu.com') if not ret: break else: time.sleep(10) else: write_file_positon(position) sys.exit(0) except IOError: print 'network anomaly' time.sleep(5) except Exception: print 'url error'
def __init__(self, base_url='', temp_num=10): try: time.sleep(randint(0, 5) * 0.1) U_A = [ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36', 'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36' ] u_a = U_A[randint(0, len(U_A) - 1)] cap = webdriver.DesiredCapabilities.PHANTOMJS cap["phantomjs.page.settings.resourceTimeout"] = 1000 cap["phantomjs.page.settings.loadImages"] = True cap["phantomjs.page.settings.disk-cache"] = True cap["phantomjs.page.settings.userAgent"] = u_a cap["phantomjs.page.customHeaders.User-Agent"] = u_a if randint(0, 9) < 10: self.driver = webdriver.PhantomJS( executable_path='C:/phantomjs.exe', service_log_path='C:/ghostdriver.log', desired_capabilities=cap) #,desired_capabilities=cap else: 
# use chrome chromedriver = "C:\Users\Administrator\AppData\Local\Google\Chrome\Application\Chromedriver.exe" os.environ["webdriver.chrome.driver"] = chromedriver self.driver = webdriver.Chrome(chromedriver) # self.driver = webdriver.PhantomJS(executable_path='C:/phantomjs.exe', # service_log_path='C:/ghostdriver.log') # use ie # IEdriver='C:\Program Files (x86)\Internet Explorer\IEDriverServer.exe' # os.environ["webdriver.ie.driver"] = IEdriver # self.driver = webdriver.Ie(IEdriver) #self.driver = webdriver.Firefox() self.driver.set_page_load_timeout(2000) self.base_url = base_url if self.base_url != '': self.driver.get(self.base_url) time.sleep(0.5) if len(self.driver.page_source) < 50: aa = Adsl() aa.connect() self.temp = 0 self.temp_num = temp_num if self.temp_num == None: self.temp_num = 10 except Exception, e: print 'downloader init failed...', e
def process_post_url(url, data, headers, line, position): try: res = requests.post(url, data=data, headers=headers) obj = res.json() if obj.has_key(u'message'): if not obj['success']: with open('result.txt', 'a') as f: f.write(line) write_file_positon(position) elif obj.has_key(u'error_description' ) and obj['error_description'] == u'您注册过于频繁,请稍后再试': print u'您注册过于频繁,请稍后再试,需换ip...' # 重连adsl ad = Adsl() ad.reconnect() while True: # 判断网络是否通 if sys.platform == 'win32': ret = os.system('ping -n 2 www.baidu.com') else: ret = os.system('ping -c 2 www.baidu.com') if not ret: break else: time.sleep(10) else: write_file_positon(position) sys.exit(0) except IOError: print 'network anomaly' time.sleep(5) except Exception: print 'url error'
def work():
    """Run register → login → activate cycles until no emails remain."""
    a = Adsl()
    counts = 0
    # BUGFIX: defined up front so the alert handler below cannot raise
    # NameError when an exception fires before the driver is created.
    driver = None
    while True:
        try:
            em = unregister_email()
            if em == {}:
                break
            l = signin(em['email'], em['password'])
            driver = l.return_driver()
            l.start_login()
            driver = activation(l.return_driver(), em['email'], l.return_name())
            driver.quit()
            counts += 1
        except UnexpectedAlertPresentException as e:
            # An unexpected alert usually means the session is burned:
            # drop the browser and rotate the IP.
            if driver is not None:
                driver.quit()
            a.reconnect()
        except Exception as e:
            print(e)
            break
    print('123')
def main(): # 如果不存在存放电话号码的数据表,则创建一个 init_db_if_need() phones_already = get_all_numbers() filename = raw_input('Please input filename: ') from sqlite_db import conn c = conn.cursor() count = 0 try: with open(filename, 'r') as f: while True: line = f.readline() if not line: break phone = line.strip() if phone in phones_already: print 'pass' continue payload = {'phone': phone} try: res = requests.post(url, headers=headers, data=payload) count += 1 err_code = res.json()['errorcode'] print res.json() if err_code != 0: with open('result.txt', 'a') as rf: rf.write(line) if count % 3 == 0: # 重连adsl ad = Adsl() ad.reconnect() while True: # 判断网络是否通 if sys.platform == 'win32': ret = os.system('ping -n 2 www.baidu.com') else: ret = os.system('ping -c 2 www.baidu.com') if not ret: break else: time.sleep(10) except Exception, e: print str(e) c.execute("INSERT INTO numbers VALUES (" + phone + ")") conn.commit() conn.close() print 'Done!'
def swap_o_d(i_dstCity, i_orgcity, i_startDate): # 出发城市代码 # orgcity = "PEK" # 到达城市代码 # dstCity = "CGO" # 将城市名转换成城市代码 (orgcity, dstCity) = city_to_code(i_orgcity, i_dstCity) if(orgcity == -1 or dstCity == -1): return -1 # 出发日期,格式为:2016-05-15 startDate = i_startDate # 保存数据文件名 currdatatime = datetime.datetime.now().strftime('%Y-%m-%d') filename = "HH-"+currdatatime+"-"+ startDate + ".csv" print "正在抓取和解析数据...".encode("GBK") #头等舱/公务舱,经济舱 scs = ['F','Y'] for sc in scs: for n in range(9): loaddata, status_code = data_Crawling(orgcity, dstCity, startDate,sc) #超时异常处理,若是超时将等待6分钟 if( loaddata == -1 or status_code == -1 ): print "请求超时,下次请求将在5s后进行,请耐心等待...".encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue elif( loaddata == -2 or status_code == -2 ): print "连接中断,下次请求将在5s后进行,请耐心等待...".encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue soup = BeautifulSoup(loaddata.decode('utf-8'), "html.parser") if (len(soup.find_all("tr",id="trinfo_1",class_="trinfoDetails")) < 1): if (len(soup.find_all("div",class_="flight-main")) >= 1): return 0 else: print("抓取失败,3s后尝试第%d次抓取数据.....".encode("GBK") % n) #print loaddata,status_code time.sleep(3) else: break if (n >= 8): print "抓取失败次数在太多了,无能为力...".encode("GBK") #fairleCount = fairleCount+1 return -1 # fp = open("data2.html",'w') # fp.write(loaddata) # fp.close() # fpr = open("data2.html") # loaddata = fpr.read() # fpr.close() data_analyze(loaddata, filename,i_startDate,sc)
def swap_o_d(i_dstCity, i_orgcity, i_startDate): # 出发城市代码 # orgcity = "PEK" # 到达城市代码 # dstCity = "CGO" # 将城市名转换成城市代码 (orgcity, dstCity) = city_to_code(i_orgcity, i_dstCity) if (orgcity == -1 or dstCity == -1): return -1 # 出发日期,格式为:2016-05-15 startDate = i_startDate # 保存数据文件名 currdatatime = datetime.datetime.now().strftime('%Y-%m-%d') filename = "GH-" + currdatatime + "-" + startDate + ".csv" print "正在抓取和解析数据...".encode("GBK") for n in range(9): loaddata, status_code = data_Crawling(orgcity, dstCity, startDate) #超时异常处理,若是超时将等待6分钟 if (loaddata == -1 or status_code == -1): print "请求超时,下次请求将在5s后进行,请耐心等待...".encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue elif (loaddata == -2 or status_code == -2): print "连接中断,下次请求将在5s后进行,请耐心等待...".encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue soup = BeautifulSoup(loaddata.decode('utf-8'), "html.parser") loaddata = loaddata.replace('\u003c', '<').replace('\u003e', '>') datas = re.findall(r'\"-?[\d]{3,10}\": \"(.*?)</tbody>', loaddata) if (len(datas) < 1): if (soup.title.string == "错误"): return 0 else: print("抓取失败,3s后尝试第%d次抓取数据.....".encode("GBK") % n) #print loaddata,status_code time.sleep(3) else: break if (n >= 8): print "抓取失败次数在太多了,无能为力...".encode("GBK") #fairleCount = fairleCount+1 return -1 # fp = open("data2.html",'w') # fp.write(loaddata) # fp.close() # fpr = open("data2.html") # loaddata = fpr.read() # fpr.close() data_analyze(loaddata, filename, i_startDate)
def choose_mode(self, url): url_list = [] url_list.append(url) url_list.append( re.sub(re.compile('&q=.*?&'), '&', url) + '&sku_properties=') url_list.append(re.sub(re.compile('&q=.*?&'), '&', url)) url_list.append( re.sub(re.compile('&areaId=.*'), '', re.sub(re.compile('&q=.*?&'), '&', url))) for index, each_url in enumerate(url_list): print 'each_url', each_url temp = 0 while temp < 6: aa = Adsl() aa.reconnect() self.driver.get(each_url) res = self.driver.page_source res = lxml.html.document_fromstring(res) if 'login' not in self.driver.current_url: if len( res.xpath('//span[@class="tm-price"]/text()') ) != 0 or len( res.xpath( '//li[@class="tm-ind-item tm-ind-sellCount "]/div[@class="tm-indcon"]/span[@class="tm-count"]/text()' )) != 0: print 'mode_0', index + 1 return index + 1 else: print '%d len == 0' % (index + 1) temp += 2 else: print 'login in url' time.sleep(temp) temp += 1 print 'mode_1', 4 return 4
def get_page(self, url, mutex): while True: try: # print u'开始加载网页...' response = requests.get(url, headers=self.headers, allow_redirects=False) time.sleep(1) if response.status_code == 200 and 'login' not in response.headers[ 'location']: # print u'完成网页加载...' self.goto_init() return response.text break else: mutex.acquire() print u'跳转到登陆页...' print u'重新拨号...' aa = Adsl() aa.reconnect(url) time.sleep(2) self.temp += 1 self.goto_home() time.sleep(1) mutex.release() print 'temp:', self.temp if self.temp == 51: print u'尝试50次均失败,作保存处理...' self.goto_init() time.sleep(30) return False break except Exception, e: print 'get_page err: ', e return False break
def swap_o_d(i_dstCity, i_orgcity, i_startDate): # 出发城市代码 # orgcity = "PEK" # 到达城市代码 # dstCity = "CGO" # 将城市名转换成城市代码 (orgcity, dstCity) = city_to_code(i_orgcity, i_dstCity) if (orgcity == -1 or dstCity == -1): return -1 # 出发日期,格式为:2016-05-15 startDate = i_startDate # 保存数据文件名 currdatatime = datetime.datetime.now().strftime('%Y-%m-%d') filename = "ChuanH-" + currdatatime + "-" + startDate + ".csv" print "正在抓取和解析数据...".encode("GBK") for n in range(9): loaddata, status_code = data_Crawling(orgcity, dstCity, startDate, i_orgcity, i_dstCity) #超时异常处理,若是超时将等待6分钟 if (loaddata == -1 or status_code == -1): print "请求超时,下次请求将在5s后进行,请耐心等待...".encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue elif (loaddata == -2 or status_code == -2): print "连接中断,下次请求将在5s后进行,请耐心等待...".encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue try: loaddata = json.loads(loaddata) flag = loaddata['Result'] except ValueError: #print loaddata print '页面抓取失败..,尝试重新抓取'.encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue except TypeError: #print loaddata print '页面抓取失败..,尝试重新抓取'.encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue except KeyError: #print loaddata print '页面抓取失败..,尝试重新抓取'.encode("GBK") aa = Adsl() aa.reconnect() time.sleep(5) continue #print loaddata if (flag == True): IsDirect = loaddata['IsDirect'] if (IsDirect == True): break else: return 0 else: return 0 # if (flag): # return 0 # else: # print("抓取失败,3s后尝试第%d次抓取数据....." % n) # #print loaddata,status_code # time.sleep(3) if (n >= 8): print "抓取失败次数在太多了,无能为力...".encode("GBK") #fairleCount = fairleCount+1 return -1 # fp = open("data2.html",'w') # fp.write(loaddata) # fp.close() # fpr = open("data2.html") # loaddata = fpr.read() # fpr.close() data_analyze(loaddata, filename, i_orgcity, i_dstCity)
def __init__(self, base_url='', temp_num=50, bt=0): try: os.system('taskkill /im chrome.exe /f') time.sleep(randint(0, 5) * 0.1) self.U_A = [ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36', 'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10', 
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36' ] self.u_a = self.U_A[randint(0, len(self.U_A) - 1)] self.cap = webdriver.DesiredCapabilities.PHANTOMJS self.cap["phantomjs.page.settings.resourceTimeout"] = 1000 self.cap["phantomjs.page.settings.loadImages"] = True self.cap["phantomjs.page.settings.disk-cache"] = True self.cap["phantomjs.page.settings.userAgent"] = self.u_a 
self.cap["phantomjs.page.customHeaders.User-Agent"] = self.u_a if bt == 1: self.driver = webdriver.PhantomJS( executable_path='C:/phantomjs.exe', service_log_path='C:/ghostdriver.log', desired_capabilities=self.cap) #,desired_capabilities=cap elif bt == 0: # use chrome self.chromedriver = "C:\Users\Administrator\AppData\Local\Google\Chrome\Application\Chromedriver.exe" self.chrome_options = webdriver.ChromeOptions() self.chrome_options.add_argument('--headless') self.chrome_options.add_argument('--user-agent=%s' % self.u_a) self.driver = webdriver.Chrome( executable_path=self.chromedriver, chrome_options=self.chrome_options) #self.driver = webdriver.Chrome(chromedriver) #self.driver = webdriver.PhantomJS(executable_path='C:/phantomjs.exe', # service_log_path='C:/ghostdriver.log') # use ie # IEdriver='C:\Program Files (x86)\Internet Explorer\IEDriverServer.exe' # os.environ["webdriver.ie.driver"] = IEdriver # self.driver = webdriver.Ie(IEdriver) #self.driver = webdriver.Firefox() self.driver.set_page_load_timeout(500) self.base_url = base_url if self.base_url != '': self.driver.get(self.base_url) time.sleep(0.5) if len(self.driver.page_source) < 50: aa = Adsl() aa.connect() self.temp = 0 self.temp_change = 0 self.temp_num = temp_num if self.temp_num == None: self.temp_num = 10 # 三种不同跳转方式 #self.mode = 0 except Exception, e: print 'downloader init failed...', e
def get_page(self, url, pre_url, mutex): while True: #if self.mode == 0 and 'detail' in url: # self.mode = self.choose_mode(url) #if 'detail' in url: # if self.mode == 1: # url = url # elif self.mode == 2: # url = re.sub(re.compile('&q=.*?&'),'&',url) # url = re.sub(re.compile('&areaId=.*'),'',url) # elif self.mode == 3: # url = re.sub(re.compile('&q=.*?&'),'&',url) # elif self.mode == 4: # url = re.sub(re.compile('&q=.*?&'),'&',url)+'&sku_properties=' try: # print u'开始加载网页...' if len(pre_url) > 0: for each_url in pre_url: self.driver.get(each_url) time.sleep(0.5) #print 'current_url',url #if 'search_shopitem' in url: # time.sleep(randint(10,30)) self.driver.get(url) ''' my_headers = { 'Host': 'tmall.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate', 'Referer': 'http://www.baidu.com', 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', } ''' time.sleep(randint(0, 15) * 0.1) #print 'current_url',self.driver.current_url res_get_page = self.driver.page_source res_get_page = lxml.html.document_fromstring(res_get_page) page_source = None if 'detail' not in url: if len( self.driver.page_source ) > 40 and 'login' not in self.driver.current_url and 'sec.taobao' not in self.driver.current_url: if u'search_shopitem' in url: products = None try: products = self.driver.find_element_by_xpath( '//div[@class="product"]') except: pass if products != None: try: ele = self.driver.find_element_by_xpath( '//div[@class="product"][1]/div/p[@class="productTitle"]/a' ) #print ele #ele.click() ele.send_keys(Keys.ENTER) #time.sleep(5) #print 'current',self.driver.current_url #self.driver.close() #self.driver.back() time.sleep(5) except: pass #print 'current',self.driver.current_url page_source = self.driver.page_source self.goto_init() #print self.driver.page_source 
#time.sleep(100) if page_source is None: page_source = self.driver.page_source return page_source else: if len( res_get_page.xpath( '//span[@class="tm-price"]/text()') ) != 0 or len( res_get_page.xpath( '//li[@class="tm-ind-item tm-ind-sellCount "]/div[@class="tm-indcon"]/span[@class="tm-count"]/text()' )) != 0: self.goto_init() return self.driver.page_source pai_mai = None try: pai_mai = res_get_page.xpath( '//span[@class="price"]/text()')[0] except: pass if pai_mai != None: self.goto_init() return self.driver.page_source print 'length:', len( self.driver.page_source ), 'current_url:', self.driver.current_url[: 20], 'origon_url:', url[: 20] mutex.acquire() print u'重新拨号...' aa = Adsl() aa.reconnect() self.temp += 1 self.goto_home() print 'temp:', self.temp mutex.release() if self.temp > 4: if self.temp_change < 5 or self.temp % 10 == 0: print 'kill a driver' self.driver.quit() self.u_a = self.U_A[randint(0, len(self.U_A) - 1)] #self.cap["phantomjs.page.settings.userAgent"] = self.u_a #self.cap["phantomjs.page.customHeaders.User-Agent"] = self.u_a #self.driver = webdriver.PhantomJS(executable_path='C:/phantomjs.exe',service_log_path='C:/ghostdriver.log',desired_capabilities=self.cap) self.chrome_options.add_argument('--user-agent=%s' % self.u_a) self.driver = webdriver.Chrome( executable_path=self.chromedriver, chrome_options=self.chrome_options) self.temp_change += 1 if self.temp % 10 == 0 and self.temp != 0: wait = randint((self.temp / 10) * 300, (self.temp / 10) * 600) print 'wait %s s...' % str(wait) print 'kill a driver' self.driver.quit() self.u_a = self.U_A[randint(0, len(self.U_A) - 1)] self.chrome_options.add_argument('--user-agent=%s' % self.u_a) time.sleep(wait) self.driver = webdriver.Chrome( executable_path=self.chromedriver, chrome_options=self.chrome_options) if self.temp == self.temp_num: print u'尝试%s次均失败,作保存处理...' 
% (self.temp_num - 1) self.driver.quit() self.u_a = self.U_A[randint(0, len(self.U_A) - 1)] self.chrome_options.add_argument('--user-agent=%s' % self.u_a) self.goto_init() #self.mode = 0 time.sleep(randint(9, 15) * 60) self.driver = webdriver.Chrome( executable_path=self.chromedriver, chrome_options=self.chrome_options) except Exception, e: print 'get_page err: ', e return False