def get_user_info(user, fail_time=0): print u'找到用户',user ,u'的评论, 正在查询', user, u'的星级' base_url = config.STAR_INFO_URL url = base_url #+ quote(user.encode('utf-8', 'ignore')) allow_star = range(1, config.MAX_STAR + 1) try: time.sleep(1) driver = config.DRIVER driver.get(url) element = driver.find_element_by_id('txt_name') element.send_keys(user, Keys.ENTER) WebDriverWait(driver, config.TIMEOUT).until( EC.visibility_of_element_located((By.ID, "buyer_ratecount")) ) html = driver.page_source time.sleep(3) pattern = re.compile('<span id="buyer_ratecount.*?src="(.*?)gif', re.S) result = re.search(pattern, html) print result.group(0) src = result.group(0) #driver.get('http://www.baidu.com') for allow in allow_star: if 'b_red_'+ str(allow) in src: print u'该用户', user, u'星级符合要求' return True print u'该用户', user, u'星级不符合要求' return False except TimeoutException: print u'查询失败, 正在重试' print u'请打开 http://www.taoyitu.com/ 输入验证码,即可迅速解决问题' fail_time = fail_time + 1 if fail_time == 3: if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此用户' return False return get_user_info(user, fail_time) except (socket.error, urllib2.URLError): print u'查询失败, 跳过该用户' return False except NoSuchElementException: print u'查询星级失败, 正在重试' print u'请打开 http://www.taoyitu.com/ 输入验证码,即可迅速解决问题' if fail_time >=2 : print u'请求超时, 正在切换代理, 继续重试' update_proxy_pool() new_proxy_driver() else: print u'请求超时,正在切换会话重试' new_driver() time.sleep(3) fail_time = fail_time + 1 if fail_time == 5: if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此用户' return False return get_user_info(user, fail_time)
def scrap(url, fail_time=0, use_driver=True): timeout = config.TIMEOUT print u'正在请求', url, u', 请稍后...' try: if use_driver: driver = config.DRIVER driver.get(url) WebDriverWait(driver, timeout).until( EC.presence_of_element_located((By.ID, "content")) # EC.presence_of_element_located((By.ID, "J_TabRecommends")) ) result = get_recommends(driver, config.MAX_TRY) if result: html = driver.page_source else: html = requests.get(url).text if html: print u'查找成功' print "html:" print html return html else: print u'请求超时, 获取失败, 此页面不存在相应内容' except TimeoutException: if fail_time >= 2: print u'请求超时, 正在切换代理, 继续重试' update_proxy_pool() new_proxy_driver() else: print u'请求超时,正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前页面请求失败数', fail_time if fail_time == config.MAX_FAIL: update_proxy_pool() if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False scrap(url, fail_time) except (socket.error, urllib2.URLError): print u'请求页面过于频繁, 请求被中断, 正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前页面请求失败数', fail_time if fail_time == config.MAX_FAIL: if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False scrap(url, fail_time) except (WindowsError, OSError, Exception): print u'未知错误, 跳过继续运行'
def scrap(url, fail_time=0): timeout = config.TIMEOUT print u'正在请求', url, u', 请稍后...' try: driver = config.DRIVER driver.get(url) WebDriverWait(driver, timeout).until( EC.presence_of_element_located((By.ID, "J_TabRecommends")) ) result = get_recommends(driver, config.MAX_TRY) if result: print u'查找成功' html = driver.page_source parse_content(html) else: print u'请求超时, 获取失败, 此页面不存在相应内容' except TimeoutException: if fail_time >=2 : print u'请求超时, 正在切换代理, 继续重试' update_proxy_pool() new_proxy_driver() else: print u'请求超时,正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前页面请求失败数', fail_time if fail_time == config.MAX_FAIL: update_proxy_pool() if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False scrap(url, fail_time) except (socket.error, urllib2.URLError): print u'请求页面过于频繁, 请求被中断, 正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前页面请求失败数', fail_time if fail_time == config.MAX_FAIL: if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False scrap(url, fail_time) except (WindowsError, OSError, Exception): print u'未知错误, 跳过继续运行'
def get_product(url, fail_time=0): try: print u'当前宝贝在第', config.NOW_COUNT + 1, u'个网页, 共', config.TOTAL_COUNT, u'个网页' time.sleep(1) driver = config.DRIVER driver.get(url) WebDriverWait(driver, config.TIMEOUT).until( EC.presence_of_element_located((By.TAG_NAME, "title")) ) html = driver.page_source doc = pq(html) title = doc('title').text() return title except TimeoutException: if fail_time >= 2: print u'请求超时, 正在切换代理, 继续重试' update_proxy_pool() new_proxy_driver() else: print u'请求超时,正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前失败次数', fail_time if fail_time == config.MAX_FAIL: update_proxy_pool() if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False get_product(url, fail_time) except (socket.error, urllib2.URLError): print u'请求宝贝过于频繁, 请求被中断, 正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前失败次数', fail_time if fail_time == config.MAX_FAIL: if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False get_product(url, fail_time) except (WindowsError, OSError, Exception): print u'未知错误, 跳过继续运行'
def get_product(url, fail_time=0): try: print u'当前宝贝在第', config.NOW_COUNT + 1, u'个网页, 共', config.TOTAL_COUNT, u'个网页' time.sleep(1) driver = config.DRIVER driver.get(url) WebDriverWait(driver, config.TIMEOUT).until( EC.presence_of_element_located((By.TAG_NAME, "title"))) html = driver.page_source doc = pq(html) title = doc('title').text() return title except TimeoutException: if fail_time >= 2: print u'请求超时, 正在切换代理, 继续重试' update_proxy_pool() new_proxy_driver() else: print u'请求超时,正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前失败次数', fail_time if fail_time == config.MAX_FAIL: update_proxy_pool() if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False get_product(url, fail_time) except (socket.error, urllib2.URLError): print u'请求宝贝过于频繁, 请求被中断, 正在切换会话重试' new_driver() fail_time = fail_time + 1 if config.CONSOLE_OUTPUT: print u'当前失败次数', fail_time if fail_time == config.MAX_FAIL: if config.CONSOLE_OUTPUT: print u'失败次数过多, 跳过此请求' return False get_product(url, fail_time) except (WindowsError, OSError, Exception): print u'未知错误, 跳过继续运行'
# -*- coding: utf-8 -*-
from proxy.getproxy import update_proxy_pool
from proxy.updateproxy import update_proxy


def _main():
    """Script entry point: call ``update_proxy_pool()`` once."""
    update_proxy_pool()


if __name__ == "__main__":
    _main()