def getcodeinseturl(url, num_retries=2):
    """Open a WeChat official-account article-list page with PhantomJS and
    return the article link URLs found on it.

    url         -- article-list page URL to load
    num_retries -- remaining retry attempts when loading/parsing fails

    Returns a list of href strings, or None when the expected elements are
    absent (or an error occurs) and no retries remain.
    """
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Randomize the UA per request and keep the session lightweight.
        dcap["phantomjs.page.settings.userAgent"] = UserAgents.getRandAgent()
        dcap["phantomjs.page.settings.resourceTimeout"] = "5000"
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        try:
            driver.implicitly_wait(5)
            driver.set_page_load_timeout(10)
            driver.set_script_timeout(10)
            driver.get(url)
            # Open the Sogou WeChat official-account article list page.
            print(u"open public code page")
            bsObj = BeautifulSoup(driver.page_source, 'html.parser')
            # Missing title elements usually means a block/captcha page;
            # retry with a decremented budget. (The original had a stray
            # no-op expression `(url)` here where the retry call belonged.)
            if not isElementExist(bsObj, 'weui_media_title'):
                if num_retries > 0:
                    return getcodeinseturl(url, num_retries - 1)
                return None
            urlList = driver.find_elements_by_class_name('weui_media_title')
            # BUG FIX: the DOM attribute is 'href'; the original asked for
            # 'hrefs', which made get_attribute return None for every link.
            tmpurllist = [i.get_attribute('href') for i in urlList]
            # Return the article links from the official account.
            print(u"return public code article list")
            return tmpurllist
        finally:
            # Always release the PhantomJS process; the original leaked it
            # on the success path and on exceptions.
            driver.quit()
    except Exception as e:
        print(e)
        # Article-list handling failed; retry while attempts remain.
        print(u"artilce note")
        if num_retries > 0:
            return getcodeinseturl(url, num_retries - 1)
def download(url, num_retries=2, user_agent=''):
    """Fetch a Sogou official-account search page with PhantomJS and
    return its HTML.

    url         -- search page URL to load
    num_retries -- remaining retry attempts on failure
    user_agent  -- unused; kept for backward compatibility (a random UA
                   from UserAgents is always applied instead)

    Returns the page source encoded as UTF-8 bytes, or None when the
    'wx-rb' marker is absent (account not found / IP rate-limited) and no
    retries remain.
    """
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Randomize the UA per request and keep the session lightweight.
        dcap["phantomjs.page.settings.userAgent"] = UserAgents.getRandAgent()
        dcap["phantomjs.page.settings.resourceTimeout"] = "5000"
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        try:
            driver.implicitly_wait(5)
            driver.set_page_load_timeout(10)
            driver.set_script_timeout(10)
            driver.get(url)
            bsObj = BeautifulSoup(driver.page_source, 'html.parser')
            # 'wx-rb' absent means the account was not found or the IP is
            # rate-limited. BUG FIX: the original called download(url) with
            # a FRESH retry budget (unbounded recursion risk) and discarded
            # its result; retry with a decremented budget and return it.
            if not isElementExist(bsObj, 'wx-rb'):
                if num_retries > 0:
                    return download(url, num_retries - 1, user_agent)
                return None
            # Return the official-account search page HTML.
            return driver.page_source.encode('utf-8')
        finally:
            # Always release the PhantomJS process; the original leaked it
            # on the success path and on exceptions.
            driver.quit()
    except Exception as e:
        print(e)
        # Any error (timeout, navigation failure): retry while attempts
        # remain.
        if num_retries > 0:
            return download(url, num_retries - 1, user_agent)