# Beispiel #1 (score: 0)
def getcodeinseturl(url, num_retries=2):
    """Open a Sogou WeChat article-list page in PhantomJS and return the
    article links found on it.

    Parameters:
        url: the article-list page URL to load.
        num_retries: remaining retry attempts when the expected elements
            are missing or an exception occurs.

    Returns:
        list[str] | None: the ``href`` of each ``weui_media_title``
        element, or ``None`` when the elements never appear / retries
        are exhausted.
    """
    driver = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Randomize the UA per request to reduce the chance of being blocked.
        dcap["phantomjs.page.settings.userAgent"] = UserAgents.getRandAgent()
        dcap["phantomjs.page.settings.resourceTimeout"] = "5000"
        dcap["phantomjs.page.settings.loadImages"] = False

        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        driver.implicitly_wait(5)
        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.get(url)

        print(u"open public code page")

        bsObj = BeautifulSoup(driver.page_source, 'html.parser')
        # No article-title elements usually means an anti-crawler / captcha
        # page was served instead of the real list page.
        if not isElementExist(bsObj, 'weui_media_title'):
            # BUG FIX: the original had a bare no-op expression ``(url)``
            # here where a retry call was clearly intended (see the sibling
            # ``download`` function). Retry with a decremented budget so
            # the recursion always terminates.
            if num_retries > 0:
                return getcodeinseturl(url, num_retries - 1)
            return None

        # BUG FIX: the HTML attribute is ``href``; the original asked for
        # ``hrefs``, which yields None for every element.
        elements = driver.find_elements_by_class_name('weui_media_title')
        tmpurllist = [el.get_attribute('href') for el in elements]

        print(u"return public code article list")
        return tmpurllist

    except Exception as e:
        print(e)
        print(u"artilce note")
        if num_retries > 0:
            return getcodeinseturl(url, num_retries - 1)
        return None
    finally:
        # Always release the PhantomJS process; the original leaked it on
        # both the success path and on exceptions.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
# Beispiel #2 (score: 0)
def download(url, num_retries=2, user_agent=''):
    """Load a Sogou WeChat search-result page in PhantomJS and return
    its HTML.

    Parameters:
        url: the search-result page URL to load.
        num_retries: remaining retry attempts on failure.
        user_agent: explicit User-Agent string; when empty (the default)
            a random agent is chosen, matching the original behavior.
            (The original accepted this parameter but ignored it.)

    Returns:
        bytes | None: the UTF-8 encoded page source, or ``None`` when
        the expected element never appears / retries are exhausted.
    """
    driver = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Honor an explicitly supplied User-Agent; otherwise randomize it
        # per request to reduce the chance of being blocked.
        dcap["phantomjs.page.settings.userAgent"] = (
            user_agent if user_agent else UserAgents.getRandAgent())
        dcap["phantomjs.page.settings.resourceTimeout"] = "5000"
        dcap["phantomjs.page.settings.loadImages"] = False

        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        driver.implicitly_wait(5)
        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.get(url)

        bsObj = BeautifulSoup(driver.page_source, 'html.parser')

        # The 'wx-rb' element marks a valid result page; its absence means
        # the account was not found or the IP is rate-limited.
        if not isElementExist(bsObj, 'wx-rb'):
            # BUG FIX: the original retried with ``download(url)``, which
            # reset num_retries to its default (risking unbounded recursion
            # on a permanently blocked IP) and discarded the result.
            # Decrement the budget and propagate the retried value.
            if num_retries > 0:
                return download(url, num_retries - 1, user_agent)
            return None

        # Return the page HTML as UTF-8 bytes.
        return driver.page_source.encode('utf-8')
    except Exception as e:
        print(e)
        if num_retries > 0:
            return download(url, num_retries - 1, user_agent)
        return None
    finally:
        # Always terminate the PhantomJS process; the original leaked it
        # on the success path and on exceptions.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass