Beispiel #1
0
class BaiduNews:
    def __init__(self):
        self.SqlH = SqlHelper()
        self.SqlH.init_db('baiduNews')

    def news_crawl(self):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()

        type = ('focus-top', 'local_news', 'guonei', 'guojie', 'caijing',
                'yule', 'tiyu', 'col-auto', 'col-house', 'hulianwang',
                'internet-plus', 'col-tech', 'col-edu', 'col-game',
                'col-discovery', 'col-healthy', 'col-lady', 'shehui', 'junshi',
                'tupianxinwen')
        browser = webdriver.PhantomJS()
        browser.get('http://news.baidu.com/')
        js1 = 'return document.body.scrollHeight'
        js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        old_scroll_height = 0
        while (browser.execute_script(js1) > old_scroll_height):
            old_scroll_height = browser.execute_script(js1)
            browser.execute_script(js2)
            time.sleep(0.8)
        html = browser.page_source
        tree = etree.HTML(html)
        updatetime = time.strftime('%Y/%m/%d %H:%M:%S',
                                   time.localtime(time.time()))
        #print(updatetime)
        for item in type:
            regularExpressionUrl = '//div[@id="' + item + '"]//li/a/@href'
            regularExpressionText = '//div[@id="' + item + '"]//li/a/text()'
            news_url = tree.xpath(regularExpressionUrl)
            news_text = tree.xpath(regularExpressionText)
            #print('url_len'+str(len(news_url)))
            # print('text_len'+str(len(news_text)))
            for i in range(0, len(news_text)):
                if 'http' in news_url[i]:
                    newsContent = {
                        'title': news_text[i],
                        'url': news_url[i],
                        'content': '',
                        'category': item,
                        'secCategory': '',
                        'image': '',
                        'time': updatetime,
                        'from': 'BD'
                    }
                    if self.SqlH.count({'title': news_text[i]}) == 0:
                        self.SqlH.insert(newsContent)

        # 首页热点新闻模块
        browser.quit()
Beispiel #2
0
class WXSpider():
    def __init__(self):
        self.type=['hot','local','shehui','guonei','guoji','recomment','junshi','finance','technology','sports','fashionbang','fashionbang','auto_moto','fangcan','technology','yangshengtang']
        self.SqlH= SqlHelper()
        self.SqlH.init_db('weixin')
        self.page=2
        self.current_type=''
    def spider(self,inde=None):
        dcap=dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = config.get_header()
        browser = webdriver.PhantomJS(desired_capabilities=dcap)
        #browser = webdriver.Chrome('/home/caidong/developProgram/selenium/chromedriver')
        browser.get('http://news.163.com/')
        #print(browser.page_source)
        for i in range(1,10):
            if i<9:
                bt_mouseover = browser.find_element_by_xpath('//li[@class="nav_item"]['+str(i)+']/a')
                actions =ActionChains(browser)
                actions.move_to_element(bt_mouseover).perform()
                browser.implicitly_wait(5)
                time.sleep(5)
                html = browser.page_source
                #print(html)
                self.current_type=self.type[i]
                self.parse(html)
            else:
                more = browser.find_elements_by_xpath('//div[@class="more_list"]/a')
                i=1
                for item in more:
                    if i < 2:
                        bt_mouseover = browser.find_element_by_xpath('//a[@class="more"]')
                    else:
                        bt_mouseover = browser.find_element_by_xpath('//a[@class="more more_current"]')
                    i += 1
                    actions = ActionChains(browser)
                    actions.move_to_element(bt_mouseover).perform()
                    time.sleep(60)
                    browser.implicitly_wait(50)
                    try:
                        item.click()
                    except:
                        print ("click error")
                    browser.implicitly_wait(15)
                    html = browser.page_source
                    self.current_type = self.type[i+6]
                    print(self.current_type)
                    #print(html)
                    self.parse(html)
                    #actions.click(item)
                    time.sleep(2)

       # browser.get_screenshot_as_file('1.png')
        #print(browser.page_source)
        #exit()
        # if index <= 6:
        #     bt_element=('//div[@class="fieed-box"]/a[@id="%s"]'%xpath_str)
        # else:
        #     actions = ActionChains(browser)
        #     more = browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="more_anchor"]')
        #     actions.move_to_element(more).perform()
        #     bt_element=('//div[@class="tab-box-pop"]/a[@id="%s"]'%xpath_str)
        # #if index > 6:
          #  browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="pc_6"]').click()
           # time.sleep(2)


        #time.sleep(2)
        #actions.move_to_element(more).perform()
        # browser.find_element_by_xpath(bt_element).click()
        # time.sleep(2)
        #
        # #browser.get_screenshot_as_file('tex.png')
        # js1 = 'return document.body.scrollHeight'
        # js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        # old_scroll_height = 0
        # while(browser.execute_script(js1) > old_scroll_height):
        #     old_scroll_height = browser.execute_script(js1)
        #     browser.execute_script(js2)
        #     time.sleep(0.8)
        # for i in range(self.page):
        #     load_more_xpath='//div[@class="jzgd"]/a'
        #     browser.find_element_by_xpath(load_more_xpath).click()
        #     time.sleep(2)

        browser.quit()
    def parse(self,html):
        tree = etree.HTML(html)
        updatetime = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time()))
        news_content = tree.xpath("//div[@class='data_row news_photoview clearfix ']|//div[@class='data_row news_article clearfix ']")
        for item in news_content:
            content = etree.ElementTree(item)
            imgUrl =content.xpath("//img/@src")
            txtTitle = content.xpath("//h3/a/text()")
            detail_url = content.xpath("//h3/a/@href")
            print(imgUrl)
            print(txtTitle)
            print(detail_url)
        wxContent = {'title': txtTitle, 'url': detail_url, 'content': '',
                     'category': self.current_type,
                     'secCategory': '', 'image': imgUrl, 'time': updatetime, 'from': 'WX'}
        if self.SqlH.count({'title': txtTitle}) == 0:
            self.SqlH.insert(wxContent)