Code Example #1
class apiNews:
    def __init__(self):
        self.sqlhelper = SqlHelper()

    def queryNews(self, category, pz, page, db_name):
        self.sqlhelper.init_db(db_name)
        newsJson = self.sqlhelper.select(pz, {'category': category}, page)
        self.sqlhelper.close_client()
        return newsJson
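A minimal usage sketch of the class above (the argument values are illustrative; SqlHelper is presumably the MongoHelper shown in Code Example #13, and a local MongoDB instance is assumed to be running):

api = apiNews()
# Fetch page 1 with 20 items from the 'guonei' category of the 'baiduNews' database.
news = api.queryNews(category='guonei', pz=20, page=1, db_name='baiduNews')
print(news)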
Code Example #2
class Article():
    def __init__(self):
        self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
        self.start_url = 'https://zhuanlan.zhihu.com/yinjiaoshou886/answer'
        self.browser = webdriver.Chrome(
            executable_path='/home/caidong/developProgram/selenium/chromedriver'
        )
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu', 'zhihu_all')
        self.base_url = 'https://www.zhihu.com'
        self.user_home_url = ''
        self.current = 1

    def crawl(self, url):
        self.browser.get(url)
        if self.browser.current_url == self.black_page:
            print("输入验证")
            sys.exit()

        if self.current == 2:
            time.sleep(30)
        self.current = self.current + 1

        time.sleep(3)
        self.browser.implicitly_wait(3)
        self.parse_special_column(self.browser.page_source,
                                  self.browser.current_url)

    def parse_special_column(self, html, url):
        tree = etree.HTML(html)
        comment_list = tree.xpath('//div[@class="ContentItem-actions"]')
        if len(comment_list) > 4:
            comment_list = comment_list[1:4]
        article_list = []
        for item in comment_list:
            item = etree.ElementTree(item)
            answer_comment = item.xpath(
                '//button[@class="Button ContentItem-action Button--plain"]/text()'
            )[0]
            # Button text starting with '添加' ("Add a comment") means the answer has no comments yet
            if str(answer_comment).startswith('添加'):
                answer_comment = 0
            else:
                # Strip the trailing "条评论" ("comments") suffix, keeping only the count
                answer_comment = str(answer_comment).strip()[:-3]
            print(answer_comment)
            article_list.append(answer_comment)

        if len(article_list) == 0:
            article_list = "none"
        self.SqlH.update({"user_home_url": self.user_home_url}, {
            "article_comment": article_list,
        })
        print(article_list)
Code Example #3
File: baiduCrawl.py Project: caidongHui/baiduNews
class BaiduNews:
    def __init__(self):
        self.SqlH = SqlHelper()
        self.SqlH.init_db('baiduNews')

    def news_crawl(self):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()

        type = ('focus-top', 'local_news', 'guonei', 'guojie', 'caijing',
                'yule', 'tiyu', 'col-auto', 'col-house', 'hulianwang',
                'internet-plus', 'col-tech', 'col-edu', 'col-game',
                'col-discovery', 'col-healthy', 'col-lady', 'shehui', 'junshi',
                'tupianxinwen')
        browser = webdriver.PhantomJS()
        browser.get('http://news.baidu.com/')
        js1 = 'return document.body.scrollHeight'
        js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        old_scroll_height = 0
        while (browser.execute_script(js1) > old_scroll_height):
            old_scroll_height = browser.execute_script(js1)
            browser.execute_script(js2)
            time.sleep(0.8)
        html = browser.page_source
        tree = etree.HTML(html)
        updatetime = time.strftime('%Y/%m/%d %H:%M:%S',
                                   time.localtime(time.time()))
        #print(updatetime)
        for item in type:
            regularExpressionUrl = '//div[@id="' + item + '"]//li/a/@href'
            regularExpressionText = '//div[@id="' + item + '"]//li/a/text()'
            news_url = tree.xpath(regularExpressionUrl)
            news_text = tree.xpath(regularExpressionText)
            #print('url_len'+str(len(news_url)))
            # print('text_len'+str(len(news_text)))
            for i in range(0, len(news_text)):
                if 'http' in news_url[i]:
                    newsContent = {
                        'title': news_text[i],
                        'url': news_url[i],
                        'content': '',
                        'category': item,
                        'secCategory': '',
                        'image': '',
                        'time': updatetime,
                        'from': 'BD'
                    }
                    if self.SqlH.count({'title': news_text[i]}) == 0:
                        self.SqlH.insert(newsContent)

        # Homepage hot-news section
        browser.quit()
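PhantomJS support has been removed from current Selenium releases; a roughly equivalent headless-Chrome setup (a sketch only, assuming Selenium 3 and a chromedriver binary on PATH) would be:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # run Chrome without a visible window
options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=options)  # on Selenium 4 the keyword is options=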
Code Example #4
 def __init__(self):
     self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
     self.start_url = 'https://zhuanlan.zhihu.com/yinjiaoshou886/answer'
     self.browser = webdriver.Chrome(
         executable_path='/home/caidong/developProgram/selenium/chromedriver'
     )
     self.SqlH = SqlHelper()
     self.SqlH.init_db('zhihu', 'zhihu_all')
     self.base_url = 'https://www.zhihu.com'
     self.user_home_url = ''
     self.current = 1
Code Example #5
class Zhihuhomepage():
    def __init__(self):
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu')
        self.base_url = 'https://www.zhihu.com'

    def gethomepage(self, url, user_name):
        req = request.Request(url=url, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        })
        html = request.urlopen(req, timeout=5).read().decode('utf-8')
        tree = etree.HTML(html)
        collecter = tree.xpath("//div[@class='Profile-sideColumnItemValue']/text()")
        # time.sleep(2)
        if collecter:
            save = collecter[0][:-3].strip()
        else:
            save = 0
        print(user_name, collecter)
        self.updateCollect(user_name,save)
        #print(html)
    def updateCollect(self,user_name,save):
        self.SqlH.update({'user_name': user_name}, {'collect': save})
    def fromdb(self):
        tottal_s=self.SqlH.count(condition={'collect':'none'})
        print(tottal_s)
        for c_page in range(tottal_s):
            time.sleep(2)
            result=self.SqlH.select(conditions={'collect':'none'},count=1,page=c_page)
            self.gethomepage(self.base_url+result[0]['home_page'],result[0]['user_name'])
        pass
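The request module used in gethomepage is presumably urllib.request; a minimal sketch of the assumed import and fetch pattern (the URL is illustrative):

from urllib import request
from lxml import etree

req = request.Request(url='https://www.zhihu.com/people/kaifulee',
                      headers={'User-Agent': 'Mozilla/5.0'})
html = request.urlopen(req, timeout=5).read().decode('utf-8')
tree = etree.HTML(html)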
Code Example #6
 def __init__(self):
     self.start_url = 'https://www.zhihu.com/people/kaifulee/activities'
     self.base_url = 'https://www.zhihu.com'
     self.type = [
         'hot', 'local', 'shehui', 'guonei', 'guoji', 'recomment', 'junshi',
         'finance', 'technology', 'sports', 'fashionbang', 'fashionbang',
         'auto_moto', 'fangcan', 'technology', 'yangshengtang'
     ]
     self.SqlH = SqlHelper()
     self.SqlH.init_db('zhihu')
     self.page = 2
     self.totla_url_set = set()
     self.wait_use_url_set = set()
     self.current_type = ''
Code Example #7
 def __init__(self):
     self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
     self.start_url = 'https://www.zhihu.com/people/kaifulee/followers?page=25583'
     #self.start_url = 'https://www.zhihu.com/people/ji-da-fa-37/activities'
     self.base_url = 'https://www.zhihu.com'
     self.SqlH = SqlHelper()
     self.SqlH.init_db('zhihu','zhihu_48000')
     #self.browser = webdriver.PhantomJS()
     # proxy = {'address': '60.168.104.30:3128',
     #          'username': '******',
     #          'password': '******'
     #           }
     # capabilities = dict(DesiredCapabilities.CHROME)
     # capabilities['proxy'] = {'proxyType': 'MANUAL',
     #                          'httpProxy': proxy['address'],
     #                          'ftpProxy': proxy['address'],
     #                          'sslProxy': proxy['address'],
     #                          'noProxy': '',
     #                          'class': "org.openqa.selenium.Proxy",
     #                          'autodetect': False}
     #
     # capabilities['proxy']['httpUsername'] = proxy['username']
     # capabilities['proxy']['httpPassword'] = proxy['password']
     # chromeOptions = webdriver.ChromeOptions()
     # chromeOptions.add_argument('--proxy-server=http://60.168.104.30:3128')
     #self.browser = webdriver.Chrome(chrome_options=chromeOptions,executable_path='/home/caidong/developProgram/selenium/chromedriver')
     #self.browser = webdriver.PhantomJS()
     #cookies = ZhihuLogin().login()
     #print(cookies)
     # (the PhantomJS driver created here is immediately replaced by the Chrome driver below)
     self.browser = webdriver.PhantomJS()
     self.browser = webdriver.Chrome(executable_path='/home/caidong/developProgram/selenium/chromedriver')
     #for cookie in cookies:
       #  self.browser.add_cookie({cookie['name']:cookie['value']})
     #self.browser.add_cookie(cookie)
     time.sleep(5)
     print('cookie',self.browser.get_cookies())
     #print(self.browser.get_cookies())
     #self.browser.add_cookie({"cookie":'_zap=b24c85f0-aae0-456a-ba87-e0919de79409; __utma=243313742.618834370.1505397831.1505397831.1505431589.2; __utmz=243313742.1505397831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); d_c0="AJCCExnEYAyPTuiuB47mCQN_anS_LW2ZmQI=|1505432287"; q_c1=f92e81f1440d49eca643b9bd71df1d06|1505471670000|1502586350000; aliyungf_tc=AQAAABpahiv+pQIA4wmi0wpuOA0ptCdt; __utma=51854390.226003310.1505817316.1505817316.1505817316.1; __utmc=51854390; __utmz=51854390.1505817316.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.000--|3=entry_date=20170813=1; XSRF-TOKEN=2|02bd5b9f|30893afa3ad96af92f8d3ffb67906faa338d76fe308d3fb267de6cad358569a837dc39ae|1505824255; _xsrf=24ae8d1f-0dde-4510-a20d-ec7278275ab1; l_cap_id="NDYzOWZmNjBmZDhjNDBkZWI5MDg0NjYyZDk4YTk2OTA=|1505824625|220e4527cbfe214589599d071685e4c7f62143fc"; r_cap_id="NWJhOTRmYzg2NTVlNDczY2ExZWY3YzgxNGQ2ZmRmM2I=|1505824625|b050327da2a8dedc37a8e744640b60b553f3b771"; cap_id="YjcyNGZkYjFlY2JkNDU3ZWFlYmQ0NjQ3ZDJmNDcwZjk=|1505824625|5804f3f4999cf311334c3664f2e41ad2d4d93029'})
     self.start_page = 48000
     self.end_page = 47000
Code Example #8
 def __init__(self):
     self.SqlH = SqlHelper()
     self.SqlH.init_db('zhihu')
     self.base_url = 'https://www.zhihu.com'
Code Example #9
class ZHSpider():
    def __init__(self):
        self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
        self.start_url = 'https://www.zhihu.com/people/kaifulee/followers?page=25583'
        #self.start_url = 'https://www.zhihu.com/people/ji-da-fa-37/activities'
        self.base_url = 'https://www.zhihu.com'
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu','zhihu_48000')
        #self.browser = webdriver.PhantomJS()
        # proxy = {'address': '60.168.104.30:3128',
        #          'username': '******',
        #          'password': '******'
        #           }
        # capabilities = dict(DesiredCapabilities.CHROME)
        # capabilities['proxy'] = {'proxyType': 'MANUAL',
        #                          'httpProxy': proxy['address'],
        #                          'ftpProxy': proxy['address'],
        #                          'sslProxy': proxy['address'],
        #                          'noProxy': '',
        #                          'class': "org.openqa.selenium.Proxy",
        #                          'autodetect': False}
        #
        # capabilities['proxy']['httpUsername'] = proxy['username']
        # capabilities['proxy']['httpPassword'] = proxy['password']
        # chromeOptions = webdriver.ChromeOptions()
        # chromeOptions.add_argument('--proxy-server=http://60.168.104.30:3128')
        #self.browser = webdriver.Chrome(chrome_options=chromeOptions,executable_path='/home/caidong/developProgram/selenium/chromedriver')
        #self.browser = webdriver.PhantomJS()
        #cookies = ZhihuLogin().login()
        #print(cookies)
        # (the PhantomJS driver created here is immediately replaced by the Chrome driver below)
        self.browser = webdriver.PhantomJS()
        self.browser = webdriver.Chrome(executable_path='/home/caidong/developProgram/selenium/chromedriver')
        #for cookie in cookies:
          #  self.browser.add_cookie({cookie['name']:cookie['value']})
        #self.browser.add_cookie(cookie)
        time.sleep(5)
        print('cookie',self.browser.get_cookies())
        #print(self.browser.get_cookies())
        #self.browser.add_cookie({"cookie":'_zap=b24c85f0-aae0-456a-ba87-e0919de79409; __utma=243313742.618834370.1505397831.1505397831.1505431589.2; __utmz=243313742.1505397831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); d_c0="AJCCExnEYAyPTuiuB47mCQN_anS_LW2ZmQI=|1505432287"; q_c1=f92e81f1440d49eca643b9bd71df1d06|1505471670000|1502586350000; aliyungf_tc=AQAAABpahiv+pQIA4wmi0wpuOA0ptCdt; __utma=51854390.226003310.1505817316.1505817316.1505817316.1; __utmc=51854390; __utmz=51854390.1505817316.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.000--|3=entry_date=20170813=1; XSRF-TOKEN=2|02bd5b9f|30893afa3ad96af92f8d3ffb67906faa338d76fe308d3fb267de6cad358569a837dc39ae|1505824255; _xsrf=24ae8d1f-0dde-4510-a20d-ec7278275ab1; l_cap_id="NDYzOWZmNjBmZDhjNDBkZWI5MDg0NjYyZDk4YTk2OTA=|1505824625|220e4527cbfe214589599d071685e4c7f62143fc"; r_cap_id="NWJhOTRmYzg2NTVlNDczY2ExZWY3YzgxNGQ2ZmRmM2I=|1505824625|b050327da2a8dedc37a8e744640b60b553f3b771"; cap_id="YjcyNGZkYjFlY2JkNDU3ZWFlYmQ0NjQ3ZDJmNDcwZjk=|1505824625|5804f3f4999cf311334c3664f2e41ad2d4d93029'})
        self.start_page = 48000
        self.end_page = 47000
    def crawlData(self, url=None):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap[
            "phantomjs.page.settings.userAgent"] = config.get_header()
        # browser = webdriver.PhantomJS(desired_capabilities=dcap)
        #browser =webdriver.Firefox()
        self.browser.get(url)
        i=1
        if i==1:
            time.sleep(30)
            i=i+1
        # print(browser.page_source)
        self.browser.implicitly_wait(3)
        print('cookie',self.browser.get_cookies())
        print(self.browser.page_source)
        # Click the "Followers" counter
        self.browser.find_element_by_xpath('//div[@class="NumberBoard FollowshipCard-counts"]').click()
        self.browser.implicitly_wait(3)
        more = self.browser.find_elements_by_xpath('//button[@class="Button PaginationButton Button--plain"]')
        more[-1].click()
        self.browser.implicitly_wait(3)
        total_page = more[-1].text
        c_page = self.browser.find_element_by_xpath('//button[@class="Button PaginationButton PaginationButton--current Button--plain"]').text
        print(c_page)
        for curren_page in range(int(total_page)):
                c_page = self.browser.find_element_by_xpath(
                '//button[@class="Button PaginationButton PaginationButton--current Button--plain"]').text
                print(c_page)
                print('Current page:', str(curren_page))

                # Click "previous page"
                # self.browser.find_element_by_xpath(
                #     '//button[@class="Button PaginationButton PaginationButton-prev Button--plain"]').click()
                # Click "next page"
                self.browser.find_element_by_xpath(
                    '//button[@class="Button PaginationButton PaginationButton-next Button--plain"]').click()
                self.browser.implicitly_wait(3)
                if self.end_page < int(c_page) < self.start_page:
                    try:
                        self.loop_list()
                    except:
                        print('Error while clicking through the follower list')
    # Iterate over and parse the current follower list
    def loop_list(self):
        items = self.browser.find_elements_by_xpath('//div[@class="ContentItem-head"]//a[@class="UserLink-link"]')
        print("Item count", len(items))
        for item in items:
            time.sleep(random.randrange(5))
            c_url= item.get_attribute("href")
            print("c_url",c_url)
            if self.SqlH.count({"home_page":c_url+'/activities'})==0:
                # Wait until the element is displayed so the click does not fail
                while not item.is_displayed():
                    time.sleep(1)
                try:
                    item.click()
                except:
                    print('Click error')
                self.browser.implicitly_wait(1)
                handle_cnt = len(self.browser.window_handles) - 1
                # print('Number of tabs', handle_cnt)
                self.browser.switch_to.window(self.browser.window_handles[handle_cnt])
                print(self.browser.current_url)
                if  self.browser.current_url==self.black_page:
                    time.sleep(10*60)
                try:
                    self.browser.implicitly_wait(3)
                    self.parse_home_page(self.browser.page_source, self.browser.current_url)
                except:
                     print("页面解析错误")
                if handle_cnt > 0:
                    self.browser.close()
                    self.browser.switch_to.window(self.browser.window_handles[0])
            else:
                print("已存在")
            time.sleep(random.randrange(2))
    # Store data in MongoDB
    def storage_mongod(self,dic):
        user_name =dic.get("user_name")
        if self.SqlH.count({'user_name': user_name}) == 0:
            self.SqlH.insertZhiHu(dic)
        else:
            self.SqlH.update({'user_name': user_name}, {'collect': dic.get('collect')})
        pass

    def parse_home_page(self, html,url):
        tree = etree.HTML(html)
        follow = tree.xpath("//div[@class='NumberBoard-value']/text()")
        if follow:
            flowing = follow[0]
            follower = follow[1].strip()
        else:
            flowing = 'none'
            follower = 'none'
        page_header = tree.xpath("//div[@class='Card ProfileMain']//ul[@class='Tabs ProfileMain-tabs']/li[@class='Tabs-item']/a/span/text()")
        answer = page_header[0]
        article = page_header[2]
        #print('answer_',answer,'article',article)
        user_name = tree.xpath("//span[@class='ProfileHeader-name']/text()")[0]
        collecter = tree.xpath("//div[@class='Profile-sideColumnItemValue']/text()")
        print("收藏数",collecter)
        if collecter:
            for item in collecter:
                if str(item).endswith("次收藏"):
                    save=item.strip()[:-3]
                else:
                    save = 0
        else:
            save = 0
        print(user_name, flowing,str(save))
        zhihuObj = dict(user_name=user_name, followers=follower,
                        home_page=url, collect=save,article=article,
                        answer=answer)
        try:
            self.storage_mongod(zhihuObj)
        except:
            print("数据存储错误")
Code Example #10
class ZHSpider():
    def __init__(self):
        self.start_url = 'https://www.zhihu.com/people/kaifulee/activities'
        self.base_url = 'https://www.zhihu.com'
        self.type = [
            'hot', 'local', 'shehui', 'guonei', 'guoji', 'recomment', 'junshi',
            'finance', 'technology', 'sports', 'fashionbang', 'fashionbang',
            'auto_moto', 'fangcan', 'technology', 'yangshengtang'
        ]
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu')
        self.page = 2
        self.totla_url_set = set()
        self.wait_use_url_set = set()
        self.current_type = ''

    def crawlData(self, url=None):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap[
            "phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        #browser = webdriver.PhantomJS(desired_capabilities=dcap)
        browser = webdriver.Chrome(
            '/home/caidong/developProgram/selenium/chromedriver')
        browser.get(url)
        #print(browser.page_source)
        browser.implicitly_wait(10)
        #print(browser.page_source)
        # Click the "Followers" counter
        browser.find_element_by_xpath(
            '//div[@class="NumberBoard FollowshipCard-counts"]').click()
        time.sleep(2)
        # for i in range(1,10):
        #     if i<9:
        #         bt_mouseover = browser.find_element_by_xpath('//li[@class="nav_item"]['+str(i)+']/a')
        #         actions =ActionChains(browser)
        #         actions.move_to_element(bt_mouseover).perform()
        #         browser.implicitly_wait(5)
        #         time.sleep(5)
        #         html = browser.page_source
        #         #print(html)
        #         self.current_type=self.type[i]
        #         self.parse(html)
        #     else:
        #         more = browser.find_elements_by_xpath('//div[@class="more_list"]/a')
        #         i=1
        #         for item in more:
        #             if i < 2:
        #                 bt_mouseover = browser.find_element_by_xpath('//a[@class="more"]')
        #             else:
        #                 bt_mouseover = browser.find_element_by_xpath('//a[@class="more more_current"]')
        #             i += 1
        #             actions = ActionChains(browser)
        #             actions.move_to_element(bt_mouseover).perform()
        #             time.sleep(60)
        #             browser.implicitly_wait(50)
        #             try:
        #                 item.click()
        #             except:
        #                 print ("click error")
        #             browser.implicitly_wait(15)
        #             html = browser.page_source
        #             self.current_type = self.type[i+6]
        #             print(self.current_type)
        #             #print(html)
        #             self.parse(html)
        #             #actions.click(item)
        #             time.sleep(2)

        # browser.get_screenshot_as_file('1.png')
        #print(browser.page_source)
        #exit()
        # if index <= 6:
        #     bt_element=('//div[@class="fieed-box"]/a[@id="%s"]'%xpath_str)
        # else:
        #     actions = ActionChains(browser)
        #     more = browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="more_anchor"]')
        #     actions.move_to_element(more).perform()
        #     bt_element=('//div[@class="tab-box-pop"]/a[@id="%s"]'%xpath_str)
        # #if index > 6:
        #  browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="pc_6"]').click()
        # time.sleep(2)

        #time.sleep(2)
        #actions.move_to_element(more).perform()
        # browser.find_element_by_xpath(bt_element).click()
        # time.sleep(2)
        #
        # #browser.get_screenshot_as_file('tex.png')
        # js1 = 'return document.body.scrollHeight'
        # js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        # old_scroll_height = 0
        # while(browser.execute_script(js1) > old_scroll_height):
        #     old_scroll_height = browser.execute_script(js1)
        #     browser.execute_script(js2)
        #     time.sleep(0.8)
        # for i in range(self.page):
        #     load_more_xpath='//div[@class="jzgd"]/a'
        #     browser.find_element_by_xpath(load_more_xpath).click()
        self.parse(browser.page_source, url)
        print(browser.page_source)
        # try:
        more = browser.find_elements_by_xpath(
            '//button[@class="Button PaginationButton Button--plain"]')
        # except:
        #     print('No next page')
        # for page in range(len(more)):
        #     browser.find_elements_by_xpath('//button[@class="Button PaginationButton Button--plain"]')[page].click()
        #     time.sleep(2)
        #     self.parse_page(browser.page_source)
        #browser.find_element_by_xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]').click()
        ###### advance to the next page on each iteration
        total_page = more[-1].text
        print("tot", total_page)
        for curren_page in range(int(total_page)):
            try:
                browser.find_element_by_xpath(
                    '//button[@class="Button PaginationButton PaginationButton-next Button--plain"]'
                ).click()
                time.sleep(2)
            except:
                print('No next page')
            self.parse_page(browser.page_source)
        #exit()
        ######end

        browser.quit()

    def parse_page(self, html):
        tree = etree.HTML(html)
        followerList = tree.xpath('//div[@class="List-item"]')
        # print(followerList)
        for item in followerList:
            followerInfo = etree.ElementTree(item)
            name = followerInfo.xpath("//a[@class='UserLink-link']/text()")[0]
            home_page = followerInfo.xpath(
                "//a[@class='UserLink-link']/@href")[0]  # 主页
            follower_c = followerInfo.xpath(
                "//span[@class='ContentItem-statusItem']/text()")[2]
            # print('---------',home_page)
            if home_page and self.base_url + home_page not in self.totla_url_set:
                self.wait_use_url_set.add(self.base_url + home_page)
                self.totla_url_set.add(self.base_url + home_page)
                zhihuObj = dict(user_name=name,
                                followers=follower_c[:-3].strip(),
                                home_page=home_page,
                                collect='none')
                self.saveDB(zhihuObj, name)
            print(name, home_page, follower_c)

    def parse(self, html, url):
        tree = etree.HTML(html)
        follow = tree.xpath("//div[@class='NumberBoard-value']/text()")
        follower = follow[1]
        #print('====',follower,type(int(follower)))
        if int(follower) > 0:
            followerList = tree.xpath('//div[@class="List-item"]')
            #print(followerList)
            for item in followerList:
                followerInfo = etree.ElementTree(item)
                name = followerInfo.xpath(
                    "//a[@class='UserLink-link']/text()")[0]
                home_page = followerInfo.xpath(
                    "//a[@class='UserLink-link']/@href")[0]  #主页
                follower_c = followerInfo.xpath(
                    "//span[@class='ContentItem-statusItem']/text()")[2]
                #print('---------',home_page)
                if home_page and self.base_url + home_page not in self.totla_url_set:
                    self.wait_use_url_set.add(self.base_url + home_page)
                    self.totla_url_set.add(self.base_url + home_page)
                    zhihuObj = dict(user_name=name,
                                    followers=follower_c[:-3].strip(),
                                    home_page=home_page,
                                    collect='none')
                    self.saveDB(zhihuObj, name)
                print(name, home_page, follower_c)
        user_name = tree.xpath("//span[@class='ProfileHeader-name']/text()")[0]
        collecter = tree.xpath(
            "//div[@class='Profile-sideColumnItemValue']/text()")
        #time.sleep(2)
        flowing = follow[0]
        if collecter:
            save = collecter[2][:-3].strip()
        else:
            save = 0
        #print(save)
        #print(html)
        print(user_name, flowing, save)
        zhihuObj = dict(user_name=user_name,
                        followers=follower,
                        flowing=flowing,
                        collect=save,
                        home_page=url)
        #zhihuContent = {'user_name':user_name,'followers':follower,"flowing":flowing,"save":save,}
        if self.SqlH.count({'user_name': user_name}) == 0:
            self.SqlH.insertZhiHu(zhihuObj)
        elif self.SqlH.count({'user_name': user_name, 'collect': 'none'}):
            self.SqlH.update({'user_name': user_name}, {'collect': save})
        #print(zhihuContent)

    def saveDB(self, content, user_name):
        if self.SqlH.count({'user_name': user_name}) == 0:
            self.SqlH.insertZhiHu(content)
        elif self.SqlH.count({'user_name': user_name, 'collect': 'none'}):
            self.SqlH.update({'user_name': user_name}, {'collect': 'none'})
        pass
Code Example #11
File: baiduCrawl.py Project: caidongHui/baiduNews
 def __init__(self):
     self.SqlH = SqlHelper()
     self.SqlH.init_db('baiduNews')
Code Example #12
 def __init__(self):
     self.sqlhelper = SqlHelper()
Code Example #13
File: MongoHelp.py Project: caidongHui/baiduNews
        return results
        # (unreachable: the two lines below are never executed because of the return above)
        print(items)
        return items

    def close_client(self):
        self.client.close()

    def count(self, condition=None):
        condition = dict(condition)
        # Note: Cursor.count() is deprecated and was removed in PyMongo 4;
        # collection.count_documents(condition) is the modern equivalent.
        return self.collection.find(condition).count()


if __name__ == '__main__':
    from MongoHelp import MongoHelper as SqlHelper
    sqlhelper = SqlHelper()
    sqlhelper.init_db('zhihu', 'zhihu_all')
    pre = sqlhelper.count({})
    print('sum:', str(sqlhelper.count({})))
    time.sleep(10)
    now = sqlhelper.count({})
    # url = sqlhelper.select_home_url({"$and":[{"special_url":{"$exists":True}},{"special_url":{"$ne":"none"}}]},count=100,page=1)
    # print("content",url)
    # for item in url:
    #     print(item)

    #####
    # url = sqlhelper.select_home_url({"special_name":{"$exists":True}},count=100,page=1)
    # for item in url:
    #     print(item)
    #
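The rest of MongoHelper is not shown on this page. A minimal sketch of the interface these examples rely on (assuming pymongo and a local MongoDB; only the method names and arguments are taken from the calls above, the bodies and the paging convention are guesses):

import pymongo


class MongoHelper(object):
    def __init__(self, host='localhost', port=27017):
        self.client = pymongo.MongoClient(host, port)
        self.db = None
        self.collection = None

    def init_db(self, db_name, collection_name='news'):
        # Select (and lazily create) the target database and collection.
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

    def insert(self, document):
        self.collection.insert_one(dict(document))

    def insertZhiHu(self, document):
        # The Zhihu examples call a separate insert helper; here it simply delegates.
        self.collection.insert_one(dict(document))

    def update(self, condition, fields):
        self.collection.update_one(dict(condition), {'$set': dict(fields)})

    def select(self, count, conditions=None, page=1):
        # Simple pagination; whether pages are 0- or 1-based in the original is a guess.
        skip = max(page - 1, 0) * count
        return list(self.collection.find(dict(conditions or {})).skip(skip).limit(count))

    def count(self, condition=None):
        return self.collection.count_documents(dict(condition or {}))

    def close_client(self):
        self.client.close()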
Code Example #14
 def __init__(self):
     self.type=['hot','local','shehui','guonei','guoji','recomment','junshi','finance','technology','sports','fashionbang','fashionbang','auto_moto','fangcan','technology','yangshengtang']
     self.SqlH= SqlHelper()
     self.SqlH.init_db('weixin')
     self.page=2
     self.current_type=''
Code Example #15
class WXSpider():
    def __init__(self):
        self.type=['hot','local','shehui','guonei','guoji','recomment','junshi','finance','technology','sports','fashionbang','fashionbang','auto_moto','fangcan','technology','yangshengtang']
        self.SqlH= SqlHelper()
        self.SqlH.init_db('weixin')
        self.page=2
        self.current_type=''
    def spider(self,inde=None):
        dcap=dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = config.get_header()
        browser = webdriver.PhantomJS(desired_capabilities=dcap)
        #browser = webdriver.Chrome('/home/caidong/developProgram/selenium/chromedriver')
        browser.get('http://news.163.com/')
        #print(browser.page_source)
        for i in range(1,10):
            if i<9:
                bt_mouseover = browser.find_element_by_xpath('//li[@class="nav_item"]['+str(i)+']/a')
                actions =ActionChains(browser)
                actions.move_to_element(bt_mouseover).perform()
                browser.implicitly_wait(5)
                time.sleep(5)
                html = browser.page_source
                #print(html)
                self.current_type=self.type[i]
                self.parse(html)
            else:
                more = browser.find_elements_by_xpath('//div[@class="more_list"]/a')
                i=1
                for item in more:
                    if i < 2:
                        bt_mouseover = browser.find_element_by_xpath('//a[@class="more"]')
                    else:
                        bt_mouseover = browser.find_element_by_xpath('//a[@class="more more_current"]')
                    i += 1
                    actions = ActionChains(browser)
                    actions.move_to_element(bt_mouseover).perform()
                    time.sleep(60)
                    browser.implicitly_wait(50)
                    try:
                        item.click()
                    except:
                        print ("click error")
                    browser.implicitly_wait(15)
                    html = browser.page_source
                    self.current_type = self.type[i+6]
                    print(self.current_type)
                    #print(html)
                    self.parse(html)
                    #actions.click(item)
                    time.sleep(2)

       # browser.get_screenshot_as_file('1.png')
        #print(browser.page_source)
        #exit()
        # if index <= 6:
        #     bt_element=('//div[@class="fieed-box"]/a[@id="%s"]'%xpath_str)
        # else:
        #     actions = ActionChains(browser)
        #     more = browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="more_anchor"]')
        #     actions.move_to_element(more).perform()
        #     bt_element=('//div[@class="tab-box-pop"]/a[@id="%s"]'%xpath_str)
        # #if index > 6:
          #  browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="pc_6"]').click()
           # time.sleep(2)


        #time.sleep(2)
        #actions.move_to_element(more).perform()
        # browser.find_element_by_xpath(bt_element).click()
        # time.sleep(2)
        #
        # #browser.get_screenshot_as_file('tex.png')
        # js1 = 'return document.body.scrollHeight'
        # js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        # old_scroll_height = 0
        # while(browser.execute_script(js1) > old_scroll_height):
        #     old_scroll_height = browser.execute_script(js1)
        #     browser.execute_script(js2)
        #     time.sleep(0.8)
        # for i in range(self.page):
        #     load_more_xpath='//div[@class="jzgd"]/a'
        #     browser.find_element_by_xpath(load_more_xpath).click()
        #     time.sleep(2)

        browser.quit()
    def parse(self,html):
        tree = etree.HTML(html)
        updatetime = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time()))
        news_content = tree.xpath("//div[@class='data_row news_photoview clearfix ']|//div[@class='data_row news_article clearfix ']")
        for item in news_content:
            content = etree.ElementTree(item)
            imgUrl =content.xpath("//img/@src")
            txtTitle = content.xpath("//h3/a/text()")
            detail_url = content.xpath("//h3/a/@href")
            print(imgUrl)
            print(txtTitle)
            print(detail_url)
        # Note: txtTitle, detail_url and imgUrl hold only the values from the last loop iteration.
        wxContent = {'title': txtTitle, 'url': detail_url, 'content': '',
                     'category': self.current_type,
                     'secCategory': '', 'image': imgUrl, 'time': updatetime, 'from': 'WX'}
        if self.SqlH.count({'title': txtTitle}) == 0:
            self.SqlH.insert(wxContent)
Code Example #16
File: ExportCSV.py Project: caidongHui/baiduNews
from MongoHelp import MongoHelper as SqlHelper

import csv, time

SqlH = SqlHelper()
SqlH.init_db('zhiHu', 'zhihu_all')
headers = [
    'user_name', 'answer_comment_1', 'answer_comment_2', 'answer_comment_3',
    'article_comment_1', 'article_comment_2', 'article_comment_3', 'answer',
    'user_home_url', 'article', 'flowing', 'followers', 'collect', 'answer',
    'article'
]
con = {
    "$and": [
        {
            'article_comment': {
                "$exists": True
            }
        },
        {
            'answer_comment': {
                "$exists": True
            }
        },
        {
            'flowing': {
                "$exists": True
            }
        },
        # {'export_flag': {"$exists": False}}
    ]