import time
from urllib import request

from lxml import etree
from MongoHelp import MongoHelper as SqlHelper


class Zhihuhomepage():
    """Fetch a Zhihu user's profile page and update the stored collection count."""

    def __init__(self):
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu')
        self.base_url = 'https://www.zhihu.com'

    def gethomepage(self, url, user_name):
        req = request.Request(url=url, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        })
        html = request.urlopen(req, timeout=5).read().decode('utf-8')
        tree = etree.HTML(html)
        # Text such as "123 次收藏" (times collected) from the profile side column.
        collecter = tree.xpath("//div[@class='Profile-sideColumnItemValue']/text()")
        # time.sleep(2)
        if collecter:
            save = collecter[0][:-3].strip()
        else:
            save = 0
        print(user_name, collecter)
        self.updateCollect(user_name, save)
        # print(html)

    def updateCollect(self, user_name, save):
        self.SqlH.update({'user_name': user_name}, {'collect': save})

    def fromdb(self):
        # Walk every stored record whose collection count has not been filled in yet.
        tottal_s = self.SqlH.count(condition={'collect': 'none'})
        print(tottal_s)
        for c_page in range(tottal_s):
            time.sleep(2)
            result = self.SqlH.select(conditions={'collect': 'none'}, count=1, page=c_page)
            self.gethomepage(self.base_url + result[0]['home_page'],
                             result[0]['user_name'])
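
# --- Hypothetical usage sketch (not part of the original source): assumes a local
# --- MongoDB instance backs MongoHelper and that follower records were already
# --- inserted with collect='none' by the follower-list spider.
if __name__ == '__main__':
    homepage = Zhihuhomepage()
    homepage.fromdb()  # fill in the collection count for every pending user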
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from MongoHelp import MongoHelper as SqlHelper


class BaiduNews:
    """Crawl the Baidu News front page with PhantomJS and store unseen items."""

    def __init__(self):
        self.SqlH = SqlHelper()
        self.SqlH.init_db('baiduNews')

    def news_crawl(self):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        # ids of the category <div> blocks on news.baidu.com
        type = ('focus-top', 'local_news', 'guonei', 'guojie', 'caijing', 'yule',
                'tiyu', 'col-auto', 'col-house', 'hulianwang', 'internet-plus',
                'col-tech', 'col-edu', 'col-game', 'col-discovery', 'col-healthy',
                'col-lady', 'shehui', 'junshi', 'tupianxinwen')
        browser = webdriver.PhantomJS()
        browser.get('http://news.baidu.com/')
        # Keep scrolling until the page height stops growing so lazy-loaded
        # items are rendered before the source is read.
        js1 = 'return document.body.scrollHeight'
        js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        old_scroll_height = 0
        while browser.execute_script(js1) > old_scroll_height:
            old_scroll_height = browser.execute_script(js1)
            browser.execute_script(js2)
            time.sleep(0.8)
        html = browser.page_source
        tree = etree.HTML(html)
        updatetime = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time()))
        # print(updatetime)
        for item in type:
            regularExpressionUrl = '//div[@id="' + item + '"]//li/a/@href'
            regularExpressionText = '//div[@id="' + item + '"]//li/a/text()'
            news_url = tree.xpath(regularExpressionUrl)
            news_text = tree.xpath(regularExpressionText)
            # print('url_len' + str(len(news_url)))
            # print('text_len' + str(len(news_text)))
            for i in range(0, len(news_text)):
                if 'http' in news_url[i]:
                    newsContent = {
                        'title': news_text[i],
                        'url': news_url[i],
                        'content': '',
                        'category': item,
                        'secCategory': '',
                        'image': '',
                        'time': updatetime,
                        'from': 'BD'
                    }
                    if self.SqlH.count({'title': news_text[i]}) == 0:
                        self.SqlH.insert(newsContent)
        # front-page hot news section
        browser.quit()
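
# --- Optional alternative (an assumption, not part of the original project):
# --- newer Selenium releases no longer ship a PhantomJS driver, so the same
# --- scroll-and-scrape loop can be driven by headless Chrome instead; this
# --- requires chromedriver on PATH.
def make_headless_chrome():
    """Return a headless Chrome WebDriver as a drop-in for webdriver.PhantomJS()."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)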
import random
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import config  # project-local module providing get_header()
from MongoHelp import MongoHelper as SqlHelper


class ZHSpider():
    """Page through a Zhihu user's follower list and crawl each follower's profile."""

    def __init__(self):
        # Anti-crawler page Zhihu redirects to when it flags the account or IP.
        self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
        self.start_url = 'https://www.zhihu.com/people/kaifulee/followers?page=25583'
        # self.start_url = 'https://www.zhihu.com/people/ji-da-fa-37/activities'
        self.base_url = 'https://www.zhihu.com'
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu', 'zhihu_48000')
        # self.browser = webdriver.PhantomJS()
        # proxy = {'address': '60.168.104.30:3128',
        #          'username': '******',
        #          'password': '******'
        #          }
        # capabilities = dict(DesiredCapabilities.CHROME)
        # capabilities['proxy'] = {'proxyType': 'MANUAL',
        #                          'httpProxy': proxy['address'],
        #                          'ftpProxy': proxy['address'],
        #                          'sslProxy': proxy['address'],
        #                          'noProxy': '',
        #                          'class': "org.openqa.selenium.Proxy",
        #                          'autodetect': False}
        # capabilities['proxy']['httpUsername'] = proxy['username']
        # capabilities['proxy']['httpPassword'] = proxy['password']
        # chromeOptions = webdriver.ChromeOptions()
        # chromeOptions.add_argument('--proxy-server=http://60.168.104.30:3128')
        # self.browser = webdriver.Chrome(chrome_options=chromeOptions, executable_path='/home/caidong/developProgram/selenium/chromedriver')
        # self.browser = webdriver.PhantomJS()
        # cookies = ZhihuLogin().login()
        # print(cookies)
        self.browser = webdriver.PhantomJS()
        # Note: the Chrome driver below replaces the PhantomJS instance created above.
        self.browser = webdriver.Chrome(executable_path='/home/caidong/developProgram/selenium/chromedriver')
        # for cookie in cookies:
        #     self.browser.add_cookie({cookie['name']: cookie['value']})
        #     self.browser.add_cookie(cookie)
        time.sleep(5)
        print('cookie', self.browser.get_cookies())
        # print(self.browser.get_cookies())
        # self.browser.add_cookie({"cookie": '_zap=b24c85f0-aae0-456a-ba87-e0919de79409; __utma=243313742.618834370.1505397831.1505397831.1505431589.2; __utmz=243313742.1505397831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); d_c0="AJCCExnEYAyPTuiuB47mCQN_anS_LW2ZmQI=|1505432287"; q_c1=f92e81f1440d49eca643b9bd71df1d06|1505471670000|1502586350000; aliyungf_tc=AQAAABpahiv+pQIA4wmi0wpuOA0ptCdt; __utma=51854390.226003310.1505817316.1505817316.1505817316.1; __utmc=51854390; __utmz=51854390.1505817316.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.000--|3=entry_date=20170813=1; XSRF-TOKEN=2|02bd5b9f|30893afa3ad96af92f8d3ffb67906faa338d76fe308d3fb267de6cad358569a837dc39ae|1505824255; _xsrf=24ae8d1f-0dde-4510-a20d-ec7278275ab1; l_cap_id="NDYzOWZmNjBmZDhjNDBkZWI5MDg0NjYyZDk4YTk2OTA=|1505824625|220e4527cbfe214589599d071685e4c7f62143fc"; r_cap_id="NWJhOTRmYzg2NTVlNDczY2ExZWY3YzgxNGQ2ZmRmM2I=|1505824625|b050327da2a8dedc37a8e744640b60b553f3b771"; cap_id="YjcyNGZkYjFlY2JkNDU3ZWFlYmQ0NjQ3ZDJmNDcwZjk=|1505824625|5804f3f4999cf311334c3664f2e41ad2d4d93029'})
        self.start_page = 48000
        self.end_page = 47000

    def crawlData(self, url=None):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = config.get_header()
        # browser = webdriver.PhantomJS(desired_capabilities=dcap)
        # browser = webdriver.Firefox()
        self.browser.get(url)
        i = 1
        if i == 1:
            # One-off long pause after the first page load.
            time.sleep(30)
            i = i + 1
        # print(browser.page_source)
        self.browser.implicitly_wait(3)
        print('cookie', self.browser.get_cookies())
        print(self.browser.page_source)
        # Click the "followers" counter to open the follower list.
        self.browser.find_element_by_xpath('//div[@class="NumberBoard FollowshipCard-counts"]').click()
        self.browser.implicitly_wait(3)
        more = self.browser.find_elements_by_xpath('//button[@class="Button PaginationButton Button--plain"]')
        more[-1].click()
        self.browser.implicitly_wait(3)
        total_page = more[-1].text
        c_page = self.browser.find_element_by_xpath(
            '//button[@class="Button PaginationButton PaginationButton--current Button--plain"]').text
        print(c_page)
        for curren_page in range(int(total_page)):
            c_page = self.browser.find_element_by_xpath(
                '//button[@class="Button PaginationButton PaginationButton--current Button--plain"]').text
            print(c_page)
            print('current page:', str(curren_page))
            # Click "previous page"
            # self.browser.find_element_by_xpath(
            #     '//button[@class="Button PaginationButton PaginationButton-prev Button--plain"]').click()
            # Click "next page"
            self.browser.find_element_by_xpath(
                '//button[@class="Button PaginationButton PaginationButton-next Button--plain"]').click()
            self.browser.implicitly_wait(3)
            if int(c_page) < self.start_page and int(c_page) > self.end_page:
                try:
                    self.loop_list()
                except:
                    print('error while clicking through the list')

    # Iterate over every user link in the current follower list.
    def loop_list(self):
        items = self.browser.find_elements_by_xpath('//div[@class="ContentItem-head"]//a[@class="UserLink-link"]')
        print("count", len(items))
        for item in items:
            time.sleep(random.randrange(5))
            c_url = item.get_attribute("href")
            print("c_url", c_url)
            if self.SqlH.count({"home_page": c_url + '/activities'}) == 0:
                # Wait until the element is visible so the click does not fail.
                while not item.is_displayed():
                    time.sleep(1)
                try:
                    item.click()
                except:
                    print('click error')
                self.browser.implicitly_wait(1)
                handle_cnt = len(self.browser.window_handles) - 1
                # print('tab count', handle_cnt)
                self.browser.switch_to.window(self.browser.window_handles[handle_cnt])
                print(self.browser.current_url)
                if self.browser.current_url == self.black_page:
                    # Flagged by the anti-crawler page: back off for ten minutes.
                    time.sleep(10 * 60)
                try:
                    self.browser.implicitly_wait(3)
                    self.parse_home_page(self.browser.page_source, self.browser.current_url)
                except:
                    print("page parse error")
                if handle_cnt > 0:
                    self.browser.close()
                    self.browser.switch_to.window(self.browser.window_handles[0])
            else:
                print("already stored")
                time.sleep(random.randrange(2))

    # Insert a new user record, or update the collection count of an existing one.
    def storage_mongod(self, dic):
        user_name = dic.get("user_name")
        if self.SqlH.count({'user_name': user_name}) == 0:
            self.SqlH.insertZhiHu(dic)
        else:
            self.SqlH.update({'user_name': user_name}, {'collect': dic.get('collect')})

    def parse_home_page(self, html, url):
        tree = etree.HTML(html)
        follow = tree.xpath("//div[@class='NumberBoard-value']/text()")
        if follow:
            flowing = follow[0]
            follower = follow[1].strip()
        else:
            flowing = 'none'
            follower = 'none'
        # Profile tab counters (answers, articles, ...).
        page_header = tree.xpath("//div[@class='Card ProfileMain']//ul[@class='Tabs ProfileMain-tabs']/li[@class='Tabs-item']/a/span/text()")
        answer = page_header[0]
        article = page_header[2]
        # print('answer_', answer, 'article', article)
        user_name = tree.xpath("//span[@class='ProfileHeader-name']/text()")[0]
        collecter = tree.xpath("//div[@class='Profile-sideColumnItemValue']/text()")
        print("collection count", collecter)
        if collecter:
            for item in collecter:
                if str(item).endswith("次收藏"):
                    save = item.strip()[:-3]
                else:
                    save = 0
        else:
            save = 0
        print(user_name, flowing, str(save))
        zhihuObj = dict(user_name=user_name, followers=follower, home_page=url,
                        collect=save, article=article, answer=answer)
        try:
            self.storage_mongod(zhihuObj)
        except:
            print("data storage error")
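
# --- Hypothetical usage sketch (not part of the original source): assumes
# --- chromedriver exists at the hard-coded path above and MongoDB backs
# --- MongoHelper's 'zhihu' database.
if __name__ == '__main__':
    spider = ZHSpider()
    spider.crawlData(spider.start_url)  # walk the follower pages between end_page and start_page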
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from MongoHelp import MongoHelper as SqlHelper


class ZHSpider():
    """Crawl a Zhihu follower list page by page and record each follower."""

    def __init__(self):
        self.start_url = 'https://www.zhihu.com/people/kaifulee/activities'
        self.base_url = 'https://www.zhihu.com'
        self.type = [
            'hot', 'local', 'shehui', 'guonei', 'guoji', 'recomment', 'junshi',
            'finance', 'technology', 'sports', 'fashionbang', 'fashionbang',
            'auto_moto', 'fangcan', 'technology', 'yangshengtang'
        ]
        self.SqlH = SqlHelper()
        self.SqlH.init_db('zhihu')
        self.page = 2
        self.totla_url_set = set()      # every home-page URL seen so far
        self.wait_use_url_set = set()   # URLs still waiting to be crawled
        self.current_type = ''

    def crawlData(self, url=None):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        # browser = webdriver.PhantomJS(desired_capabilities=dcap)
        browser = webdriver.Chrome(
            '/home/caidong/developProgram/selenium/chromedriver')
        browser.get(url)
        # print(browser.page_source)
        browser.implicitly_wait(10)
        # print(browser.page_source)
        # Click the "followers" counter to open the follower list.
        browser.find_element_by_xpath(
            '//div[@class="NumberBoard FollowshipCard-counts"]').click()
        time.sleep(2)
        # Commented-out hover/scroll logic, same as the live code in WXSpider below:
        # for i in range(1, 10):
        #     if i < 9:
        #         bt_mouseover = browser.find_element_by_xpath('//li[@class="nav_item"][' + str(i) + ']/a')
        #         actions = ActionChains(browser)
        #         actions.move_to_element(bt_mouseover).perform()
        #         browser.implicitly_wait(5)
        #         time.sleep(5)
        #         html = browser.page_source
        #         # print(html)
        #         self.current_type = self.type[i]
        #         self.parse(html)
        #     else:
        #         more = browser.find_elements_by_xpath('//div[@class="more_list"]/a')
        #         i = 1
        #         for item in more:
        #             if i < 2:
        #                 bt_mouseover = browser.find_element_by_xpath('//a[@class="more"]')
        #             else:
        #                 bt_mouseover = browser.find_element_by_xpath('//a[@class="more more_current"]')
        #             i += 1
        #             actions = ActionChains(browser)
        #             actions.move_to_element(bt_mouseover).perform()
        #             time.sleep(60)
        #             browser.implicitly_wait(50)
        #             try:
        #                 item.click()
        #             except:
        #                 print("click error")
        #             browser.implicitly_wait(15)
        #             html = browser.page_source
        #             self.current_type = self.type[i + 6]
        #             print(self.current_type)
        #             # print(html)
        #             self.parse(html)
        #             # actions.click(item)
        #             time.sleep(2)
        # browser.get_screenshot_as_file('1.png')
        # print(browser.page_source)
        # exit()
        # if index <= 6:
        #     bt_element = ('//div[@class="fieed-box"]/a[@id="%s"]' % xpath_str)
        # else:
        #     actions = ActionChains(browser)
        #     more = browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="more_anchor"]')
        #     actions.move_to_element(more).perform()
        #     bt_element = ('//div[@class="tab-box-pop"]/a[@id="%s"]' % xpath_str)
        # # if index > 6:
        # browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="pc_6"]').click()
        # time.sleep(2)
        # time.sleep(2)
        # actions.move_to_element(more).perform()
        # browser.find_element_by_xpath(bt_element).click()
        # time.sleep(2)
        # browser.get_screenshot_as_file('tex.png')
        # js1 = 'return document.body.scrollHeight'
        # js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        # old_scroll_height = 0
        # while (browser.execute_script(js1) > old_scroll_height):
        #     old_scroll_height = browser.execute_script(js1)
        #     browser.execute_script(js2)
        #     time.sleep(0.8)
        # for i in range(self.page):
        #     load_more_xpath = '//div[@class="jzgd"]/a'
        #     browser.find_element_by_xpath(load_more_xpath).click()
        # Parse the first page of the follower list.
        self.parse(browser.page_source, url)
        print(browser.page_source)
        # try:
        more = browser.find_elements_by_xpath(
            '//button[@class="Button PaginationButton Button--plain"]')
        # except:
        #     print('no next page')
        # for page in range(len(more)):
        #     browser.find_elements_by_xpath('//button[@class="Button PaginationButton Button--plain"]')[page].click()
        #     time.sleep(2)
        #
        self.parse_page(browser.page_source)
        # browser.find_element_by_xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]').click()
        ###### advance one page per iteration
        total_page = more[-1].text
        print("tot", total_page)
        for curren_page in range(int(total_page)):
            try:
                browser.find_element_by_xpath(
                    '//button[@class="Button PaginationButton PaginationButton-next Button--plain"]'
                ).click()
                time.sleep(2)
            except:
                print('no next page')
            self.parse_page(browser.page_source)
            # exit()
        ###### end
        browser.quit()

    def parse_page(self, html):
        tree = etree.HTML(html)
        followerList = tree.xpath('//div[@class="List-item"]')
        # print(followerList)
        for item in followerList:
            followerInfo = etree.ElementTree(item)
            name = followerInfo.xpath("//a[@class='UserLink-link']/text()")[0]
            home_page = followerInfo.xpath(
                "//a[@class='UserLink-link']/@href")[0]  # profile home page
            follower_c = followerInfo.xpath(
                "//span[@class='ContentItem-statusItem']/text()")[2]
            # print('---------', home_page)
            if home_page and self.base_url + home_page not in self.totla_url_set:
                self.wait_use_url_set.add(self.base_url + home_page)
                self.totla_url_set.add(self.base_url + home_page)
            zhihuObj = dict(user_name=name,
                            followers=follower_c[:-3].strip(),
                            home_page=home_page,
                            collect='none')
            self.saveDB(zhihuObj, name)
            print(name, home_page, follower_c)

    def parse(self, html, url):
        tree = etree.HTML(html)
        follow = tree.xpath("//div[@class='NumberBoard-value']/text()")
        follower = follow[1]
        # print('====', follower, type(int(follower)))
        if int(follower) > 0:
            followerList = tree.xpath('//div[@class="List-item"]')
            # print(followerList)
            for item in followerList:
                followerInfo = etree.ElementTree(item)
                name = followerInfo.xpath(
                    "//a[@class='UserLink-link']/text()")[0]
                home_page = followerInfo.xpath(
                    "//a[@class='UserLink-link']/@href")[0]  # profile home page
                follower_c = followerInfo.xpath(
                    "//span[@class='ContentItem-statusItem']/text()")[2]
                # print('---------', home_page)
                if home_page and self.base_url + home_page not in self.totla_url_set:
                    self.wait_use_url_set.add(self.base_url + home_page)
                    self.totla_url_set.add(self.base_url + home_page)
                zhihuObj = dict(user_name=name,
                                followers=follower_c[:-3].strip(),
                                home_page=home_page,
                                collect='none')
                self.saveDB(zhihuObj, name)
                print(name, home_page, follower_c)
        user_name = tree.xpath("//span[@class='ProfileHeader-name']/text()")[0]
        collecter = tree.xpath(
            "//div[@class='Profile-sideColumnItemValue']/text()")
        # time.sleep(2)
        flowing = follow[0]
        if collecter:
            save = collecter[2][:-3].strip()
        else:
            save = 0
        # print(save)
        # print(html)
        print(user_name, flowing, save)
        zhihuObj = dict(user_name=user_name, followers=follower, flowing=flowing,
                        collect=save, home_page=url)
        # zhihuContent = {'user_name': user_name, 'followers': follower, "flowing": flowing, "save": save}
        if self.SqlH.count({'user_name': user_name}) == 0:
            self.SqlH.insertZhiHu(zhihuObj)
        elif self.SqlH.count({'user_name': user_name, 'collect': 'none'}):
            self.SqlH.update({'user_name': user_name}, {'collect': save})
        # print(zhihuContent)

    def saveDB(self, content, user_name):
        if self.SqlH.count({'user_name': user_name}) == 0:
            self.SqlH.insertZhiHu(content)
        elif self.SqlH.count({'user_name': user_name, 'collect': 'none'}):
            self.SqlH.update({'user_name': user_name}, {'collect': 'none'})
        print(items)
        return items

    def close_client(self):
        self.client.close()

    def count(self, condition=None):
        condition = dict(condition)
        return self.collection.find(condition).count()


if __name__ == '__main__':
    from MongoHelp import MongoHelper as SqlHelper

    sqlhelper = SqlHelper()
    sqlhelper.init_db('zhihu', 'zhihu_all')
    pre = sqlhelper.count({})
    print('sum:', str(sqlhelper.count({})))
    time.sleep(10)
    now = sqlhelper.count({})
    # url = sqlhelper.select_home_url({"$and": [{"special_url": {"$exists": True}}, {"special_url": {"$ne": "none"}}]}, count=100, page=1)
    # print("content", url)
    # for item in url:
    #     print(item)
    #####
    # url = sqlhelper.select_home_url({"special_name": {"$exists": True}}, count=100, page=1)
    # for item in url:
    #     print(item)
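
# --- Illustrative sketch (an assumption, not code from the original repo): the
# --- select(conditions, count, page) helper the spiders call is not shown in this
# --- excerpt; with pymongo, page-by-page reads reduce to skip/limit on a cursor.
def _select_sketch(collection, conditions=None, count=100, page=0):
    """Return one page of documents matching `conditions` (hypothetical helper)."""
    cursor = collection.find(dict(conditions or {})).skip(page * count).limit(count)
    return list(cursor)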
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import config  # project-local module providing get_header()
from MongoHelp import MongoHelper as SqlHelper


class WXSpider():
    """Crawl the news.163.com front page by hovering over each channel tab."""

    def __init__(self):
        self.type = ['hot', 'local', 'shehui', 'guonei', 'guoji', 'recomment',
                     'junshi', 'finance', 'technology', 'sports', 'fashionbang',
                     'fashionbang', 'auto_moto', 'fangcan', 'technology',
                     'yangshengtang']
        self.SqlH = SqlHelper()
        self.SqlH.init_db('weixin')
        self.page = 2
        self.current_type = ''

    def spider(self, inde=None):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = config.get_header()
        browser = webdriver.PhantomJS(desired_capabilities=dcap)
        # browser = webdriver.Chrome('/home/caidong/developProgram/selenium/chromedriver')
        browser.get('http://news.163.com/')
        # print(browser.page_source)
        for i in range(1, 10):
            if i < 9:
                # Hover over the i-th navigation tab so its panel is rendered.
                bt_mouseover = browser.find_element_by_xpath('//li[@class="nav_item"][' + str(i) + ']/a')
                actions = ActionChains(browser)
                actions.move_to_element(bt_mouseover).perform()
                browser.implicitly_wait(5)
                time.sleep(5)
                html = browser.page_source
                # print(html)
                self.current_type = self.type[i]
                self.parse(html)
            else:
                # The remaining channels are hidden behind the "more" menu.
                more = browser.find_elements_by_xpath('//div[@class="more_list"]/a')
                i = 1
                for item in more:
                    if i < 2:
                        bt_mouseover = browser.find_element_by_xpath('//a[@class="more"]')
                    else:
                        bt_mouseover = browser.find_element_by_xpath('//a[@class="more more_current"]')
                    i += 1
                    actions = ActionChains(browser)
                    actions.move_to_element(bt_mouseover).perform()
                    time.sleep(60)
                    browser.implicitly_wait(50)
                    try:
                        item.click()
                    except:
                        print("click error")
                    browser.implicitly_wait(15)
                    html = browser.page_source
                    self.current_type = self.type[i + 6]
                    print(self.current_type)
                    # print(html)
                    self.parse(html)
                    # actions.click(item)
                    time.sleep(2)
        # browser.get_screenshot_as_file('1.png')
        # print(browser.page_source)
        # exit()
        # if index <= 6:
        #     bt_element = ('//div[@class="fieed-box"]/a[@id="%s"]' % xpath_str)
        # else:
        #     actions = ActionChains(browser)
        #     more = browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="more_anchor"]')
        #     actions.move_to_element(more).perform()
        #     bt_element = ('//div[@class="tab-box-pop"]/a[@id="%s"]' % xpath_str)
        # # if index > 6:
        # browser.find_element_by_xpath('//div[@class="fieed-box"]/a[@id="pc_6"]').click()
        # time.sleep(2)
        # time.sleep(2)
        # actions.move_to_element(more).perform()
        # browser.find_element_by_xpath(bt_element).click()
        # time.sleep(2)
        # browser.get_screenshot_as_file('tex.png')
        # js1 = 'return document.body.scrollHeight'
        # js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        # old_scroll_height = 0
        # while (browser.execute_script(js1) > old_scroll_height):
        #     old_scroll_height = browser.execute_script(js1)
        #     browser.execute_script(js2)
        #     time.sleep(0.8)
        # for i in range(self.page):
        #     load_more_xpath = '//div[@class="jzgd"]/a'
        #     browser.find_element_by_xpath(load_more_xpath).click()
        #     time.sleep(2)
        browser.quit()

    def parse(self, html):
        tree = etree.HTML(html)
        updatetime = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time()))
        news_content = tree.xpath("//div[@class='data_row news_photoview clearfix ']|//div[@class='data_row news_article clearfix ']")
        for item in news_content:
            content = etree.ElementTree(item)
            imgUrl = content.xpath("//img/@src")
            txtTitle = content.xpath("//h3/a/text()")
            detail_url = content.xpath("//h3/a/@href")
            print(imgUrl)
            print(txtTitle)
            print(detail_url)
            wxContent = {'title': txtTitle,
                         'url': detail_url,
                         'content': '',
                         'category': self.current_type,
                         'secCategory': '',
                         'image': imgUrl,
                         'time': updatetime,
                         'from': 'WX'}
            if self.SqlH.count({'title': txtTitle}) == 0:
                self.SqlH.insert(wxContent)
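
# --- Hypothetical usage sketch (not part of the original source): assumes the
# --- phantomjs binary is on PATH, config.get_header() returns a User-Agent string,
# --- and MongoDB backs MongoHelper's 'weixin' database.
if __name__ == '__main__':
    WXSpider().spider()  # hover across every 163.com channel tab and store new items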
    },
    {
        'answer_comment': {"$exists": True}
    },
    {
        'flowing': {"$exists": True}
    },
    # {'export_flag': {"$exists": False}}
]
}
# print(SqlH.count(con))
time.sleep(100)
print(SqlH.count(con))

# rows = SqlH.select_csv()
# # print(rows)
# with open('zhihu_add_1.csv', 'w') as f:
#     f_csv = csv.DictWriter(f, headers)
#     f_csv.writeheader()
#     for row in rows:
#         print(row)
#         SqlH.update({"user_home_url": row["user_home_url"]}, {"export_flag": "1"})
#         if isinstance(row['answer_comment'], list):
#             print(row['answer_comment'])
#             i = 1
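
# --- Illustrative sketch (an assumption, not code from the original repo): a
# --- self-contained version of the commented-out CSV export above, using pymongo
# --- directly; the field list and output path are placeholders.
import csv

from pymongo import MongoClient


def export_zhihu_csv(path='zhihu_export.csv',
                     fields=('user_name', 'followers', 'flowing', 'collect')):
    """Dump matching documents to CSV and flag them as exported (hypothetical)."""
    collection = MongoClient()['zhihu']['zhihu_all']
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(fields), extrasaction='ignore')
        writer.writeheader()
        for row in collection.find({'flowing': {'$exists': True}}):
            writer.writerow(row)
            collection.update_one({'_id': row['_id']},
                                  {'$set': {'export_flag': '1'}})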