class WeiboCrawler(object):
    '''
    Crawler for the Weibo search page (s.weibo.com).

    Searches recent posts by keyword, extracts the feed HTML that the page
    embeds as a JSON blob inside a <script> tag, and returns Article
    instances.  Login state is kept as per-account cookie files on disk;
    accounts are rotated randomly between requests.  Run/alert events are
    recorded through InsertDB/SendEmail for monitoring.
    '''

    def __init__(self, channel, logger=None):
        '''
        @param channel: channel descriptor; channel.channel_id tags every
                        Article and every monitor DB record
        @param logger:  optional logger; falls back to the default logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel
        self.entityId = 'SYSU'
        # Pick a random account for this crawler instance.
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)
        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = '微博爬虫监控'
        self.proxies = ''
        self.session = SessionCrawler(sleepRange=[3, 8])

    def change_cookie(self):
        '''
        Randomly pick an account name; the matching cookie file on disk is
        loaded for the next request.
        :return: account name (str)
        '''
        usename_list = [
            '18814095644', '13432881156', '*****@*****.**', '15018377821',
            '*****@*****.**',
        ]
        return random.choice(usename_list)

    def get_username_password(self):
        '''
        Randomly pick one of the configured "user:password" pairs.
        :return: [user_name, pass_word]
        '''
        user_name_password1 = '18814095644:ljda.18814095644'
        user_name_password2 = '13432881156:liang452035397'
        user_name_password3 = '[email protected]:810214bee810214'
        user_name_password4 = '15018377821:zzm15331411'
        user_name_password5 = '15767199023:j980216'
        user_name_password6 = '[email protected]:uwinvip'
        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5, user_name_password6
        ]
        user_choice = random.choice(user_list)
        return user_choice.split(':')

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles posted within the 2 hours before endTime.
        @param keywordList: keyword list (AND-combined into one query)
        @param endTime: end of the search window (datetime)
        @return: list of Article
        '''
        run_msg = '微博爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        startTime = endTime - datetime.timedelta(hours=2)
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            data = self.__searchByPage(keywordList, startTime, endTime, page)
            (articleListInPage, hasnext) = self.__parseSearchPage(data)
            articleList.extend(articleListInPage)
            page += 1
        # Fixed: the old check was `articleList is None`, which is never
        # true for a list, so the "no data" alert could never fire.
        if not articleList:
            article_msg = '微博没有爬取到数据'
            self.email.send(self.monitor_title, article_msg)
            self.db.Insert(self.channel.channel_id, self.entityId, article_msg)
        end_msg = '微博爬虫结束'
        self.db.Insert(self.channel.channel_id, self.entityId, end_msg)
        return articleList

    def __extractFeedData(self, html):
        '''
        Extract the live-feed HTML fragment embedded as a JSON blob inside
        a <script> tag of the search result page.
        @raise ValueError/KeyError: when the blob is missing or malformed
                (callers rely on this to trigger the re-login path)
        '''
        lindex = html.find(
            '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'
        )
        rindex = html[lindex:].find('</script>')
        rindex = lindex + rindex - 1
        lindex = lindex + len('<script>STK && STK.pageletM && STK.pageletM.view(')
        jo = json.loads(html[lindex:rindex])
        return jo['html']  # the real-time feed page fragment

    def __searchByPage(self, keywordList, startTime, endTime, page):
        '''
        Fetch one page of search results; on failure re-login with this
        instance's account and retry once.
        @return: feed HTML fragment, or None when re-login fails
        '''
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'typeall': '1',
            'suball': '1',  # include all post types
            # Weibo search only accepts day granularity here (no %H part).
            'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d"),
                                           endTime.strftime("%Y-%m-%d")),
            'Refer': 'SWeibo_box',
            'page': page
        }
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': user_agent
        }
        index_url = 'http://s.weibo.com/weibo/' + query  # search home + query
        usename_cookie = self.change_cookie()
        self.logger.debug('Use Cookie %s' % usename_cookie)
        try:
            # Load the locally cached cookies for the chosen account.
            cookies = self.__load_cookies_from_lwp(usename_cookie)
            html = self.session.get(index_url, params=params,
                                    headers=headers, cookies=cookies)
            data = self.__extractFeedData(html)
            self.logger.debug('Get data')
            return data
        except Exception as e:
            # Cookie is probably stale or blocked: log in again with this
            # instance's own account and retry once.
            self.logger.debug('ERROR %s' % e)
            # NOTE(review): __login is not defined in this class as shown
            # here; presumably provided elsewhere — verify.
            loginFlag = self.__login()
            self.logger.debug('Use username: %s' % self.user_name)
            if loginFlag is False:
                self.logger.error('Fail to logon')
                login_msg = '微博登录失败'
                self.email.send(self.monitor_title, login_msg)
                self.db.Insert(self.channel.channel_id, self.entityId, login_msg)
                return
            cookies = self.__load_cookies_from_lwp(self.user_name)
            self.logger.debug('Get a new Cookie: %s' % cookies)
            html = self.session.get(index_url, params=params,
                                    headers=headers, cookies=cookies)
            return self.__extractFeedData(html)

    def __parseSearchPage(self, data):
        '''
        Parse one feed page.
        @return: (articleList, hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        self.logger.info(soup)
        # No result at all for this query?
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)
        # The page bar tells us whether a "next" link exists.
        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a', {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')
        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1 %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            root_content = root_2.find('p', {'class': "comment_txt"})
            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = ' 原文链接: ' + link_content
            except Exception:
                link_content = ''
            if long_content:
                # Posts longer than 140 chars need a second AJAX request
                # to get the full text.
                content_url = ('http://s.weibo.com/ajax/direct/morethan140?'
                               + long_content.attrs['action-data'])
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\\%").replace(":", "\\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\\%").replace(":", "\\:")
            content = content + link_content
            title = content[:30].replace("'", "''").replace(
                "%", "\\%").replace(":", "\\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {'class': "W_textb"}).attrs['date']
            try:
                # The 'date' attribute carries epoch milliseconds.
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except Exception:
                continue
            article = Article(mid, self.channel.channel_id, title, content,
                              publish_datetime, url=article_url,
                              author_id=author_id, author_name=author_name)
            # Fetch the collect/forward/reply/like counters.
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            # NOTE(review): __parseStatistics is not defined in this class
            # as shown here; presumably provided elsewhere — verify.
            self.__parseStatistics(article, soup_li_list)
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
class WeiboCrawler(object):
    '''
    Slimmed-down Weibo crawler variant: picks an account at random,
    performs login, and parses search feed pages.  In this variant
    searchArticle only logs in — the paging loop is disabled.
    '''

    def __init__(self, channel=None, logger=None):
        '''
        @param channel: channel descriptor (may be None in this variant)
        @param logger:  optional logger.  Fixed: this argument used to be
                        silently ignored (the root logger was always used);
                        it is now honoured, with the root logger as fallback.
        '''
        if logger is None:
            self.logger = logging.getLogger()
        else:
            self.logger = logger
        self.channel = channel
        # Pick a random account for this crawler instance.
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        print('username: %s' % self.user_name)
        self.session = SessionCrawler(sleepRange=[3, 8])

    def get_username_password(self):
        '''
        Randomly pick one of the configured "user:password" pairs
        (accounts of host "197").
        :return: [user_name, pass_word]
        '''
        user_name_password1 = '17825769929:4ms7e2v3zx'
        user_name_password2 = '18211493432:7fagvqyi9p'
        user_name_password3 = '17827278983:0nenzag325'
        user_name_password4 = '13922771190:5aqa10wvwf'
        user_name_password5 = '15999916968:2i45j5b49y'
        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5
        ]
        user_choice = random.choice(user_list)
        return user_choice.split(':')

    def get_random_agent(self):
        '''Randomly pick a desktop browser User-Agent string.'''
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)",
        ]
        return random.choice(user_agent)

    def searchArticle(self, keywordList, endTime):
        '''
        In this variant only the login step is performed; the original
        search/paging loop is disabled and nothing is returned.
        @param keywordList: keyword list (unused while the loop is disabled)
        @param endTime: end of the search window (unused while disabled)
        '''
        # NOTE(review): __login is not defined in this class as shown
        # here; presumably provided elsewhere — verify.
        self.__login()

    def crawlStatistics(self, article):
        '''
        Crawl statistics for one article; results are written into the
        article instance (nothing is returned).
        '''
        try:
            (data, check) = self.__fetchSingleArticle(article)
            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s',
                                    article.url)
                return
        except Exception:
            # Narrowed from a bare except: still a logged best-effort.
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __save_cookies_lwp(self, cookiejar):
        '''
        Persist a cookiejar to the local LWP-format file 'sinaweibocookies'.
        '''
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            # cookielib.Cookie() expects 'rest', not the private '_rest'.
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self):
        '''Load cookies previously saved by __save_cookies_lwp.'''
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

    def __parseStatistics(self, article, soup_li_list):
        '''
        Parse the four action counters out of the feed-action <li> items
        (fixed order: collect, forward, reply, like) and store them on
        article.statistics.
        '''
        def _count(li):
            # The span text looks like u'收藏(12)'; no digits means 0.
            found = re.findall(r'\d+', li.find('span').text)
            return int(found[0]) if found else 0

        article.statistics.collect_count = _count(soup_li_list[0])
        article.statistics.forward_count = _count(soup_li_list[1])
        article.statistics.reply_count = _count(soup_li_list[2])
        article.statistics.like_count = _count(soup_li_list[3])

    def __parseSearchPage(self, data):
        '''
        Parse one feed page.
        @return: (articleList, hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        # No result at all for this query?
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)
        # The page bar tells us whether a "next" link exists.
        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a', {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')
        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1 %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            root_content = root_2.find('p', {'class': "comment_txt"})
            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = ' 原文链接: ' + link_content
            except Exception:
                link_content = ''
            if long_content:
                # Posts longer than 140 chars need a second AJAX request.
                content_url = ('http://s.weibo.com/ajax/direct/morethan140?'
                               + long_content.attrs['action-data'])
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\\%").replace(":", "\\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\\%").replace(":", "\\:")
            content = content + link_content
            title = content[:30].replace("'", "''").replace(
                "%", "\\%").replace(":", "\\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {'class': "W_textb"}).attrs['date']
            try:
                # The 'date' attribute carries epoch milliseconds.
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except Exception:
                continue
            article = Article(mid, self.channel.channel_id, title, content,
                              publish_datetime, url=article_url,
                              author_id=author_id, author_name=author_name)
            # Fetch the collect/forward/reply/like counters.
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
class WechatCrawler(object):
    '''
    WeChat article crawler going through the Sogou WeChat search portal.
    Rotates HTTP proxies (via getIp) when Sogou blocks an IP, and records
    run/alert events through InsertDB/SendEmail for monitoring.
    '''

    def __init__(self, channel, logger=None):
        '''
        @param channel: channel descriptor; channel.channel_id tags every
                        monitor DB record, channel.search_ranges is the
                        search window in days
        @param logger:  optional logger; falls back to the default logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'
        self.ip_list = None   # queue of proxy IPs, filled by getIp()
        self.proxies = None   # current requests-style proxies dict
        self.monitor_title = '微信爬虫监控'
        self.email = SendEmail()
        self.db = InsertDB()

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword over the channel's search window.
        @param keywordList: keyword list
        @param endTime: end of the search window (datetime)
        @return: list of Article
        '''
        run_msg = '微信爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeStr = startTime.strftime('%Y-%m-%d')
        endTimeStr = endTime.strftime('%Y-%m-%d')
        self.logger.debug('startTime:%s', startTimeStr)
        self.logger.debug('endTime:%s', endTimeStr)
        articleList = list()
        urlList = list()
        for keyword in keywordList:
            # First request without a time range (warms up the session and
            # provides a valid Referer for the ranged request).
            pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote(
                keyword.encode('utf-8')), int(time.time() * 1000))).replace(
                    '#', '%')
            self.logger.debug('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            lastPageUrl = pageUrl
            # The actual ranged search request.
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')), startTimeStr, endTimeStr)
            self.logger.debug('pageUrl:%s', pageUrl)
            # Fetch the proxy IP queue and take the first proxy.
            self.ip_list = getIp()
            ip = self.ip_list.dequeue()
            self.proxies = {"http": "http://" + ip}
            while True:
                headers = HEADERS_SOGOU.copy()
                headers['Referer'] = lastPageUrl
                try:
                    response = self.session.get(pageUrl,
                                                allow_redirects=False,
                                                headers=headers,
                                                proxies=self.proxies)
                    soup = BeautifulSoup(response, 'lxml')
                    main = soup.find('ul', {'class': "news-list"})
                    # A missing result list means Sogou banned the IP:
                    # keep rotating proxies until a list comes back.
                    while True:
                        if main is None:
                            content = 'ip被封,更新ip'
                            self.db.Insert(self.channel.channel_id,
                                           self.entityId, content)
                            temp = self.ip_list.dequeue()
                            if self.ip_list.isempty():
                                self.ip_list = getIp()
                            self.proxies = {"http": "http://" + temp}
                            response = self.session.get(pageUrl,
                                                        allow_redirects=False,
                                                        headers=headers,
                                                        proxies=self.proxies)
                            soup = BeautifulSoup(response, 'lxml')
                            main = soup.find('ul', {'class': "news-list"})
                        else:
                            break
                    # li_list may be empty even when more pages exist.
                    li_list = main.findAll('li')
                    for li in li_list:
                        a_list = li.findAll('a')
                        try:
                            publish_datetime = li.select_one('.s-p').get('t')
                            publish_datetime = time.strftime(
                                '%Y-%m-%d %H:%M:%S',
                                time.localtime(int(publish_datetime)))
                        except Exception as e:
                            self.logger.debug(
                                'Publish_datetime crawl failed, use now time')
                            publish_datetime = datetime.datetime.now(
                            ).strftime('%Y-%m-%d %H:%M:%S')
                        print(publish_datetime)
                        for a in a_list:
                            if a['uigs'].startswith('article_title'):
                                urlList.append((a['href'], pageUrl, a.text,
                                                publish_datetime))
                                break
                    # Follow the "next page" link if the page bar has one.
                    pageBarList = soup.findAll('div',
                                               {'id': 'pagebar_container'})
                    if len(pageBarList) == 0:
                        # No page bar: nothing left to page through.
                        break
                    pageBar = pageBarList[0]
                    aList = pageBar.findAll('a')
                    foundNextPage = False
                    for a in aList:
                        if a['uigs'] == 'page_next':
                            foundNextPage = True
                            lastPageUrl = pageUrl
                            pageUrl = SOGOU_URL + a['href']
                            self.logger.debug('Found next page:%s', a.text)
                            break
                    if foundNextPage is False:
                        break
                except Exception:
                    # Request through this proxy failed: rotate to the next
                    # proxy and retry the same page.  (Narrowed from a bare
                    # except so Ctrl-C still works.)
                    ip_unuseful_content = '此ip是不合格的ip,更新ip'
                    self.db.Insert(self.channel.channel_id, self.entityId,
                                   ip_unuseful_content)
                    tmp = self.ip_list.dequeue()
                    if self.ip_list.isempty():
                        self.ip_list = getIp()
                    self.proxies = {"http": "http://" + tmp}
        for item in urlList:
            article = self.crawlArticle(item[0],
                                        referer=item[1],
                                        title=item[2],
                                        publish_datetime=item[3],
                                        flag=0)
            if article is not None:
                # The same article can appear under several search URLs;
                # de-duplicate before returning.
                if article not in articleList:
                    articleList.append(article)
        # Fixed: the old check was `articleList is None`, which is never
        # true for a list, so the "no data" alert could never fire.
        if not articleList:
            monitor_content = '微信没有数据,或者微信爬虫挂了'
            self.email.send(self.monitor_title, monitor_content)
            self.db.Insert(self.channel.channel_id, self.entityId,
                           monitor_content)
        end_msg = '微信爬虫结束'
        self.db.Insert(self.channel.channel_id, self.entityId, end_msg)
        return articleList

    def searchPublic(self, keywordList):
        '''
        Search official accounts whose displayed name exactly matches a
        keyword and crawl their article lists.
        @param keywordList: list of tuples; keyword[0] is the account name
        @return: list of Article
        '''
        articleList = list()
        for keyword in keywordList:
            self.logger.debug(keyword)
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0])
            self.logger.info('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})
            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
            try:
                li_list = main.findAll('li')
            except Exception as e:
                print(e)
                continue
            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    # Only exact name matches count as "the" account.
                    if title == keyword[0]:
                        self.logger.debug(title)
                        a_href = a_title.find('a')['href']
                        sub_articleList = self.crawlWetchartpublic(a_href)
                        for article in sub_articleList:
                            articleList.append(article)
        return articleList
class WeiXinCrawlerByTopic(object):
    '''
    Crawl WeChat articles through the Sogou WeChat search portal: query a
    Sogou result page, follow each article link, parse the article page
    and crawl its comments.
    '''

    def __init__(self, sleepRange, logger):
        '''
        @param sleepRange: passed to SessionCrawler to pace requests
        @param logger: logger instance (required)
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Fetch one Sogou result page and, for every article entry found,
        download + parse the article and crawl its comments.
        @param sougoUrl: full Sogou search result URL
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        ul = soup.findAll('ul', {'class': 'news-list'})[0]
        liList = ul.findAll('li')
        for li in liList:
            aList = li.findAll('a')
            articleUrl = None
            # The article link is the <a> whose uigs id starts with
            # 'article_title'.
            for a in aList:
                if a['uigs'].startswith('article_title'):
                    articleUrl = a['href']
                    break
            if articleUrl is not None:
                header = WX_HEADER.copy()
                header['Referer'] = sougoUrl  # Sogou checks the referer
                self.session.randomSleep()
                content = self.session.get(articleUrl, header)
                article = self.parseArticle(content)
                if article is None:
                    # Fixed: parseArticle returns None for banned/empty
                    # posts; previously this fell through and crashed with
                    # AttributeError on the next line.
                    continue
                article.contentUrl = articleUrl
                queryComment(self.session, articleUrl, article)

    def parseArticle(self, content):
        '''
        Parse a WeChat article page.
        @param content: raw HTML of the article page
        @return: Article, or None when the page has no content div
                 (usually means the post has been banned/removed)
        '''
        soup = BeautifulSoup(content)
        article = Article()
        # Article body.
        div = soup.findAll('div', {'class': 'rich_media_content'})
        if len(div) == 0:
            # It may be due to that this post has been banned.
            self.logger.warn('No content')
            return None
        article.content = div[0].text
        # Title lives in <title>...</title>.
        title = soup.findNext('title')
        article.title = title.text
        article.wid = hash(article.title)
        # Meta list: post date, author, account profile.
        divMeta = soup.findAll('div', {'class': 'rich_media_meta_list'})[0]
        emPostdate = divMeta.findAll('em', {'id': 'post-date'})[0]
        article.publishDateTime = time.mktime(
            datetime.datetime.strptime(emPostdate.text,
                                       '%Y-%m-%d').timetuple())
        emAuthorList = divMeta.findAll(
            'em', {'class': 'rich_media_meta rich_media_meta_text'})
        for em in emAuthorList:
            # The author <em> is the one without an id attribute.
            if 'id' not in em.attrs:
                article.author = em.text
                break
        divProfile = divMeta.findAll('div', {'class': 'profile_inner'})[0]
        # Account nickname.
        strong = divProfile.findAll('strong', {'class': 'profile_nickname'})[0]
        article.userName = strong.text
        # Account id sits in the <span> right after the nickname.
        article.userId = strong.findNext('span').text
        return article
class WeiboCrawler(object):
    '''
    Weibo search crawler (third variant in this file): searches posts by
    keyword over a 2-hour window and collects per-post statistics.
    Login state is cached as an LWP cookie file on disk.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.
        @param channel: channel descriptor; channel.channel_id tags every Article
        @param logger: optional logger; falls back to the default logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel
        # Pick a random account (username/password) for this instance.
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)
        self.session = SessionCrawler(sleepRange=[3, 8])

    def get_username_password(self):
        '''
        Randomly pick one of the configured "user:password" pairs.
        :return: [user_name, pass_word]
        '''
        user_name_password1 = '18814095644:ljda.18814095644'
        user_name_password2 = '13432881156:liang452035397'
        user_name_password3 = '[email protected]:810214bee810214'
        user_name_password4 = '15018377821:zzm15331411'
        user_name_password5 = '[email protected]:uwinvip'
        user_name_password6 = '15767199023:j980216'
        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5, user_name_password6
        ]
        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles posted within the 2 hours before endTime.
        @param keywordList: keyword list (AND-combined into one query)
        @param endTime: end of the search window (datetime)
        @return: list of Article
        '''
        # A time-gated re-login block (run only between 07:00:00 and
        # 07:59:58) used to live here; it is kept disabled.
        startTime = endTime - datetime.timedelta(hours=2)
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            # NOTE(review): __searchByPage is not defined in this class as
            # shown here; presumably provided elsewhere — verify.
            data = self.__searchByPage(keywordList, startTime, endTime, page)
            (articleListInPage, hasnext) = self.__parseSearchPage(data)
            articleList.extend(articleListInPage)
            page += 1
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics for one article; results are written into the
        article instance, nothing is returned.
        '''
        try:
            # NOTE(review): __fetchSingleArticle is not defined in this
            # class as shown here; presumably provided elsewhere — verify.
            (data, check) = self.__fetchSingleArticle(article)
            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s',
                                    article.url)
                return
        except:
            # Best-effort: log the full traceback and carry on.
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __save_cookies_lwp(self, cookiejar):
        """
        Persist a cookiejar to the local LWP-format file 'sinaweibocookies'.
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            # cookielib.Cookie() expects 'rest', not the private '_rest'.
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self):
        """
        Load cookies previously saved by __save_cookies_lwp.
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

    def __parseStatistics(self, article, soup_li_list):
        '''
        Parse the four action counters out of the feed-action <li> items
        (fixed order: collect, forward, reply, like) and store them on
        article.statistics.  Missing digits mean a count of 0.
        '''
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text
        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0
        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0
        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __parseSearchPage(self, data):
        '''
        Parse one feed page.
        @return: (articleList, hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        # No result at all for this query?
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)
        # The page bar tells us whether a "next" link exists.
        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')
        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1 %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            root_content = root_2.find('p', {'class': "comment_txt"})
            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = ' 原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                # Posts longer than 140 chars need a second AJAX request
                # to get the full text.
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception, e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
            content = content + link_content
            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']
            try:
                # The 'date' attribute carries epoch milliseconds.
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)
            # Fetch the collect/forward/reply/like counters.
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
class SinanewsBlogCrawler(object):
    '''
    Crawler for Sina blog posts (blog.sina.com.cn) discovered through the
    Sina site search (search.sina.com.cn, channel "blog").

    NOTE(review): this class is Python 2 code (print statements,
    urllib.quote); it will not run unmodified on Python 3.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.

        @param channel: channel descriptor; channel.channel_id is stamped on
                        every Article/Comment produced by this crawler
        @param logger: optional logger; defaults to the project-wide logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(sleepRange=[3, 8])
        self.channel = channel
        # Accumulates results across successive __searchByPage calls;
        # note the instance is therefore not reusable across searches
        # without clearing this list.
        self.articleList = list()
        # Browser-like headers pinned to the site-search host.
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'search.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles matching a keyword list, paging until a result falls
        outside the accepted time window.

        @param keywordList: list of keywords (joined with spaces for the query)
        @param endTime: end of the search time range (datetime)
        @return: the accumulated self.articleList
        '''
        # NOTE(review): startTime computed here is passed down but
        # __searchByPage recomputes its own window (endTime - 5 days),
        # so this value is effectively unused — confirm intent.
        startTime = endTime - datetime.timedelta(hours=1)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 0
        hasnext = True
        while hasnext:
            # NOTE(review): __searchByPage returns (list(), False) on the
            # no-result path, and a non-empty tuple is truthy — that would
            # keep this loop spinning. Looks like a bug; verify.
            hasnext = self.__searchByPage(keywordList, startTime, endTime,
                                          page)
            page += 1
        self.logger.error(u'articlelength:%d', len(self.articleList))
        return self.articleList

    def __searchByPage(self, keywordList, startTime, endTime, page):
        '''
        Fetch and parse one page of Sina blog search results, appending
        in-window articles to self.articleList.

        @return: True if a further page should be fetched, False otherwise
                 (but see the tuple-return note on the no-result path)
        '''
        # Search via the site's internal search box.
        page = str(page)  # the url expects the page number as a str
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'c': 'blog',
            'range': 'article',
            # include everything
            'by': 'all',
            'sort': 'time',
            'col': '',
            'source': '',
            'from': '',
            'country': '',
            'size': '',
            'time': '',
            'a': '',
            'isown': 'false',
            'page': page,
            'dpc': '',
            'q': query
        }
        search_url = "http://search.sina.com.cn/"
        url_page = self.session.get(search_url,
                                    params=params,
                                    headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "result-boxes"})
        if main_wrap is None:
            # Guard against transient network/parse failures: retry once
            # before concluding there really are no results.
            self.logger.debug(u'第一次查找没有结果再找一遍中')
            url_page = self.session.get(search_url,
                                        params=params,
                                        headers=self.headers)
            soup = BeautifulSoup(url_page, "lxml")
            main_wrap = soup.find('div', attrs={'class': "result-boxes"})
            if main_wrap is None:
                # Only after the second miss do we treat it as "no results".
                self.logger.debug(u'第二次查找没有结果,该关键词没有结果')
                # NOTE(review): callers assign this return value to a bool
                # flag; returning a tuple here is inconsistent (see above).
                return (list(), False)
        main_article_list = main_wrap.findAll(
            'div', attrs={'class': "box-result clearfix"})
        hasnext = True
        if main_article_list is not None:
            print '1'
            for title in main_article_list:
                print '2'
                article_publishtime_wrap = title.findAll('p')[1]
                article_publishtime = article_publishtime_wrap.find(
                    'span', attrs={
                        'class': "fgray_time"
                    }).text.strip()
                self.logger.error(article_publishtime)
                urlTime = time.strptime(article_publishtime,
                                        "%Y-%m-%d %H:%M:%S")
                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                # Compare as float timestamps.
                urlTime2 = time.mktime(urlTime2.timetuple())
                # Window is recomputed per item: last 5 days before endTime.
                startTime = endTime - datetime.timedelta(days=5)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())
                # Crawl only items whose publish time is inside the window;
                # results are time-sorted, so the first out-of-window item
                # ends the pagination.
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    print '3'
                    title_wrap = title.find('h2')
                    article_url_parent = title_wrap.find('a').attrs['href']
                    article_url_index = article_url_parent.rfind('/')
                    # Trailing path segment doubles as the article's meta id.
                    article_url_child = article_url_parent[article_url_index +
                                                           1:]
                    article_url = 'http://blog.sina.com.cn/s/blog_' + article_url_child + '.html'
                    Ttitle = title_wrap.find('a').text.strip()
                    self.logger.debug(article_url)
                    meta_info = article_url_child
                    (content, reply_count, like_count, read_count,
                     collect_count, forward_count) = self.crawlArticle(
                         article_url, meta_info)
                    self.logger.debug(like_count)
                    if content is None:
                        # Topic deleted or in an unexpected format.
                        print '756236'
                        continue
                    tid = 't_' + article_url_child
                    author_name = article_publishtime_wrap.find(
                        'a', attrs={
                            'class': "rib-author"
                        }).text
                    article = Article(tid,
                                      self.channel.channel_id,
                                      Ttitle,
                                      content,
                                      article_publishtime,
                                      article_url,
                                      None,
                                      author_name,
                                      meta_info=meta_info)
                    # self.crawlComment(article)
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    article.statistics.read_count = read_count
                    article.statistics.collect_count = collect_count
                    article.statistics.forward_count = forward_count
                    # self.logger.debug(article)
                    if article not in self.articleList:
                        self.articleList.append(article)
                else:
                    print '78956'
                    print len(self.articleList)
                    hasnext = False
                    break
        return hasnext

    def crawlStatistics(self, article):
        # Fetch only the counters via the stats API instead of re-crawling
        # the whole article page.
        '''
        Refresh statistics for an article.

        @param article: Article whose meta_info encodes the blog aid/uid
        @return: nothing; statistics are written into the article instance
        '''
        headers2 = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'comet.blog.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }
        try:
            # Statistics request: aid/uid are carved out of meta_info by
            # fixed offsets — assumes meta_info keeps the uid+aid layout
            # produced by __searchByPage; TODO confirm.
            parmas = {
                'maintype': 'num',
                'aids': article.meta_info[-6:],
                'uid': article.meta_info[:-8]
            }
            data_url = 'http://comet.blog.sina.com.cn/api'
            data = self.session.get(data_url, params=parmas, headers=headers2)
            # The API wraps JSON in a JSONP-like envelope; strip everything
            # up to the first ':' and the trailing 3 chars — presumably
            # "...(...);" framing; verify against a live response.
            data2 = json.loads(data[data.find(':') + 1:-3])
            like_count = data2['d']
            read_count = data2['r']
            reply_count = data2['c']
            collect_count = data2['f']
            forward_count = data2['z']
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count
            article.statistics.read_count = read_count
            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
        except:
            # NOTE(review): bare except hides real errors (network, JSON,
            # key errors) behind one log line.
            self.logger.error(u'该话题已被删除或有其他格式')

    def crawlArticle(self, url, meta_info):
        '''
        Crawl an article page plus its statistics.

        @param url: full article url
        @param meta_info: uid+aid string used for the stats API
        @return: (content, reply_count, like_count, read_count,
                 collect_count, forward_count); all None when the topic is
                 gone. NOTE(review): when the page fetch itself returns a
                 falsy body this method falls through and implicitly
                 returns None, which breaks tuple-unpacking at the caller.
        '''
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'blog.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }
        headers2 = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'comet.blog.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }
        html = self.session.get(url, headers=headers)
        if html:
            print '111'
            soup = BeautifulSoup(html, 'lxml')  # 'html.parser' also works
            main = soup.find('div', attrs={'id': "articlebody"})
            if main is not None:
                print '222'
                Tcontent = main.find('div',
                                     attrs={
                                         'id': "sina_keyword_ad_area2"
                                     }).text.strip()
                # Statistics request (same envelope handling as
                # crawlStatistics above).
                parmas = {
                    'maintype': 'num',
                    'aids': meta_info[-6:],
                    'uid': meta_info[:-8]
                }
                data_url = 'http://comet.blog.sina.com.cn/api'
                data = self.session.get(data_url,
                                        params=parmas,
                                        headers=headers2)
                data2 = json.loads(data[data.find(':') + 1:-3])
                like_count = data2['d']
                read_count = data2['r']
                reply_count = data2['c']
                collect_count = data2['f']
                forward_count = data2['z']
                return (Tcontent, reply_count, like_count, read_count,
                        collect_count, forward_count)
            else:
                self.logger.error(u'该话题已被删除或存在其他格式')
                return (None, None, None, None, None, None)

    def refreshSearch(self):
        '''
        Reset search state. Currently a no-op.
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment-crawl state.
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article.

        @param article: Article whose meta_info is the Sina comment newsid
        @return: (commentList, hasnext) — commentList is a list of Comment
                 instances; hasnext is always False here (single page).
        '''
        # self.logger.debug('Article:%s', article)
        comment_url = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid='
        comment_url = comment_url + article.meta_info
        self.logger.error(comment_url)
        html = self.session.get(comment_url)
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        main_comment = json.loads(html)['result']['cmntlist']
        print '8989'
        if len(main_comment) == 0:
            print '12212'
            return (commentList, False)
        for comment in main_comment:
            cid = comment['mid']
            publish_datetime = comment['time']
            user_id = comment['uid']
            user_name = comment['nick']
            content = comment['content']
            # Segment the free-text area with jieba; first token is taken as
            # the region, the rest as the city — heuristic, not guaranteed.
            location = ','.join(jieba.cut(comment['area']))
            location_region = location[:location.find(',')]
            location_city = location[location.find(',') + 1:]
            print location_city
            location_coutry = 'CN'
            commentList.append(
                Comment(article.tid, self.channel.channel_id, cid,
                        add_datetime, publish_datetime, None, location_coutry,
                        location_region, location_city, user_id, user_name,
                        content, None, None, None, None, dislike_count=None))
        # While testing: article[0][222].content retrieves comment #222.
        return (commentList, False)
class WeiXinCralwer(object):
    '''
    Crawler for WeChat official-account articles reached through the Sogou
    WeChat search portal, with optional persistence to MySQL or CSV.

    NOTE(review): class name misspells "Crawler"; kept as-is because
    callers elsewhere reference this exact name.
    '''

    def __init__(self, sleepRange, logger):
        '''
        Constructor.

        @param sleepRange: [min,max] seconds passed to SessionCrawler for
                           randomized throttling between requests
        @param logger: logger instance (required, unlike sibling crawlers)
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Given the official account id, we expect to uniquely find one and
        only one item.
        Return the url to the official account.
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        # First (and assumed only) account hit on the results page.
        item = soup.findAll('a', {'uigs': 'account_name_0'})[0]
        return item['href']

    def __queryArticleList(self, sougoUrl, officialAccountUrl):
        '''
        Fetch the account profile page and parse the embedded msgList JSON
        into Article stubs (content not yet fetched).
        '''
        self.logger.debug('Query ariticle list for %s', officialAccountUrl)
        header = WX_HEADER.copy()
        header['Referer'] = sougoUrl
        self.session.randomSleep()
        content = self.session.get(officialAccountUrl, header)
        # The article list is a JS literal between the two markers; trim the
        # trailing statement back to the last ';' inside the window.
        lindex = content.find(WX_ARTICLE_LIST_PREFIX) + len(
            WX_ARTICLE_LIST_PREFIX)
        rindex = content.find(WX_ARTICLE_LIST_SUFFIX)
        rindex = lindex + content[lindex:rindex].rfind(';')
        js = content[lindex:rindex]
        jo = json.loads(js)
        aList = jo['list']
        articleList = list()
        for item in aList:
            app_msg_ext_info = item['app_msg_ext_info']
            comm_msg_info = item['comm_msg_info']
            article = self.__fetchArticle(app_msg_ext_info)
            # The push's timestamp applies to the headline article...
            article.publishDateTime = comm_msg_info['datetime']
            articleList.append(article)
            if 'multi_app_msg_item_list' in item:
                # ...and to every secondary article bundled in the same push.
                for embedItem in item['multi_app_msg_item_list']:
                    article = self.__fetchArticle(embedItem)
                    article.publishDateTime = comm_msg_info['datetime']
                    articleList.append(article)
        return articleList

    def __fetchArticle(self, item):
        '''
        Map one msgList entry onto an Article stub.

        NOTE(review): wid = str(hash(title)) is not stable across
        interpreter runs under hash randomization — risky as a DB key.
        '''
        article = Article()
        article.title = item['title']
        article.wid = str(hash(article.title))
        article.author = item['author']
        article.contentUrl = item['content_url']
        article.digest = item['digest']
        article.fileid = item['fileid']
        article.sourceUrl = item['source_url']
        # print article.title,":",article.contentUrl,'\n'
        return article

    def __queryComment(self, articleList, referer):
        '''
        For each article: fetch the body, then the read/like counters and
        comment list (with nested replies). Mutates the articles in place.
        '''
        mainPageHeader = WX_HEADER.copy()
        mainPageHeader['Referer'] = referer
        for article in articleList:
            self.logger.debug('Query comment for %s', article.title)
            # find the signature
            lindex = article.contentUrl.find('signature=')
            # NOTE(review): rindex is an offset within the slice starting at
            # lindex, but is then used as an absolute index below — the
            # extracted signature looks wrong whenever '&' follows it.
            rindex = article.contentUrl[lindex:].find('&')
            if rindex > 0:
                signature = article.contentUrl[lindex:rindex]
            else:
                signature = article.contentUrl[lindex:]
            # find the timestamp (10-digit epoch seconds)
            lindex = article.contentUrl.find('timestamp=') + len('timestamp=')
            timestamp = int(article.contentUrl[lindex:lindex + 10])
            self.session.randomSleep()
            # query main page
            # NOTE(review): replace('&','&') is a no-op — presumably meant
            # to unescape '&amp;' from the HTML; confirm and fix upstream.
            mainUrl = 'http://mp.weixin.qq.com' + article.contentUrl.replace(
                '&', '&')
            self.session.randomSleep()
            content = self.session.get(mainUrl, mainPageHeader)
            soup = BeautifulSoup(content)
            div = soup.findAll('div', {'class': 'rich_media_content'})
            if len(div) == 0:
                # it may be due to that this post has been banned
                self.logger.warn('No content for %s', article.title)
                continue
                # raise Exception('No content for %s'% article.title)
            article.content = div[0].text
            # query comment page
            currentTime = int(time.time())  # NOTE(review): unused
            url = WX_COMMENT_URL % (timestamp, signature)
            # print url
            header = WX_COMMENT_HEADER.copy()
            header['Referer'] = mainUrl
            self.session.randomSleep()
            content = self.session.get(url, header)
            jo = json.loads(content)
            # print jo.keys()
            article.readCount = jo['read_num']
            article.likeCount = jo['like_num']
            commentList = jo['comment']
            for item in commentList:
                comment = Comment()
                comment.commenterNickName = item['nick_name']
                comment.likeCount = item['like_num']
                comment.content = item['content']
                comment.contentId = item['content_id']
                comment.createTime = item['create_time']
                for replyItem in item['reply']['reply_list']:
                    reply = Reply()
                    reply.content = replyItem['content']
                    reply.createTime = replyItem['create_time']
                    reply.uin = replyItem['uin']
                    reply.toUin = replyItem['to_uin']
                    reply.replyId = replyItem['reply_id']
                    comment.replyList.append(reply)
                article.commentList.append(comment)

    def crawl(self, officialAccountId):
        '''
        Full pipeline for one official account: resolve via Sogou, list
        articles, then enrich them with content/stats/comments.

        @return: list of fully populated Article instances
        '''
        sougoUrl = SOUGO_QUERY_URL % (1, quote(officialAccountId))
        officialAccountUrl = self.__querySogou(sougoUrl)
        articleList = self.__queryArticleList(sougoUrl, officialAccountUrl)
        self.__queryComment(articleList, officialAccountUrl)
        return articleList
        # self.__writeCsv(officialAccountId+'.csv', articleList)

    def writeDb(self, dbConf, officialAccountId, articleList):
        '''
        Delete-then-insert the crawled articles/comments/replies for this
        account into MySQL.

        NOTE(review): SQL is assembled by string interpolation with only
        double-quote escaping — vulnerable to injection via crawled text;
        parameterized queries would be safer.
        NOTE(review): widSet collects article.fileid but rows store
        article.wid — the delete filter and the inserted key may disagree;
        verify.
        '''
        dbProxy = MySqlProxy(host=dbConf['dbHost'],
                             port=3306,
                             user=dbConf['dbUser'],
                             passwd=dbConf['dbPasswd'],
                             db=dbConf['dbName'])
        weixinSql = 'INSERT INTO T_WEIXIN (pid, wid, author, title, digest, content, publish_datetime, read_count, like_count) values '
        commentSql = 'INSERT INTO T_WEIXIN_COMMENT(pid, cid, wid, content, publisher_name, publish_datetime,like_count) values '
        replySql = 'INSERT INTO T_WEIXIN_REPLY (rid, cid, content, publish_datetime, uin, touin) values '
        weixinValueList = list()
        commentValueList = list()
        replyValueList = list()
        widSet = set()
        for article in articleList:
            weixinValueList.append(
                '("%s","%s","%s","%s","%s","%s","%s",%d,%d)' % (
                    officialAccountId, str(article.wid),
                    article.author.replace('"', '\\"'),
                    article.title.replace('"', '\\"'),
                    article.digest.replace('"', '\\"'),
                    article.content.replace('"', '\\"'),
                    time.strftime(ISOTIMEFORMAT,
                                  time.localtime(article.publishDateTime)),
                    article.readCount, article.likeCount))
            widSet.add(article.fileid)
            for comment in article.commentList:
                commentValueList.append(
                    '("%s","%s","%s","%s","%s","%s",%d)' % (
                        officialAccountId, str(comment.contentId),
                        str(article.wid),
                        comment.content.replace('"', '\\"'),
                        comment.commenterNickName.replace('"', '\\"'),
                        time.strftime(ISOTIMEFORMAT,
                                      time.localtime(comment.createTime)),
                        comment.likeCount))
                for reply in comment.replyList:
                    replyValueList.append(
                        '("%s","%s","%s","%s","%s","%s")' % (
                            str(reply.replyId), str(comment.contentId),
                            reply.content.replace('"', '\\"'),
                            time.strftime(ISOTIMEFORMAT,
                                          time.localtime(reply.createTime)),
                            reply.uin, reply.toUin))
        # clear the db firstly (children before parent for the comment/reply
        # pair; weixin rows are independent)
        sql = 'delete from T_WEIXIN where wid in (%s) and pid="%s"' % (
            ','.join(map(lambda x: '"' + str(x) + '"', widSet)),
            officialAccountId)
        dbProxy.execute(sql)
        sql = 'delete from T_WEIXIN_REPLY where cid in (select cid from T_WEIXIN_COMMENT where wid in (%s) and pid="%s")' % (
            ','.join(map(lambda x: '"' + str(x) + '"', widSet)),
            officialAccountId)
        dbProxy.execute(sql)
        sql = 'delete from T_WEIXIN_COMMENT where wid in (%s) and pid="%s"' % (
            ','.join(map(lambda x: '"' + str(x) + '"', widSet)),
            officialAccountId)
        dbProxy.execute(sql)
        # insert to db
        if len(weixinValueList) > 0:
            self.logger.info('Insert %d records to weixin',
                             len(weixinValueList))
            dbProxy.execute(weixinSql + ','.join(weixinValueList))
        if len(commentValueList) > 0:
            self.logger.info('Insert %d records to comment',
                             len(commentValueList))
            dbProxy.execute(commentSql + ','.join(commentValueList))
        if len(replyValueList) > 0:
            self.logger.info('Insert %d records to reply',
                             len(replyValueList))
            dbProxy.execute(replySql + ','.join(replyValueList))

    def __writeCsv(self, fileName, articleList):
        '''
        Dump articles and their comments to a UTF-8 CSV (with BOM so Excel
        detects the encoding). Python 2 only: uses the file() builtin and
        byte-string rows.
        '''
        # f = codecs.open(fileName,"w","utf-8")
        csvfile = file(fileName, 'w')
        csvfile.write(codecs.BOM_UTF8)
        writer = csv.writer(csvfile)
        header = [
            'Title', 'Digest', 'Author', 'readCount', 'likeCount',
            'publishDateTime', 'Comment-NickName', 'Comment-Content',
            'Comment-likeCount', 'Comment-CreateTime'
        ]
        writer.writerow(header)
        for article in articleList:
            # One row per article, comment columns left blank...
            writer.writerow(
                (article.title.encode('utf8'), article.digest.encode('utf8'),
                 article.author.encode('utf8'), article.readCount,
                 article.likeCount,
                 time.strftime(ISOTIMEFORMAT,
                               time.localtime(article.publishDateTime)), '',
                 '', '', ''))
            # ...then one row per comment, article columns left blank.
            for comment in article.commentList:
                writer.writerow(
                    ('', '', '', '', '', '',
                     comment.commenterNickName.encode('utf8'),
                     comment.content.encode('utf8'), comment.likeCount,
                     time.strftime(ISOTIMEFORMAT,
                                   time.localtime(comment.createTime))))
        csvfile.close()
class DouBanCrawler(object):
    '''
    Crawler for Douban group topics (www.douban.com/group) found through the
    group search, including per-topic comments.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.

        @param channel: channel descriptor; channel.channel_id is stamped on
                        every Article/Comment produced
        @param logger: optional logger; defaults to the project-wide logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(sleepRange=[3, 8])
        self.channel = channel
        # Browser-like headers pinned to the douban host.
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.douban.com',
            'Upgrade-Insecure-Requests': '1'
        }

    def searchArticle(self, keywordList, endTime):
        '''
        Search group topics matching a keyword list, paging until a result
        falls outside the accepted time window.

        @param keywordList: list of keywords (joined with spaces)
        @param endTime: end of the search time range (datetime)
        @return: list of Article instances
        '''
        # NOTE(review): this startTime is passed down but __searchByPage
        # recomputes its own window (endTime - 2 days); effectively unused.
        startTime = endTime - datetime.timedelta(hours=1)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 0
        articleList = list()
        hasnext = True
        while hasnext:
            (articleListInPage,
             hasnext) = self.__searchByPage(keywordList, startTime, endTime,
                                            page)
            articleList.extend(articleListInPage)
            # Douban paginates by result offset, 50 per page.
            page += 50
        return articleList

    def __searchByPage(self, keywordList, startTime, endTime, page):
        '''
        Fetch and parse one page of Douban group search results.

        @return: (articleList, hasnext)
        '''
        # Search via douban's internal search box.
        page = str(page)  # the url expects the offset as a str
        # cat: search topics; sort: newest first; q: the keyword query
        search_url = "https://www.douban.com/group/search?start=" + page + "&cat=1013&sort=time&q="
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url + str(query)
        url_page = self.session.get(search_url, headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "article"})
        main_article_list = main_wrap.find('div', attrs={'class': "topics"})
        articleList = list()
        hasnext = True
        if main_article_list is not None:
            title_list = main_article_list.findAll('tr', {'class': 'pl'})
            for title in title_list:
                article_publishtime = title.find('td',
                                                 attrs={
                                                     'class': "td-time"
                                                 }).attrs['title']
                urlTime = time.strptime(article_publishtime,
                                        "%Y-%m-%d %H:%M:%S")
                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                # Compare as float timestamps.
                urlTime2 = time.mktime(urlTime2.timetuple())
                # Window is recomputed per item: last 2 days before endTime.
                startTime = endTime - datetime.timedelta(days=2)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())
                # Crawl only in-window items; results are time-sorted, so
                # the first out-of-window item ends the pagination.
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    article_url = title.find('a').attrs['href']
                    self.logger.debug(article_url)
                    (content, author_name, tid,
                     like_count) = self.crawlArticle(article_url)
                    if content is None:
                        # Topic deleted or in an unexpected format.
                        continue
                    article_title = title.find('td',
                                               attrs={
                                                   'class': "td-subject"
                                               }).text
                    article = Article(tid, self.channel.channel_id,
                                      article_title, content,
                                      article_publishtime, article_url, None,
                                      author_name)
                    self.crawlComment(article)
                    reply_count = title.find('td',
                                             attrs={
                                                 'class': "td-reply"
                                             }).text.strip()
                    # Strip the Chinese suffix, keeping only the digits.
                    # NOTE(review): stored as a string, unlike like_count.
                    reply_count = re.sub(u'回应', '', reply_count)
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    if article not in articleList:
                        articleList.append(article)
                else:
                    print len(articleList)
                    hasnext = False
                    break
        return (articleList, hasnext)

    def crawlStatistics(self, article):
        # Next time fetch just the counters instead of the whole page.
        '''
        Refresh statistics for an article.

        @return: nothing; statistics are written into the article instance
        '''
        try:
            article_main = self.session.get(article.url, headers=self.headers)
            soup = BeautifulSoup(article_main, 'lxml')
            # Reply count = number of <li> under the comments list; extend
            # here if more counters are needed.
            comment_list = soup.find('ul', attrs={'id': "comments"})
            comment_list_li = comment_list.findAll('li')
            article.statistics.reply_count = len(comment_list_li)
            # like_count lives on the ?type=like variant of the page
            url = article.url + '?type=like'
            article_main = self.session.get(url, headers=self.headers)
            soup = BeautifulSoup(article_main, 'lxml')
            main = soup.find('div', attrs={'class': "article"})
            lik_count_wrap = main.find('div',
                                       attrs={
                                           'class': "list topic-fav-list"
                                       })
            lik_count = lik_count_wrap.findAll('li')
            article.statistics.like_count = len(lik_count)
        except:
            # NOTE(review): bare except hides real errors behind one log line.
            self.logger.error(u'该话题已被删除或有其他格式')

    def crawlArticle(self, url):
        '''
        Crawl a topic page.

        @return: (content, author_name, tid, like_count); all None when the
                 topic is gone. NOTE(review): when the page fetch returns a
                 falsy body this method falls through and implicitly returns
                 None, which breaks tuple-unpacking at the caller.
        '''
        # The ?type=like variant also exposes the favourite list.
        url = url + '?type=like'
        html = self.session.get(url, headers=self.headers)
        if html:
            print '111'
            soup = BeautifulSoup(html, 'lxml')  # 'html.parser' also works
            main = soup.find('div', attrs={'class': "article"})
            if main is not None:
                print '222'
                Tauthor = main.find('span', attrs={'class': "from"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a').text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main.find('div',
                                     attrs={
                                         'class': "topic-content"
                                     }).text.strip()
                Tid_wrap = main.find('div', attrs={'class': "sns-bar"})
                Tid = Tid_wrap.find('a').attrs['data-tid']
                try:
                    lik_count_wrap = main.find('div',
                                               attrs={
                                                   'class':
                                                   "list topic-fav-list"
                                               })
                    lik_count = lik_count_wrap.findAll('li')
                    lik_count = len(lik_count)
                except:
                    # No favourite list rendered -> zero likes.
                    lik_count = 0
                return (Tcontent, Tauthor, Tid, lik_count)
            else:
                self.logger.error(u'该话题已被删除或存在其他格式')
                return (None, None, None, None)

    def refreshSearch(self):
        '''
        Reset search state. Currently a no-op.
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment-crawl state.
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article (first page only).

        @return: (commentList, hasnext) — hasnext is always False here.
        '''
        # self.logger.debug('Article:%s', article)
        html = self.session.get(article.url, headers=self.headers)
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        main_comment = BeautifulSoup(html, 'lxml')
        try:
            commentList_html = main_comment.find('ul',
                                                 attrs={'id': "comments"})
            commentList_html_li = commentList_html.findAll('li')
        except:
            self.logger.error(u'该页面已被删除或则存在其他格式:%s', article.url)
            return (commentList, False)
        for comment in commentList_html_li:
            cid = comment.attrs['data-cid']
            comment_main_wrap = comment.find('div',
                                             attrs={
                                                 'class': "reply-doc content"
                                             })
            user_id = comment_main_wrap.find('div',
                                             attrs={
                                                 'class': "operation_div"
                                             }).attrs['id']
            user_info_wrap = comment_main_wrap.find('div',
                                                    attrs={
                                                        'class':
                                                        "bg-img-green"
                                                    })
            user_name = user_info_wrap.find('a').text.strip()
            publish_datetime = user_info_wrap.find('span').text.strip()
            content = comment_main_wrap.find('p').text.strip()
            # A quoted block means this comment replies to another user;
            # extract that user's numeric id from the profile href.
            reply_user_wrap = comment_main_wrap.find('div',
                                                     attrs={
                                                         'class':
                                                         "reply-quote"
                                                     })
            if reply_user_wrap:
                reply_userid_wrap = reply_user_wrap.find('span',
                                                         attrs={
                                                             'class':
                                                             "pubdate"
                                                         })
                reply_userid = reply_userid_wrap.find('a').attrs['href']
                reply_userid = re.sub(r'\D', "", reply_userid)
            else:
                reply_userid = ''
            # like_count_wrap = comment_main_wrap.find('div',attrs={'class':"operation_div"})
            # like_count = like_count_wrap.findAll('a')[1].text
            # like_count = re.sub(r'\D', "", like_count)
            # Like counts are loaded dynamically, not in the static page.
            like_count = None
            reply_count = None
            location_coutry = 'CN'
            commentList.append(
                Comment(article.tid, self.channel.channel_id, cid,
                        add_datetime, publish_datetime, None, location_coutry,
                        None, None, user_id, user_name, content, reply_userid,
                        None, like_count, reply_count, dislike_count=None))
        # While testing: article[0][222].content retrieves comment #222.
        return (commentList, False)
class WetchartpublicCrawler(object):
    '''
    Crawler that resolves WeChat official accounts by exact name through the
    Sogou portal and walks their recent article list.

    NOTE(review): largely a skeleton — crawlArticle only fetches the page
    and most interface methods are no-ops.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.

        @param channel: channel descriptor
        @param logger: optional logger; defaults to the project-wide logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(logger=self.logger)
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        For each keyword (treated as an exact account name), find the
        matching official account on Sogou and crawl its article list.

        @param keywordList: account names to look up
        @param endTime: end of the search time range (currently unused here)
        '''
        for keyword in keywordList:
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword)
            self.logger.debug('pageUrl:%s', pageUrl)
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text)
            main = soup.find('ul', {'class': "news-list2"})
            if main is None:
                # NOTE(review): only logs — the next line dereferences main
                # and will raise AttributeError; a continue is likely missing.
                self.logger.error('Fail to parse:%s', response.text)
            li_list = main.findAll('li')
            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    # Only exact account-name matches are crawled.
                    if title == keyword:
                        a_href = a_title.find('a')['href']
                        self.logger.debug(a_href)
                        self.crawlWetchartpublic(a_href)

    def crawlWetchartpublic(self, url):
        '''
        Crawl an official account's profile page: extract the embedded
        msgList JSON from the inline scripts and crawl every listed article
        (headline and bundled secondary articles).
        '''
        response = self.session.get(url,
                                    textRspOnly=False,
                                    headers=HEADERS_SOGOU_PUBLIC)
        soup = BeautifulSoup(response.text)
        script_list = soup.findAll('script')
        for li in script_list:
            li_str = str(li.text)
            # The article list is assigned to msgList as a JS object literal
            # ending in '}]};' — slice between the markers (+3 keeps '}]}').
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelist = articleJson['list']
                for item in articlelist:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    # Undo HTML escaping of '&amp;' in the url.
                    url = url.replace('amp;', '')
                    self.crawlArticle(url)
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item[
                            'content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.crawlArticle(multi_url)

    def crawlArticle(self, url):
        '''
        Crawl an article page.

        @return: intended to return an Article instance, but currently only
                 fetches the page and returns None (stub).
        '''
        self.session.randomSleep()
        response = self.session.get(url, textRspOnly=False)

    def crawlStatistics(self, article):
        '''
        Refresh statistics for an article (stub).

        @param article: Article instance to update in place
        @return: nothing
        '''
        pass

    def refreshSearch(self):
        '''
        Reset search state (stub).
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment-crawl state (stub).
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of an article (stub).

        @return: intended (commentList, hasnext); currently None
        '''
        pass
class ZhihuCrawler(object):
    '''
    Crawler for Zhihu questions discovered through Baidu site search; parses
    the question's embedded JSON state and pages through its comments API.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.

        @param channel: channel descriptor (url, channel_id, search_ranges)
        @param logger: optional logger; defaults to the project-wide logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = SessionCrawler()
        self.channel = channel
        # Cursor for comment pagination; None means start from page 0.
        self.nextCommentUrl = None

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles matching a keyword list within the channel's
        configured time range ending at endTime.

        @param keywordList: list of keywords
        @param endTime: end of the search time range (datetime)
        @return: de-duplicated list of Article instances
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for baiduUrl in urls:
            url = self.__fetchRealUrlFromBaiduUrl(baiduUrl)
            article = self.crawlArticle(url)
            # The same article can appear several times in search results:
            # Baidu redirect urls are unique but may point to one article,
            # so de-duplicate here.
            if article is not None and article not in articleList:
                articleList.append(article)
        return articleList

    def __fetchRealUrlFromBaiduUrl(self, baiduUrl):
        '''
        Resolve a Baidu redirect url to the real target by reading the 302
        Location header without following it.

        NOTE(review): implicitly returns None on any non-302 response;
        crawlArticle then calls url.find(...) and would raise — confirm
        upstream guarantees.
        '''
        response = self.session.session.get(baiduUrl, allow_redirects=False)
        if response.status_code == 302:
            return response.headers['Location']

    def crawlArticle(self, url):
        '''
        Crawl a Zhihu question page.

        @return: an Article instance, or None for unsupported urls
        '''
        # Only question pages are supported for now; Baidu may also return
        # answer/column urls.
        if url.find('question') < 0:
            self.logger.warn('Question supported only:%s', url)
            return None
        article_id = re.findall(r'question/(\d+)', url)[0]
        self.session.randomSleep()
        response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
        soup = BeautifulSoup(response)
        # Zhihu embeds the full page state as JSON in the data-state attr.
        main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
        articleJson = json.loads(main)
        questionJson = articleJson['entities']['questions'][article_id]
        title = questionJson['title']
        # The editable detail is HTML; strip tags via BeautifulSoup.
        contentSoup = BeautifulSoup(questionJson['editableDetail'])
        content = contentSoup.text
        author_id = questionJson['author']['id']
        author_name = questionJson['author']['name']
        createTimeInFloat = questionJson['created']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(createTimeInFloat))
        reply_count = questionJson['commentCount']
        read_count = questionJson['visitCount']
        collect_count = questionJson['followerCount']
        article = Article(article_id, self.channel.channel_id, title, content,
                          publish_datetime, url, author_id, author_name)
        article.statistics.reply_count = reply_count
        article.statistics.read_count = read_count
        article.statistics.collect_count = collect_count
        return article

    def crawlStatistics(self, article):
        '''
        Refresh statistics by re-crawling the article page.

        @return: nothing; statistics are written into the article instance

        NOTE(review): crawlArticle can return None (unsupported url), which
        would raise AttributeError here.
        '''
        articleCopy = self.crawlArticle(article.url)
        article.statistics.reply_count = articleCopy.statistics.reply_count
        article.statistics.read_count = articleCopy.statistics.read_count
        article.statistics.collect_count = articleCopy.statistics.collect_count

    def refreshSearch(self):
        '''
        Reset search state. Currently a no-op.
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment pagination to the first page.
        '''
        self.nextCommentUrl = None

    def crawlComment(self, article):
        '''
        Crawl one page of an article's comments; call repeatedly (state kept
        in self.nextCommentUrl) until hasnext is False.

        @return: (commentList, hasnext) — commentList holds Comment
                 instances, hasnext tells whether more pages remain
        '''
        if self.nextCommentUrl is None:
            # First page: build from the template. The template uses '#' as
            # a stand-in that is swapped to '%' after interpolation —
            # presumably to protect url-encoded sequences; confirm against
            # COMMENT_URL_TEMPLATE's definition.
            curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
            curl = curl.replace('#', '%')
        else:
            curl = self.nextCommentUrl
        self.session.randomSleep()
        result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
        jo = json.loads(result)
        paging = jo['paging']
        hasnext = not paging['is_end']
        # The API hands back the cursor for the next page.
        self.nextCommentUrl = paging['next']
        dataList = jo['data']
        add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        commentList = list()
        for data in dataList:
            # self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
            # self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
            publish_datetime = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(data['updated_time']))
            comment = Comment(article.tid,
                              article.channel_id,
                              data['id'],
                              add_datetime,
                              publish_datetime,
                              ip_address=None,
                              location_country=None,
                              location_region=None,
                              location_city=None,
                              author_id=data['author']['id'],
                              author_name=data['author']['name'],
                              content=data['content'],
                              reply_author_id=None,
                              read_count=None,
                              like_count=data['voteup_count'],
                              reply_count=data['comment_count'],
                              dislike_count=None)
            commentList.append(comment)
        return (commentList, hasnext)
class WechatCrawler(object):
    '''
    Crawler for WeChat public-account articles, driven through the Sogou
    WeChat search frontend (s.weibo-style scraping, no official API).
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.

        @param channel: channel descriptor; channel_id and search_ranges are read.
        @param logger:  optional logger; falls back to the project default.
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        # Separate sessions: search pages, public-account pages, article pages.
        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles for each keyword within [endTime - search_ranges, endTime].

        @param keywordList: list of keyword strings
        @param endTime:     datetime marking the end of the search window
        @return: list of Article instances (deduplicated)
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeStr = startTime.strftime('%Y-%m-%d')
        endTimeStr = endTime.strftime('%Y-%m-%d')
        self.logger.debug('startTime:%s', startTimeStr)
        self.logger.debug('endTime:%s', endTimeStr)
        urlList = list()
        # Seed the result list with articles from the configured public accounts.
        publicList = self.getPublic()
        articleList = self.searchPublic(publicList, endTime)
        for keyword in keywordList:
            # First request carries no time range; it primes Sogou's cookies.
            pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote(
                keyword.encode('utf-8')), int(time.time() * 1000))).replace(
                    '#', '%')
            self.logger.debug('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            lastPageUrl = pageUrl
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')), startTimeStr, endTimeStr)
            # NOTE(review): the line below immediately overwrites the
            # time-ranged URL above with an un-ranged one — confirm which
            # template is intended; as written the date range is discarded.
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')))
            self.logger.debug('pageUrl:%s', pageUrl)
            while True:
                # Use the previous URL as this request's Referer.
                headers = HEADERS_SOGOU.copy()
                headers['Referer'] = lastPageUrl
                self.session.randomSleep()
                response = self.session.get(pageUrl,
                                            textRspOnly=False,
                                            headers=headers)
                soup = BeautifulSoup(response.text, 'lxml')
                main = soup.find('ul', {'class': "news-list"})
                if main is None:
                    # Page layout changed or we were blocked; give up entirely.
                    self.logger.error('Fail to parse:%s', response.text)
                    return []
                li_list = main.findAll('li')
                # li_list may be empty and yet more pages may exist.
                for li in li_list:
                    a_list = li.findAll('a')
                    for a in a_list:
                        if a['uigs'].startswith('article_title'):
                            #self.logger.debug('Article title:%s',a.text)
                            urlList.append((a['href'], pageUrl, a.text))
                            break
                pageBarList = soup.findAll('div', {'id': 'pagebar_container'})
                if len(pageBarList) == 0:
                    # No pagination bar: single results page, stop.
                    break
                pageBar = pageBarList[0]
                aList = pageBar.findAll('a')
                foundNextPage = False
                for a in aList:
                    if a['uigs'] == 'page_next':
                        foundNextPage = True
                        lastPageUrl = pageUrl
                        pageUrl = SOGOU_URL + a['href']
                        self.logger.debug('Found next page:%s', a.text)
                        break
                if foundNextPage is False:
                    break
        for item in urlList:
            # item = (href, referer pageUrl, title)
            article = self.crawlArticle(item[0],
                                        referer=item[1],
                                        title=item[2],
                                        flag=0)
            if article is not None:
                if article not in articleList:
                    # The same article can appear under several search URLs;
                    # dedupe before appending.
                    articleList.append(article)
        return articleList

    def searchPublic(self, keywordList, endTime):
        '''
        Search Sogou for public accounts whose name exactly matches each
        keyword and crawl their recent articles.

        @param keywordList: list of 1-tuples, each holding an account name
        @param endTime:     unused here; kept for interface symmetry
        @return: list of Article instances
        '''
        articleList = list()
        for keyword in keywordList:
            self.logger.debug(keyword)
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0])
            self.logger.info('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})
            if main is None:
                # NOTE(review): execution falls through after this log and
                # main.findAll below would raise AttributeError — likely a
                # missing `continue`.
                self.logger.error('Fail to parse:%s', response.text)
            li_list = main.findAll('li')
            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    # Only exact account-name matches are crawled.
                    if title == keyword[0]:
                        self.logger.debug(title)
                        a_href = a_title.find('a')['href']
                        sub_articleList = self.crawlWetchartpublic(a_href)
                        for article in sub_articleList:
                            articleList.append(article)
        return articleList

    def crawlWetchartpublic(self, url):
        '''
        Crawl the article list embedded in a public account's profile page.

        The page embeds a JSON blob assigned to `msgList` inside a <script>
        tag; extract it, parse it and crawl every referenced article.
        '''
        self.logger.debug(url)
        self.session_public.randomSleep()
        response = self.session_public.get(url,
                                           textRspOnly=False,
                                           headers=HEADERS_ARTICLE)
        soup = BeautifulSoup(response.text, 'lxml')
        self.logger.debug(soup)
        script_list = soup.findAll('script')
        # if len(script_list) == 0:
        #     script_list = self.isCrawlerPublic(url)
        articleList = list()
        for li in script_list:
            li_str = str(li.text)
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                # +3 keeps the closing '}]}' of the JSON literal while
                # dropping the trailing ';'.
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelistJson = articleJson['list']
                for item in articlelistJson:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    # Undo HTML entity escaping ('&amp;' -> '&').
                    url = url.replace('amp;', '')
                    self.logger.debug('article_url:' + url)
                    article = self.crawlArticle(url, flag=1)
                    # NOTE(review): unlike the multi-item branch below, this
                    # append is not guarded against a None article.
                    articleList.append(article)
                    # Secondary articles bundled with the same push message.
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item[
                            'content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.logger.debug('article_url:' + multi_url)
                        article = self.crawlArticle(multi_url, flag=1)
                        if article is not None:
                            articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Refresh statistics for an article; intentionally a no-op for WeChat
        (no statistics endpoint is scraped).
        '''
        pass

    def crawlArticle(self, url, **kwargs):
        '''
        Fetch and parse a single WeChat article page.

        @param url: article URL
        @param kwargs: flag (0 = came from search, needs referer; 1 = came
                       from a public-account page), referer, title
        @return: an Article instance, or None on any failure
        '''
        # TBD: re-posted (forwarded) articles are not handled yet.
        searchTitle = ''
        referer = None
        if (kwargs['flag'] == 0):
            # Search-sourced crawls must supply a referer.
            if 'referer' not in kwargs:
                return None
            if 'title' in kwargs:
                searchTitle = kwargs['title']
            else:
                searchTitle = ''
        try:
            self.session_public_article.randomSleep()
            if (kwargs['flag'] == 0):
                referer = kwargs['referer']
                headers = HEADERS_ARTICLE.copy()
                headers['Referer'] = referer
                # NOTE(review): the customised `headers` above is built but
                # HEADER_PUBLIC_ARTICLE is what is actually sent — confirm.
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE)
            else:
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE)
            # Article id = "<mid>-<idx>" scraped from inline JS variables.
            mid = re.findall(r'var mid = .*"(\d+)";',
                             response.text)[0] + '-' + re.findall(
                                 r'var idx = .*"(\d+)";', response.text)[0]
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('div', {'id': "img-content"})
            title = main.find('h2').text.strip()
            content = main.find('div', {'id': "js_content"}).text.strip()
            profile = main.find('div', {'class': "profile_inner"})
            author_id = profile.find('span').text.strip()
            author_name = profile.find('strong').text.strip()
            try:
                # The page only exposes a date; pad with midnight.
                publish_datetime = main.find('em', {
                    'id': "post-date"
                }).text.strip() + ' 00:00:00'
                publish_datetime = datetime.datetime.strptime(
                    publish_datetime, '%Y-%m-%d %H:%M:%S')
            except:
                self.logger.warn(
                    'Fail to parse publish_datetime, use current time as time')
                publish_datetime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime=publish_datetime,
                              url=url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info='{refer="%s"}' % referer)
            self.logger.debug('Successfully parse article:%s', title)
            return article
        except:
            # Any scrape/parse failure is logged and swallowed; caller gets None.
            self.logger.error('Fail to get article for %s: %s due to %s', url,
                              searchTitle, traceback.format_exc())
            return None

    def refreshSearch(self):
        '''Reset search state (stateless here, so a no-op).'''
        pass

    def refreshCommentCrawler(self):
        '''Reset comment crawling state (no comments are crawled, no-op).'''
        pass

    def crawlComment(self, article):
        '''
        Comments are not crawled for WeChat.

        @return: (commentList, hasnext) — always an empty list and False.
        '''
        return (list(), False)

    def getPublic(self):
        '''
        Load the list of public-account names to monitor from the database.

        @return: list of 1-tuples as returned by the DB cursor.
        '''
        self.conf = self.__readDbConf()
        publictablename = Constants.TABLE_SA_WETCHARTPUBLIC + Constants.TABLE_NAME_DELIMITER + self.entityId
        sql = ''' SELECT public_name FROM %s ''' % (publictablename)
        dbProxy = MySqlProxy(self.conf[CONF_FILE_DBCONF][CONF_FILE_DBHOST],
                             3306,
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBUSER],
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBPASS],
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBNAME])
        # dbProxy = MySqlProxy('localhost', 3306, 'root', 'zzm15331411', 'sentiment_re')
        # dbProxy = MySqlProxy('112.124.47.197', 3306, 'test', 'test', 'sa2')
        dbProxy.execute(sql)
        resultList = dbProxy.fetchall()
        # resultList = [(u'今日中大',),]
        return resultList

    def __readDbConf(self):
        '''Read the database configuration from conf/<CONF_FILE_NAME>.'''
        fileName = os.path.join('conf', CONF_FILE_NAME)
        c = Configuration(fileName)
        return c.readConfig()

    def getProxyPageList(self, url):
        '''Build the first 99 paginated URLs of the proxy-list site.'''
        url_list = []
        for i in range(1, 100):
            url_new = url + str(i)
            url_list.append(url_new)
        return url_list

    def getProxyIpList(self, url):
        '''Scrape "host:port" proxy candidates from one proxy-list page.'''
        response = self.session.get(url,
                                    textRspOnly=False,
                                    headers=PROXY_IP_URL_HEADER,
                                    timeout=2)
        host_list = etree.HTML(response.text).xpath(
            '//table[contains(@id,"ip_list")]/tr/td[2]/text()')
        port_list = etree.HTML(response.text).xpath(
            '//table[contains(@id,"ip_list")]/tr/td[3]/text()')
        ip_list = list()
        for i in range(0, len(host_list)):
            ip = host_list[i] + r':' + port_list[i]
            ip_list.append(ip)
        return ip_list

    def verifyIp(self, ip):
        '''
        Verify a proxy by fetching baidu.com through it.

        @return: the ip string if usable, "" otherwise. Side effect: installs
                 a global urllib2 opener using the proxy.
        '''
        proxy = {'http': 'http://' + ip}
        proxy_handler = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)
        test_url = 'http://www.baidu.com'
        req = urllib2.Request(url=test_url, headers=PROXY_IP_URL_HEADER)
        try:
            res = urllib2.urlopen(req)
            content = res.read()
            if content:
                self.logger.debug(r'https://' + ip + ' is OK')
            else:
                self.logger.debug(r'https://' + ip + ' is BAD')
                ip = ""
            return ip
        except urllib2.URLError as e:
            self.logger.debug(r'https://' + ip + ' ' + str(e.reason))
            return ""
        except:
            self.logger.debug(r'https://' + ip + ' Other Error')
            return ""

    def getVaildIp(self):
        '''Collect all proxies from the list site that pass verifyIp.'''
        page_url_list = self.getProxyPageList(PROXY_IP_URL)
        ip_list = list()
        for page_url in page_url_list:
            page_ip_list = self.getProxyIpList(page_url)
            for ip in page_ip_list:
                ip = self.verifyIp(ip)
                if ip != "":
                    ip_list.append(ip)
        return ip_list

    def isCrawlerPublic(self, url):
        '''
        Try proxies until one can fetch the public-account page and the page
        contains the `msgList` script; return the matching <script> nodes.
        '''
        data_script_list = list()
        page_url_list = self.getProxyPageList(PROXY_IP_URL)
        flag = 0
        for page_url in page_url_list:
            page_ip_list = self.getProxyIpList(page_url)
            flag = 0
            for ip in page_ip_list:
                flag = 0
                ip = self.verifyIp(ip)
                if ip != "":
                    # self.session_public.randomSleep()
                    proxy = {'http': r'http://' + ip}
                    try:
                        response = self.session_public.get(
                            url,
                            textRspOnly=True,
                            headers=HEADERS_ARTICLE,
                            proxies=proxy)
                        soup = BeautifulSoup(response.text, 'lxml')
                        self.logger.debug(response.text)
                        script_list = soup.findAll('script')
                        if len(script_list) != 0:
                            flag = 0
                            for li in script_list:
                                li_str = str(li.text)
                                sub_str1 = "msgList = "
                                if li_str.find(sub_str1) != -1:
                                    data_script_list.append(li)
                                    flag = 1
                                    break
                        # A hit ends the proxy search for this page.
                        if flag == 1:
                            break
                    except:
                        self.logger.debug(
                            'The ip can not be used to crawler public')
            if flag == 1:
                break
        if (flag == 0):
            self.logger.debug('The ip can not be used to crawler public')
        return data_script_list
class WeiboCrawler(object):
    '''
    Sina Weibo crawler: searches s.weibo.com, parses result cards, logs in
    with a pooled set of accounts and persists cookies per account on disk.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor.

        @param channel: channel descriptor; channel_id is read.
        @param logger:  optional logger; falls back to the project default.
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel
        self.entityId = 'SYSU'
        # Active account credentials; filled by change_cookie()/
        # get_username_password().
        self.user_name = None
        self.pass_word = None
        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = 'Weibo crawler monitor'
        # Scheme prefix prepended to scheme-relative links scraped from pages.
        self.HTTP = 'https:'
        self.session = SessionCrawler(sleepRange=[3, 8])

    def change_cookie(self, user_name=None):
        '''
        Pick the account whose on-disk cookie should be used; random choice
        from the pool unless a user_name is given. Sets self.user_name.
        '''
        # usename_list = [
        #     '18814095644','13432881156','*****@*****.**','15018377821','*****@*****.**',
        #     '15767199023','13427287354','13532011721','13640792755','13794342903',
        #     '17825769929','18211493432','17827278983','13922771190','15999916968',
        #     '15119820746','15986585396','13430915912','15012471375','17880567972',
        #     '17876156948','15915132451','13543985544','13717382951','13640695490',
        #     '15711707673','13680181412','13414759320','17820956139','18476072534',
        #     '17806699214','13418852766','17827181603','15919354070','15088137907'
        # ]
        user_name_list = [
            '13432881156', '*****@*****.**', '15018377821', '*****@*****.**',
            '15767199023', '18814095644'
        ]
        if user_name is None:
            user_name = random.choice(user_name_list)
        else:
            user_name = user_name
        self.user_name = user_name

    def get_username_password(self, user_name=None):
        '''
        Resolve an account's password from the pool (random account when
        user_name is None). Sets self.user_name and self.pass_word.
        '''
        # user_name_password1 = '18814095644:ljda.18814095644'
        # user_name_password2 = '13432881156:liang452035397'
        # user_name_password3 = '[email protected]:810214bee810214'
        # user_name_password4 = '15018377821:zzm15331411'
        # user_name_password5 = '15767199023:j980216'
        # user_name_password6 = '[email protected]:uwinvip'
        user_dict = {
            '18814095644': 'ljda.18814095644',
            '13432881156': 'liang452035397',
            '*****@*****.**': '810214bee810214',
            '15018377821': 'zzm15331411',
            '15767199023': 'j980216',
            '*****@*****.**': 'uwinvip'
        }
        user_name_list = [
            '13432881156', '*****@*****.**', '15018377821', '*****@*****.**',
            '15767199023', '18814095644'
        ]
        # user_list = [user_name_password1, user_name_password2,
        #              user_name_password3, user_name_password4,
        #              user_name_password5, user_name_password6]
        # Fall back to a random account when none was specified.
        if user_name is None:
            user_name = random.choice(user_name_list)
        else:
            user_name = user_name
        user_password = user_dict[user_name]
        self.user_name = user_name
        self.pass_word = user_password

    def searchArticle(self, keyword_list, end_time):
        '''
        Search Weibo for the keywords within the 2.5 hours before end_time.

        @param keyword_list: list of keyword strings
        @param end_time:     datetime marking the end of the search window
        @return: list of Article instances
        '''
        # Manual cookie refresh, if ever needed:
        # self.__login()
        run_msg = '微博爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        # Search window.
        start_time = end_time - datetime.timedelta(hours=2.5)
        page = 1
        count_page = 0
        article_list = list()
        page_url_list = self.__search_page_url(keyword_list, start_time,
                                               end_time, page)
        # No result pages: assume the cookie expired, re-login and retry once.
        # NOTE(review): __search_page_url can return None on its error paths,
        # which would make len() raise here — confirm intended behavior.
        if len(page_url_list) == 0:
            self.__login(self.user_name)
            page_url_list = self.__search_page_url(keyword_list, start_time,
                                                   end_time, page)
        for page_url in page_url_list:
            time.sleep(1.5)
            count_page += 1
            self.logger.debug('Start to crawl: %s' % page_url)
            article_list_page = self.__parse_page_url(page_url, count_page)
            article_list.extend(article_list_page)
        # NOTE(review): article_list is always a list, never None, so this
        # alert branch can never fire — probably meant `if not article_list`.
        if article_list is None:
            article_msg = '微博没有爬取到数据'
            self.email.send(self.monitor_title, article_msg)
            self.db.Insert(self.channel.channel_id, self.entityId, article_msg)
        return article_list

    def __search_page_url(self, keywordList, startTime, endTime, page):
        '''
        Run the search query and collect the URLs of all result pages.

        @param keywordList: keywords joined into one query
        @param startTime:   search window start (datetime)
        @param endTime:     search window end (datetime)
        @param page:        page number for the initial request
        @return: list of result-page URLs, or None on unrecoverable failure
        '''
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'typeall': '1',
            'suball': '1',  # include all post types
            'timescope':
            'custom:%s:%s' % (startTime.strftime("%Y-%m-%d-%H"),
                              (endTime.strftime("%Y-%m-%d-%H"))),  # time range
            # Weibo search expects this exact custom "%Y-%m-%d-%H" range format.
            'Refer': 'g',
            'page': page
        }
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }
        index_url = 'http://s.weibo.com/weibo/' + query  # search landing page
        # Pick a random pooled account/cookie.
        self.change_cookie()
        self.logger.debug('Use Cookie %s' % self.user_name)
        # If the stored cookie is missing/stale, fall back to a fresh login.
        try:
            cookies = self.__load_cookies_from_lwp(self.user_name)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # use on-disk cookies
            # print(html)
            response = etree.HTML(html)
            # Pagination links enumerate every result page.
            page_urls = response.xpath(
                '//div[@class="m-page"]//span[@class="list"]//li/a/@href')
            page_url_list = list()
            for page in page_urls:
                url = self.HTTP + page
                page_url_list.append(url)
                print(url)
            return page_url_list
        except Exception as e:
            loginFlag = self.__login(self.user_name)
            self.logger.debug('Use username: %s' % self.user_name)
            if loginFlag is False:
                self.logger.error('Fail to login')
                login_msg = 'Weibo login field! Username is %s' % self.user_name
                self.email.send(self.monitor_title, login_msg)
                self.db.Insert(self.channel.channel_id, self.entityId,
                               login_msg)
                return
            cookies = self.__load_cookies_from_lwp(self.user_name)
            self.logger.debug('Get a new Cookie: %s' % cookies)
            try:
                html = self.session.get(index_url,
                                        params=params,
                                        headers=headers,
                                        cookies=cookies)  # use fresh cookies
                response = etree.HTML(html)
                # Pagination links enumerate every result page.
                page_urls = response.xpath(
                    '//div[@class="m-page"]//span[@class="list"]//li/a/@href')
                page_url_list = list()
                for page in page_urls:
                    url = self.HTTP + page
                    page_url_list.append(url)
                return page_url_list
            except Exception as e:
                self.logger.warning('Crawler failed: %s' % e)
                msg = 'No return data, because of %s' % e
                self.email.send(self.monitor_title, msg)
                self.db.Insert(self.channel.channel_id, self.entityId, msg)

    def __parse_page_url(self, page_url, count_page=0):
        '''
        Parse one search-result page into Article instances.

        @param page_url:   URL of the result page
        @param count_page: 1-based page index, for logging only
        @return: article_list
        '''
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }
        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        response = self.session.get(url=page_url,
                                    headers=headers,
                                    cookies=cookies)
        # Cookie rejected ("File not found" style body): re-login and retry.
        if re.match(r'File not .*', response) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            response = self.session.get(url=page_url,
                                        headers=headers,
                                        cookies=cookies)
        html = etree.HTML(response)
        # One card-wrap div per weibo post.
        card_wrap_list = html.xpath(
            '//div[@class="m-wrap"]/div[@id="pl_feedlist_index"]//div[@class="card-wrap"]'
        )
        article_list = list()
        for card_wrap in card_wrap_list:
            try:
                # Cards without a mid are ads/filler — skip them.
                mid = card_wrap.xpath('./@mid')[0]
            except Exception as e:
                continue
            info = card_wrap.xpath('.//div[@class="content"]')[0]
            author_name = info.xpath('.//a[@class="name"]/text()')[0]
            author_id = re.search(
                r'/(\d+).?refer_flag',
                info.xpath('.//a[@class="name"]/@href')[0]).group(1)
            txt = info.xpath('.//p[@class="txt"]')[0]
            content = txt.xpath('string(.)').strip()
            # Title = first 30 chars, escaped for later SQL insertion.
            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            article_info = info.xpath('.//p[@class="from"]/a')[0]
            article_url = self.HTTP + article_info.xpath('./@href')[0]
            # Action bar: [collect, forward, reply] texts plus a like <em>.
            statistics = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/text()')
            collect_count = re.search(r'(\d+)', statistics[0])
            forward_count = re.search(r'(\d+)', statistics[1])
            reply_count = re.search(r'(\d+)', statistics[2])
            like_count = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/em/text()')
            # Missing numbers mean zero for each counter.
            if collect_count:
                collect_count = collect_count.group(1).strip()
            else:
                collect_count = 0
            if forward_count:
                forward_count = forward_count.group(1).strip()
            else:
                forward_count = 0
            if reply_count:
                reply_count = reply_count.group(1).strip()
            else:
                reply_count = 0
            if len(like_count):
                like_count = like_count[0]
            else:
                like_count = 0
            publish_datetime = self.__parse_time(article_url, cookies)
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              url=article_url,
                              publish_datetime=publish_datetime,
                              author_id=author_id,
                              author_name=author_name)
            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count
            if article not in article_list:
                article_list.append(article)
        self.logger.debug('%d page crawl article is %s' %
                          (count_page, len(article_list)))
        return article_list

    def __parse_time(self, article_url, cookies):
        '''
        Fetch the article's comment view and extract its publish timestamp
        from the embedded `date="..."` attribute (epoch milliseconds; the
        first 10 digits are taken as seconds). Falls back to now() on failure.
        '''
        time.sleep(1.5)
        url = '%s&type=comment' % article_url
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': user_agent
        }
        response = self.session.get(url, headers=headers, cookies=cookies)
        try:
            date_str = re.search(r'date=\\"(\d+)\\"', response).group(1)
            localtime = time.localtime(int(date_str[0:10]))
            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', localtime)
        except Exception as e:
            publish_datetime = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
        return publish_datetime

    def crawlStatistics(self, article):
        '''
        Refresh an article's statistics in place by re-fetching its page.
        No return value; `article` is mutated.
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)
            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s',
                                    article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __parseStatistics(self, article, soup_li_list):
        '''
        Extract [collect, forward, reply, like] counts from the action-bar
        <li> nodes (new page layout) and store them on `article`.
        '''
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text
        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0
        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0
        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __fetchSingleArticle(self, article):
        '''
        Fetch the raw detail-page HTML fragment for an article.

        @return: (data, check) where check is '0' on success (data = HTML
                 fragment), '1' for a 404-like page, '2' for a page error.
        '''
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)",
        ]
        randdom_header = random.choice(user_agent)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weibo.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': '%s' % randdom_header
        }
        data2 = {'type': 'comment'}
        print article.url
        time.sleep(2)
        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        html = self.session.get(article.url,
                                headers=headers,
                                params=data2,
                                cookies=cookies)
        # Cookie rejected: re-login with the same account and retry once.
        if re.match(r'File not .*', html) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            html = self.session.get(article.url,
                                    headers=headers,
                                    params=data2,
                                    cookies=cookies)
        if html.find('<div class="page_error">') == -1:
            # The detail view is embedded as a FM.view({...}) JSON payload.
            lindex = html.find(
                '<script>FM.view({"ns":"pl.content.weiboDetail.index"')
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1
            lindex = lindex + len('<script>FM.view(')
            # self.logger.debug(html[lindex:rindex])
            try:
                jo = json.loads(html[lindex:rindex])
                data = jo['html']  # live weibo detail page fragment
                return (data, '0')
            except:
                return ({}, '1')
        else:
            return ({}, '2')

    def __get_username(self, user_name):
        """ get legal username: URL-quote then base64 per Sina SSO protocol """
        username_quote = urllib.quote(user_name)
        username_base64 = base64.b64encode(username_quote.encode("utf-8"))
        return username_base64.decode("utf-8")

    def __get_password(self, servertime, nonce, pubkey):
        """ get legal password: RSA-encrypt "servertime\\tnonce\\npassword" """
        string = (str(servertime) + "\t" + str(nonce) + "\n" +
                  str(self.pass_word)).encode("utf-8")
        public_key = rsa.PublicKey(int(pubkey, 16), int("10001", 16))
        password = rsa.encrypt(string, public_key)
        password = binascii.b2a_hex(password)
        return password.decode()

    def __get_json_data(self, su_value):
        """
        get the value of "servertime", "nonce", "pubkey", "rsakv" and
        "showpin", etc from Sina's prelogin endpoint; {} on failure
        """
        params = {
            "entry": "weibo",
            "callback": "sinaSSOController.preloginCallBack",
            "rsakt": "mod",
            "checkpin": "1",
            "client": "ssologin.js(v1.4.18)",
            "su": su_value,
            "_": int(time.time() * 1000),
        }
        try:
            response = self.session.get(
                "http://login.sina.com.cn/sso/prelogin.php", params=params)
            # Response is a JSONP callback; strip the wrapper parentheses.
            json_data = json.loads(
                re.search(r"\((?P<data>.*)\)", response).group("data"))
        except Exception:
            json_data = {}
            self.logger.error("WeiBoLogin get_json_data error: %s",
                              traceback.format_exc())
        self.logger.debug("WeiBoLogin get_json_data: %s", json_data)
        return json_data

    def __login(self, user_name=None):
        '''
        Perform the full Sina SSO login flow for `user_name` (random account
        when None) and persist the resulting cookies to disk.

        @return: True on success, False otherwise.
        '''
        self.user_uniqueid = None
        self.user_nick = None
        # Resolve username and password for the requested account.
        self.get_username_password(user_name)
        # get json data
        s_user_name = self.__get_username(self.user_name)  # user_name
        self.logger.debug('login weibo use %s' % self.user_name)
        json_data = self.__get_json_data(su_value=s_user_name)
        if not json_data:
            return False
        s_pass_word = self.__get_password(json_data["servertime"],
                                          json_data["nonce"],
                                          json_data["pubkey"])
        # make post_data
        post_data = {
            "entry": "weibo",
            "gateway": "1",
            "from": "",
            "savestate": "7",
            "userticket": "1",
            "vsnf": "1",
            "service": "miniblog",
            "encoding": "UTF-8",
            "pwencode": "rsa2",
            "sr": "1280*800",
            "prelt": "529",
            "url":
            "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
            "rsakv": json_data["rsakv"],
            "servertime": json_data["servertime"],
            "nonce": json_data["nonce"],
            "su": s_user_name,
            "sp": s_pass_word,
            "returntype": "TEXT",
        }
        # get captcha code — requires a human to read captcha.jpg and type it.
        if json_data["showpin"] == 1:
            # captcha_msg = '微博爬虫进入验证码页面,虫子已被反爬'
            # self.email.send(self.monitor_title,captcha_msg)
            # self.db.Insert(self.channel.channel_id,self.entityId,captcha_msg)
            url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(
                time.time()), json_data["pcid"])
            print(url)
            with open("captcha.jpg", "wb") as file_out:
                file_out.write(
                    self.session.get(url, textRspOnly=False).content)
            print self.session.get(url, textRspOnly=False).content
            code = raw_input("请输入验证码:")
            post_data["pcid"] = json_data["pcid"]
            post_data["door"] = code
        # login weibo.com
        login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(
            time.time())
        json_data_1 = self.session.post(login_url_1,
                                        data=post_data,
                                        textRspOnly=False).json()
        if json_data_1["retcode"] == "0":
            # Second leg: exchange the ticket for session cookies.
            params = {
                "callback": "sinaSSOController.callbackLoginStatus",
                "client": "ssologin.js(v1.4.18)",
                "ticket": json_data_1["ticket"],
                "ssosavestate": int(time.time()),
                "_": int(time.time() * 1000),
            }
            response = self.session.get(
                "https://passport.weibo.com/wbsso/login",
                textRspOnly=False,
                params=params)
            json_data_2 = json.loads(
                re.search(r"\((?P<result>.*)\)",
                          response.text).group("result"))
            if json_data_2["result"] is True:
                self.user_uniqueid = json_data_2["userinfo"]["uniqueid"]
                self.user_nick = json_data_2["userinfo"]["displayname"]
                self.logger.info("WeiBoLogin succeed!")
                # Persist cookies locally for reuse by later requests.
                self.__save_cookies_lwp(response.cookies, self.user_name)
            else:
                self.logger.warning("WeiBoLogin failed: %s", json_data_2)
        else:
            self.logger.warning("WeiBoLogin failed: %s", json_data_1)
        return True if self.user_uniqueid and self.user_nick else False

    def __save_cookies_lwp(self, cookiejar, usename):
        """
        Save cookies to the local per-account LWP cookie file.
        Cookies are copied into an LWPCookieJar because requests' jar
        stores the 'rest' attribute under '_rest'.
        """
        filename = 'sina_cookie_pool/sinaweibocookies_%s' % usename
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self, usename):
        """
        Load the per-account cookie file; on any failure, log in fresh and
        fall back to the default account's cookie file.
        """
        lwp_cookiejar = None
        try:
            filename = 'sina_cookie_pool/sinaweibocookies_%s' % usename
            self.logger.debug('Use cookie filename is %s' % usename)
            lwp_cookiejar = cookielib.LWPCookieJar()
            lwp_cookiejar.load(filename, ignore_discard=True)
        except:
            # NOTE(review): __login() picks a random account here, but the
            # fallback file loaded below is a fixed one — confirm this pairing.
            self.__login()
            filename = 'sina_cookie_pool/sinaweibocookies_%s' % '*****@*****.**'
            self.logger.debug('cookies_filename: %s' % filename)
            lwp_cookiejar = cookielib.LWPCookieJar()
            lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

    def refreshSearch(self):
        '''Reset search state (stateless, no-op).'''
        pass

    def refreshCommentCrawler(self):
        '''Reset comment pagination state.'''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl up to the first 20 top-level comments of an article.

        @return: (commentList, hasnext) — hasnext is always False since only
                 the first comment page is fetched.
        '''
        # return (list(), False)
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        headers = {
            'Accept': '* / *',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content - Type': 'application / x - www - form - urlencoded',
            'Host': 'weibo.com',
            'Referer': '%s' % article.url[:article.url.find('?') + 1],
            'refer_flag': '1001030103_ & type = comment',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        }
        data1 = {'ajwvr': '6', 'id': '%s' % article.tid, 'from': 'singleWeiBo'}
        re_url = 'https://weibo.com/aj/v6/comment/big'
        html = self.session.get(re_url,
                                params=data1,
                                headers=headers,
                                cookies=self.__load_cookies_from_lwp(
                                    self.user_name))
        jo = json.loads(html)
        data = jo['data']['html']
        # self.logger.error( data)
        soup = BeautifulSoup(data, 'lxml')
        # WB_empty marks "no comments yet".
        if soup.find('div', {'class': "WB_empty"}) is None:
            commentbox = soup.find('div', {'class': "list_box"})
            root_commentlist = commentbox.find('div',
                                               {'node-type': "comment_list"})
            root_commentlist2 = root_commentlist.find_all(
                'div', {'node-type': "root_comment"})
            count_comment = 1
            self.logger.debug('root_commentlist:%d %s',
                              len(root_commentlist2), article.url)
            for root_comment in root_commentlist2:
                if count_comment > 20:
                    break  # only the first 20 comments are kept
                self.logger.error('count_comment:%d', count_comment)
                comment_id = root_comment.attrs['comment_id']  # top-level comment id
                list_con = root_comment.find('div', {'class': "list_con"})
                firstcomentwrap = list_con.find('div', {'class': "WB_text"})
                firstcoment = firstcomentwrap.text.strip()  # comment text
                useridwrap = firstcomentwrap.find('a')
                user_id = useridwrap.attrs['usercard']  # commenter id
                user_id = re.findall(r'id=(\d+)', user_id)[0]
                # NOTE(review): `.test` looks like a typo for `.text`; as
                # written it is always None, so user_name is always ' '.
                user_name = useridwrap.test  # commenter name
                if user_name is None:
                    user_name = ' '
                publish_timediv = list_con.find('div',
                                                {'class': "WB_func clearfix"})
                try:
                    publish_time = self.parseDateTime(
                        publish_timediv.findAll('div')[1].text)  # post time
                except:
                    continue
                # self.logger.error(publish_time)
                # if publish_time < datetime.datetime.now()-datetime.timedelta(hours=48):
                #     break
                like_count_div = publish_timediv.findAll('div')[0]
                try:
                    like_count_li = like_count_div.findAll('li')[3]
                    like_count = like_count_li.findAll('em')[1].text
                except:
                    like_count = 0
                # A bare "like" label means zero likes.
                if u'赞' == like_count:
                    like_count = 0
                commentList.append(
                    Comment(article.tid, self.channel.channel_id, comment_id,
                            add_datetime, publish_time, None, None, None,
                            None, user_id, user_name, firstcoment, None, None,
                            like_count, None, dislike_count=None))
                count_comment += 1  # comment counter
        return (commentList, False)

    def parseDateTime(self, datetimeStr):
        '''
        Parse Weibo's relative/short timestamps into datetime objects:
        "N秒前" / "N分钟前" (N seconds/minutes ago), "今天HH:MM" (today),
        "M月D日 HH:MM" (this year), else a full "%Y-%m-%d %H:%M" string.
        '''
        # NOTE(review): the first two tests use `> 0` while the later ones use
        # `>= 0`, so a marker at index 0 would be missed — confirm intent.
        if datetimeStr.find(u'\u79d2\u524d') > 0:
            secondsDelta = float(datetimeStr.replace(u'\u79d2\u524d', ''))
            return datetime.datetime.now() - datetime.timedelta(
                seconds=secondsDelta)
        if datetimeStr.find(u'\u5206\u949f\u524d') > 0:
            secondsDelta = float(datetimeStr.replace(u'\u5206\u949f\u524d',
                                                     '')) * 60
            return datetime.datetime.now() - datetime.timedelta(
                seconds=secondsDelta)
        if datetimeStr.find(u'\u4eca\u5929') >= 0:
            datetimeStr = datetime.datetime.today().strftime(
                '%Y-%m-%d') + datetimeStr.replace(u'\u4eca\u5929', '')
            return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')
        if datetimeStr.find(u'\u6708') >= 0:
            datetimeStr = str(datetime.datetime.today().year) + '-' + \
                datetimeStr.replace(u'\u6708', '-').replace(u'\u65e5','')
            return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')
        return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')