class BaiduCrawler(object):
    '''
    Crawler that queries Baidu web search restricted to a given site and
    collects the result urls. URL (the Baidu search endpoint) and MAXPAGE
    (the page limit) are expected to be defined at module level.
    '''

    def __init__(self, logger=None):
        '''
        Constructor
        '''
        self.session = SessionCrawler()
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

    def search(self, site, keywordList, startTimeIntSecond, endTimeIntSecond):
        keywordStr = ' | '.join(keywordList)
        qstr = 'site:' + '(' + site + ') (' + keywordStr + ')'  # 20180712.Jondar: changed query format
        intime = 'stf=%d,%d|stftype=2' % (startTimeIntSecond, endTimeIntSecond)
        page = 1
        urlList = list()
        while page < MAXPAGE:
            # Compute the pagination parameter (Baidu counts results in steps of 10)
            pageParam = str(page - 1) + '0'
            data = {
                'gpc': intime,
                # 'si': '(' + site + ')',
                'si': site,
                'bs': qstr,
                'wd': qstr,
                'oq': qstr,
                'pn': pageParam
            }
            response = self.session.download(URL, encoding='utf-8', data=data, addr=True)
            soup = BeautifulSoup(response['html'], 'html.parser')
            # Extract the result urls from the page
            divContent = soup.find('div', attrs={'id': "content_left"})
            if divContent is None:
                break
            urlDivList = divContent.find_all('div', attrs={'class': "result c-container "})
            for urlDiv in urlDivList:
                try:
                    url = urlDiv.find('h3').find('a').attrs['href']
                    # tasks.put_nowait(url)
                    urlList.append(url)
                except Exception:
                    continue
            # Pagination control: only advance if the next page number is listed
            pageHrefList = soup.find('div', attrs={'id': "page"}).find_all('a')
            # pageList = map(lambda x: int(str(x.text.strip())), pageHrefList)
            pageList = []
            for p in pageHrefList:
                pa = p.text.strip()
                pageList.append(pa)
            if str(page + 1) in pageList:
                page += 1
            else:
                break
        return urlList
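# Usage sketch (illustrative only): a minimal, hedged example of driving
# BaiduCrawler.search. It assumes URL/MAXPAGE and the SessionCrawler/Logging
# helpers are defined elsewhere in this project; the site and keywords below
# are placeholders. search() takes the time range as epoch seconds, which feed
# Baidu's 'gpc=stf=<start>,<end>|stftype=2' filter.
def _example_baidu_search():
    import time
    import datetime
    crawler = BaiduCrawler()
    end = datetime.datetime.now()
    start = end - datetime.timedelta(days=7)
    urls = crawler.search(site='tieba.baidu.com',
                          keywordList=[u'keyword1', u'keyword2'],
                          startTimeIntSecond=int(time.mktime(start.timetuple())),
                          endTimeIntSecond=int(time.mktime(end.timetuple())))
    for u in urls:
        print(u)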
class BaiduTieBaCrawler(object):
    '''
    Crawler for Baidu Tieba: searches posts by keyword through the Tieba
    search page and crawls each post's content, statistics and comments.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.site = 'tieba.baidu.com'  # search site
        self.url = 'http://tieba.baidu.com/f/search/res'
        self.tiebaUrl_list = list()
        self.session = SessionCrawler(sleepRange=[3, 5])
        self.channel = channel
        self.count_page = 0  # optimization: the page list only needs to be computed once
        self.pageList = []  # page numbers found by __searchByPage
        self.articleList = list()  # global list used to de-duplicate articles

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within a time range
        @param keywordList: list of keywords
        @param endTime: end of the search time range; the start is derived from
                        channel.search_ranges (in days)
        '''
        page = 1
        hasnext = True
        startTime = endTime - datetime.timedelta(days=self.channel.search_ranges)
        while hasnext:
            (urllist, hasnext) = self.__searchByPage(keywordList, startTime, endTime, page)
            self.__parseSearchPage(urllist)  # crawl the posts behind the collected urls
            page += 1
        return self.articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; the statistics are written into the article instance
        '''
        html = self.session.download(article.url, encoding='utf-8', data=None,
                                     isJson=False, timeout=10, retry=3, addr=True)
        soup = BeautifulSoup(html['html'], 'lxml')
        try:
            Treply = soup.find('li', attrs={'class': "l_reply_num"}).find('span').text.strip()  # total reply count
            article.statistics.reply_count = Treply
        except Exception:
            return

    # First collect a url list through the Tieba search page
    def __searchByPage(self, keywordList, startTime, endTime, page):
        data = {
            'ie': 'utf-8',
            'kw': '',  # Tieba (forum) name
            'qw': keywordList,  # keywords
            # 'rn': '60',  # results per page
            'un': '',  # user name
            'only_thread': '1',
            'sm': '1',  # sort by time, descending
            # 'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d-%H"), (endTime + datetime.timedelta(hours=1)).strftime("%Y-%m-%d-%H")),  # time range
            'sd': '',
            'ed': '',
            'pn': page  # page number
        }
        html = self.session.download(self.url, encoding='utf-8', data=data,
                                     isJson=False, timeout=10, retry=3, addr=True)
        soup = BeautifulSoup(html['html'], 'html.parser')
        main = soup.find('div', attrs={'class': "s_post_list"})
        try:
            urls = main.find_all('div', attrs={'class': "s_post"})  # result entries on the Tieba search page
        except Exception:
            self.logger.warn(u'No urls found')
            # return a tuple so the caller can still unpack the result
            return (self.tiebaUrl_list, False)
        self.logger.debug(u'Found %d urls', len(urls))
        if self.count_page == 0:  # only executed once: collect the available page numbers
            self.count_page = 1
            pages = soup.find('div', attrs={'class': "pager pager-search"}).find_all('a')
            for p in pages:
                pa = p.text.strip()
                self.pageList.append(pa)
        if str(page + 1) in self.pageList:
            hasnext = True
        else:
            hasnext = False
        for i in urls:
            urlTime = i.find('font', attrs={'class': "p_green p_date"}).text.strip()
            urlTime = time.strptime(urlTime, "%Y-%m-%d %H:%M")
            Y, M, D, H = urlTime[0:4]
            urlTime2 = datetime.datetime(Y, M, D, H)
            if urlTime2 >= startTime and urlTime2 <= endTime:
                try:
                    url = i.find('span', attrs={'class': "p_title"}).find('a').attrs['data-tid']
                    url = 'http://tieba.baidu.com/p/' + url
                    if url not in self.tiebaUrl_list:
                        self.tiebaUrl_list.append(url)
                except Exception:
                    continue
            else:
                hasnext = False
                break
        return (self.tiebaUrl_list, hasnext)

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics behind a url
        @return: an Article instance
        '''
        article = None
        html = self.session.download(url, encoding='utf-8', data=None,
                                     isJson=False, timeout=10, retry=3, addr=True)
        article_url = html['url']
        # if article_url.find(self.channel.url) < 0:
        #     self.logger.warn('Unrelated url found:%s', url)
        #     continue
        # self.logger.debug(article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return article
        main = soup.find('div', attrs={'class': "left_section"})
        if main:
            Ttitle = main.find('div', attrs={'id': "j_core_title_wrap"}).find('h1')
            Ttitle1 = main.find('div', attrs={'id': "j_core_title_wrap"}).find('h3')
            if Ttitle:
                Ttitle = Ttitle.text.strip()
            elif Ttitle1:
                Ttitle = Ttitle1.text.strip()
            else:
                Ttitle = ''
            # self.logger.debug(u'Title %s', Ttitle)
            data_field = main.find('div', attrs={'id': "j_p_postlist"}).find('div').attrs['data-field'].strip()
            data_field = json.loads(data_field)
            publish_datetime = data_field['content']
            if 'date' in publish_datetime.keys():
                publish_datetime = publish_datetime['date']
            else:
                publish_datetime = main.find('div', attrs={'id': "j_p_postlist"}).find('div').find_all(
                    'span', attrs={'class': "tail-info"})[-1].text.strip()
            # Normalize to 'YYYY-MM-DD HH:MM:SS'
            publish_datetime = re.findall(
                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', publish_datetime)[0]
            if len(publish_datetime[4]) > 1:
                publish_datetime = publish_datetime[0] + '-' + publish_datetime[1] + '-' + \
                    publish_datetime[2] + ' ' + publish_datetime[3] + publish_datetime[4]
            else:
                publish_datetime = publish_datetime[0] + '-' + publish_datetime[1] + '-' + \
                    publish_datetime[2] + ' ' + publish_datetime[3] + ':00'
            Tid = data_field['author']['user_id']
            Tauthor = data_field['author']['user_name']
            Treply = soup.find('li', attrs={'class': "l_reply_num"}).find('span').text.strip()  # total reply count
            Tcontent = main.find('div', attrs={'id': "j_p_postlist"}).find('div').find('cc').text.strip()
            article = Article(Tid, self.channel.channel_id, Ttitle, Tcontent, publish_datetime,
                              url=article_url, author_id=None, author_name=Tauthor, meta_info=None)
            article.statistics.reply_count = Treply
        else:
            self.logger.warn(u'Sorry, this post has been deleted. %s', article_url)
        return article

    # Visit the collected urls one by one
    def __parseSearchPage(self, urllist):
        '''
        Crawl every url in urllist and append new articles to self.articleList
        '''
        self.logger.info(u'Visiting the collected urls, please wait')
        for url in urllist:
            article = self.crawlArticle(url)
            if article not in self.articleList and article is not None:
                self.articleList.append(article)

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article
        @return: a (commentList, hasnext) tuple; commentList is a list of
                 Comment instances, hasnext indicates whether more remain to crawl
        '''
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        page = 1
        while page <= 30:
            data = {'pn': page}
            html = self.session.download(article.url, encoding='utf-8', data=data,
                                         isJson=False, timeout=10, retry=3, addr=True)
            article_url = article.url
            soup = BeautifulSoup(html['html'], 'html.parser')
            try:
                main = soup.find('div', attrs={'class': "left_section"})
                main = main.find('div', attrs={'id': "j_p_postlist"})
            except Exception:
                self.logger.warn(u'Sorry, this post has been deleted. %s', article_url)
                return (commentList, False)
            sectionsite = main.find_all('div', attrs={'class': "l_post"})
            index = 0
            if main:
                com_all = main.find_all('div', attrs={'data-field': True})
                for i in sectionsite[2:]:
                    index = index + 1
                    if com_all[index].attrs['data-field']:
                        try:
                            data_field = i.attrs['data-field'].strip()
                        except Exception:
                            self.logger.error(u'data-field attribute not found')
                            self.logger.error(article_url)
                            continue
                        data_field = json.loads(data_field)
                        if 'content' in data_field.keys():
                            cid = data_field['content']['post_id']
                            user_id = data_field['author']['user_id']
                            user_name = data_field['author']['user_name']
                            # user_ip = ''
                            # ip_address = ''
                            # user_head = ''
                            if 'date' in data_field['content'].keys():
                                cpublish_datetime = data_field['content']['date']
                            else:
                                cpublish_datetime = i.findAll('span')
                                cpublish_datetime = cpublish_datetime[-1].text.strip()
                                if u'广告' in cpublish_datetime:  # skip advertisement floors
                                    continue
                            # Normalize to 'YYYY-MM-DD HH:MM:SS'
                            cpublish_datetime = re.findall(
                                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                                cpublish_datetime)[0]
                            if len(cpublish_datetime[4]) > 1:
                                cpublish_datetime = cpublish_datetime[0] + '-' + cpublish_datetime[1] + '-' + \
                                    cpublish_datetime[2] + ' ' + cpublish_datetime[3] + cpublish_datetime[4]
                            else:
                                cpublish_datetime = cpublish_datetime[0] + '-' + cpublish_datetime[1] + '-' + \
                                    cpublish_datetime[2] + ' ' + cpublish_datetime[3] + ':00'
                            # reply_userid = ''
                            # like_count =
                            # unlike_count = -1
                            # read_count = -1
                            reply_count = data_field['content']['comment_num']
                            source_url = article_url
                            content = i.find('cc').text.strip()
                            location_coutry = 'CN'
                            # channeltype = 'tieba'
                            # channel = self.site
                            # heat = 0
                            commentList.append(
                                Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                                        cpublish_datetime, None, location_coutry, None, None,
                                        user_id, user_name, content, None, None, None,
                                        reply_count, dislike_count=None))
            # Pagination control: only advance if the next page number is listed
            pages = soup.find('li', attrs={'class': "l_pager"}).find_all('a')
            pageList = []
            for p in pages:
                pa = p.text.strip()
                pageList.append(pa)
            if str(page + 1) in pageList:
                page += 1
            else:
                break
        return (commentList, False)
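# Usage sketch (illustrative only): driving BaiduTieBaCrawler end to end.
# _FakeChannel is a hypothetical stand-in for the project's real channel
# object; only the attributes this crawler actually reads (channel_id and
# search_ranges) are provided, and their values are placeholders.
class _FakeChannel(object):
    channel_id = 'tieba'
    search_ranges = 3  # search the last 3 days

def _example_tieba_crawl():
    import datetime
    crawler = BaiduTieBaCrawler(_FakeChannel())
    articles = crawler.searchArticle([u'keyword'], datetime.datetime.now())
    for article in articles:
        commentList, hasnext = crawler.crawlComment(article)
        print(article.url, len(commentList))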
class TianYaCrawler(object):
    '''
    Crawler for the Tianya forum (bbs.tianya.cn): searches posts by keyword
    through the Tianya search page and crawls post content, statistics and comments.
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.site = 'bbs.tianya.cn'
        self.url = 'http://search.tianya.cn/bbs?&s=4&f=0'
        self.luntanUrl_list = list()
        self.session = SessionCrawler(sleepRange=[3, 5])
        self.channel = channel
        self.count_page = 0
        self.pageList = []
        self.articleList = list()  # global list used to de-duplicate articles

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within a time range
        @param keywordList: list of keywords
        @param endTime: end of the search time range; the start is derived from
                        channel.search_ranges (in days)
        '''
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            startTime = endTime - datetime.timedelta(days=self.channel.search_ranges)
            (urllist, hasnext) = self.__searchByPage(keywordList, startTime, endTime, page)
            self.logger.error(len(urllist))
            # if len(urllist) == 0:
            #     self.logger.warn(u'Empty search result for keywords: %s', keywordList)
            #     break
            self.__parseSearchPage(urllist)  # crawl the posts behind the collected urls
            page += 1
        return self.articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; the statistics are written into the article instance
        '''
        html = self.session.download(article.url, encoding='utf-8', data=None,
                                     isJson=False, timeout=10, retry=3, addr=True)
        soup = BeautifulSoup(html['html'], 'lxml')
        try:
            Treply = soup.find('div', attrs={'class': "atl-info"}).find_all('span')[3].text.strip()  # total reply count
            Treply = re.sub(u'[\u4e00-\u9fa5]+:', '', Treply)
            Treply = int(Treply)
            Tclick = soup.find('div', attrs={'class': "atl-info"}).find_all('span')[2].text.strip()  # total click count
            Tclick = re.sub(u'[\u4e00-\u9fa5]+:', '', Tclick)
            Tclick = int(Tclick)
            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
        except Exception:
            self.logger.debug(u'No permission to access url: %s', article.url)

    # First collect a url list through the forum search page
    def __searchByPage(self, keywordList, startTime, endTime, page):
        data = {'q': keywordList, 's': '4', 'pn': page, 'f': '0'}
        html = self.session.download(self.url, encoding='utf-8', data=data,
                                     isJson=False, timeout=10, retry=3, addr=True)
        soup = BeautifulSoup(html['html'], 'html.parser')
        # The last li is the 'search_msg' entry, not a result, so drop it
        urls = soup.find('div', attrs={'class': "searchListOne"}).find_all('li')[:-1]
        self.logger.warn(len(urls))
        if self.count_page == 0:  # only executed once: collect the available page numbers
            self.count_page = 1
            pages = soup.find('div', attrs={'class': "long-pages"}).find_all('a')
            for p in pages:
                pa = p.text.strip()
                self.pageList.append(pa)
        if str(page + 1) in self.pageList:
            hasnext = True
        else:
            hasnext = False
        for i in urls:
            urlTime = i.find('p', attrs={'class': "source"}).find('span').text.strip()
            urlTime = time.strptime(urlTime, "%Y-%m-%d %H:%M:%S")
            self.logger.error(urlTime)
            Y, M, D, H, MIN = urlTime[0:5]
            urlTime2 = datetime.datetime(Y, M, D, H, MIN)
            self.logger.error(urlTime2)
            if urlTime2 >= startTime and urlTime2 <= endTime:
                self.logger.error(u'Time range matched')
                try:
                    url = i.find('h3').find('a').attrs['href']
                    if url not in self.luntanUrl_list:
                        self.luntanUrl_list.append(url)
                except Exception:
                    continue
            else:
                hasnext = False
                break
        return (self.luntanUrl_list, hasnext)

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics behind a url
        @return: an Article instance
        '''
        html = self.session.download(url, encoding='utf-8', data=None,
                                     isJson=False, timeout=10, retry=3, addr=True)
        article_url = html['url']
        self.logger.debug(article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')
        main = soup.find('div', attrs={'id': "bd"})
        # Q&A style posts, e.g. http://bbs.tianya.cn/post-730-5795-1-1.shtml
        main1 = soup.find('div', attrs={'class': "wd-question"})
        article = None
        if main:
            Ttitle = main.find('h1').find('span').text
            Ttime = main.find('div', attrs={'class': "atl-info"}).find_all('span')[1].text.strip()
            Ttime = re.sub(u'[\u4e00-\u9fa5]+:', '', Ttime)
            Tid = main.find('div', attrs={'class': "atl-info"}).find_all('span')[0].find('a').attrs['uid'].strip()
            Tauthor = main.find('div', attrs={'class': "atl-info"}).find_all('span')[0].find('a').attrs['uname'].strip()
            Tclick = main.find('div', attrs={'class': "atl-info"}).find_all('span')[2].text.strip()
            Tclick = re.sub(u'[\u4e00-\u9fa5]+:', '', Tclick)
            Tclick = int(Tclick)
            Treply = main.find('div', attrs={'class': "atl-info"}).find_all('span')[3].text.strip()
            Treply = re.sub(u'[\u4e00-\u9fa5]+:', '', Treply)
            Treply = int(Treply)
            Tlike = main.find('a', attrs={'class': "zantop"}).attrs['_count']
            Tcontent = main.find('div', attrs={'class': "bbs-content clearfix"}).text.strip()
            article = Article(Tid, self.channel.channel_id, Ttitle, Tcontent, Ttime,
                              url=article_url, author_id=None, author_name=Tauthor)
            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
            article.statistics.like_count = Tlike
        elif main1:
            Ttitle = main1.find('h1').find('span').text
            Ttime = main1.find('div').attrs['js_replytime']
            Ttime = re.findall(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', Ttime)[0]
            Tid = main1.find('div').attrs['_host']
            Tauthor = main1.find('div', attrs={'class': "q-info"}).find('a').text
            Tclick = main1.find('div').attrs['js_clickcount']
            Treply = main1.find('div').attrs['js_powerreply']
            Tcontent = main1.find('div', attrs={'class': "q-content atl-item"})
            if Tcontent:
                Tcontent = Tcontent.find('div', attrs={'class': "text"}).text.strip()
            else:
                Tcontent = ''
            article = Article(Tid, self.channel.channel_id, Ttitle, Tcontent, Ttime,
                              url=article_url, author_id=None, author_name=Tauthor)
            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
        return article

    # Visit the collected urls one by one
    def __parseSearchPage(self, urllist):
        self.logger.info(u'Visiting the collected urls, please wait')
        for url in urllist:
            article = self.crawlArticle(url)
            if article not in self.articleList and article is not None:
                self.articleList.append(article)

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article
        @return: a (commentList, hasnext) tuple; commentList is a list of
                 Comment instances, hasnext indicates whether more remain to crawl
        '''
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        html = self.session.download(article.url, encoding='utf-8', data=None,
                                     isJson=False, timeout=10, retry=3)
        article_url = article.url
        soup = BeautifulSoup(html, 'html.parser')
        comments = soup.find_all(
            lambda tag: tag.name == 'div' and tag.get('class') == ['atl-item'])
        for i in comments:
            cid = i.attrs['replyid']
            user_id = i.attrs['_hostid']
            user_name = i.attrs['_host']
            # user_head = i.find('div', attrs={'class': "atl-info"}).find('a').attrs['href']  # thread starter
            cpublish_datetime = i.attrs['js_restime']
            reply_userid = ''  # parent comment id
            like_count = i.find('a', attrs={'class': "zan"}).attrs['_count']
            # the link title matches the "insert comment" button text on the page
            reply_count = i.find('div', attrs={'class': "atl-reply"}).find('a', attrs={'title': "插入评论"}).text.strip()
            reply_count = re.findall(r'\d+', reply_count)
            if reply_count:
                reply_count = reply_count[0]
            else:
                reply_count = 0
            content = i.find('div', attrs={'class': "bbs-content"}).text.strip()
            location_coutry = 'CN'
            commentList.append(
                Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                        cpublish_datetime, None, location_coutry, None, None,
                        user_id, user_name, content, reply_userid, None, like_count,
                        reply_count, dislike_count=None))
        return (commentList, False)
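# Usage sketch (illustrative only): TianYaCrawler follows the same driver
# pattern as the Tieba example above. _FakeTianYaChannel is again a
# hypothetical stand-in for the project's channel object; only channel_id and
# search_ranges are needed here, and their values are placeholders.
class _FakeTianYaChannel(object):
    channel_id = 'tianya'
    search_ranges = 3  # search the last 3 days

def _example_tianya_crawl():
    import datetime
    crawler = TianYaCrawler(_FakeTianYaChannel())
    articles = crawler.searchArticle([u'keyword'], datetime.datetime.now())
    for article in articles:
        crawler.crawlStatistics(article)  # writes reply/click counts into article.statistics
        commentList, hasnext = crawler.crawlComment(article)
        print(article.url, len(commentList))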