def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    commentList = list()
    comment_url = 'http://120.55.151.61/Treehole/V4/Message/getDetail.action'
    data = ('platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.2.0'
            '&phoneModel=BLN-AL10&channel=huaweiMarket&plateId=1&messageId='
            + str(article.tid) + '&')
    data_comment = self.s.post(url=comment_url, data=data, headers=self.headers)
    # The response body is plain JSON; parse it directly instead of eval()-ing it.
    data_dict0 = json.loads(data_comment.text)
    try:
        comment_list = data_dict0['data']['commentListBO']['commentBOs']
    except:
        return (commentList, False)
    if len(comment_list) == 0:
        return (commentList, False)
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for comment in comment_list:
        cid = comment['commentId']
        commentTimeStamp = comment['commentTime'] / 1000  # milliseconds -> seconds
        datetime_struct = datetime.datetime.fromtimestamp(commentTimeStamp)
        commentTime = datetime_struct.strftime('%Y-%m-%d %H:%M:%S')
        commentContent = comment['content']
        user_id = comment['student']['id']
        user_name = comment['student']['nickName']
        location_country = 'CN'
        commentList.append(
            Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                    commentTime, None, location_country, None, None,  # ip_address not implemented yet
                    user_id, user_name, commentContent,
                    None, None, None, None, dislike_count=None))
    return (commentList, False)
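Every crawlComment variant in this section builds Comment instances with the same argument order, positionally or by keyword. As a reading aid, the sketch below shows the constructor signature inferred from these call sites; the actual class is defined elsewhere in the repository, and the field names here are assumptions, not the project's real definition.

# Hypothetical sketch of the Comment constructor as these call sites appear to use it;
# the real definition lives elsewhere in the repository and may differ.
class Comment(object):
    def __init__(self, tid, channel_id, cid, add_datetime, publish_datetime,
                 ip_address, location_country, location_region, location_city,
                 author_id, author_name, content, reply_author_id=None,
                 read_count=None, like_count=None, reply_count=None,
                 dislike_count=None):
        self.tid = tid                            # id of the article this comment belongs to
        self.channel_id = channel_id              # id of the channel (site) being crawled
        self.cid = cid                            # comment id on the source site
        self.add_datetime = add_datetime          # when the crawler stored the comment
        self.publish_datetime = publish_datetime  # when the comment was published
        self.ip_address = ip_address
        self.location_country = location_country
        self.location_region = location_region
        self.location_city = location_city
        self.author_id = author_id
        self.author_name = author_name
        self.content = content
        self.reply_author_id = reply_author_id
        self.read_count = read_count
        self.like_count = like_count
        self.reply_count = reply_count
        self.dislike_count = dislike_count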
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    html = self.session.download(article.url, encoding='utf-8', data=False,
                                 timeout=10, retry=3, addr=True)
    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    commentList = list()
    page = 1
    self.logger.info(article.url)
    try:
        while page < 30:
            data = {
                'channel': channel,
                'newsid': newsid,
                'group': '',
                'compress': '1',
                'ie': 'gbk',
                'oe': 'gbk',
                'page': page,
                'page_size': '20'
            }
            re_url = 'http://comment5.news.sina.com.cn/page/info'
            html1 = self.session.download(url=re_url, encoding='utf-8', data=data,
                                          timeout=10, retry=3, addr=True)
            html1 = html1["html"]
            # The response is a "var xxx={...}" payload; strip the assignment prefix before parsing.
            html1 = re.sub(r'(.*=)\{', '{', html1)
            html1 = json.loads(html1)
            totalcount = html1['result']['count']['show']
            if totalcount == 0:
                break
            cmntlist = html1["result"]["cmntlist"]
            for i in cmntlist:
                cid = i["mid"]
                user_id = i["uid"]
                user_name = i["nick"]
                user_ip = i["ip"]
                publish_datetime = i["time"]
                like_count = i["agree"]
                content = i["content"]
                commentList.append(Comment(article.tid, self.channel.channel_id, cid,
                                           add_datetime, publish_datetime, user_ip,
                                           None, None, None, user_id, user_name,
                                           content, None, None, like_count, None, None))
            totalpage = math.ceil(totalcount / 20.0)
            if totalpage < page:
                break
            page = page + 1
    except:
        self.logger.error('Fail to parse comment: %s', traceback.format_exc())
    finally:
        return (commentList, False)
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    #self.logger.debug('Article:%s', article)
    comment_url = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid='
    comment_url = comment_url + article.meta_info
    self.logger.error(comment_url)
    html = self.session.get(comment_url)
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    commentList = list()
    main_comment = json.loads(html)['result']['cmntlist']
    if len(main_comment) == 0:
        return (commentList, False)
    for comment in main_comment:
        cid = comment['mid']
        publish_datetime = comment['time']
        user_id = comment['uid']
        user_name = comment['nick']
        content = comment['content']
        # Split the free-form area string into region/city with jieba.
        location = ','.join(jieba.cut(comment['area']))
        location_region = location[:location.find(',')]
        location_city = location[location.find(',') + 1:]
        location_country = 'CN'
        commentList.append(
            Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                    publish_datetime, None, location_country, location_region,
                    location_city, user_id, user_name, content,
                    None, None, None, None, dislike_count=None))
    return (commentList, False)  # During testing, article[0][222].content returns the content of the 222nd comment
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    #self.logger.debug('Article:%s', article)
    html = self.session.get(article.url, headers=self.headers)
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    commentList = list()
    main_comment = BeautifulSoup(html, 'lxml')
    try:
        commentList_html = main_comment.find('ul', attrs={'id': "comments"})
        commentList_html_li = commentList_html.findAll('li')
    except:
        self.logger.error(u'The page has been deleted or uses a different layout: %s', article.url)
        return (commentList, False)
    for comment in commentList_html_li:
        cid = comment.attrs['data-cid']
        comment_main_wrap = comment.find('div', attrs={'class': "reply-doc content"})
        user_id = comment_main_wrap.find('div', attrs={'class': "operation_div"}).attrs['id']
        user_info_wrap = comment_main_wrap.find('div', attrs={'class': "bg-img-green"})
        user_name = user_info_wrap.find('a').text.strip()
        publish_datetime = user_info_wrap.find('span').text.strip()
        content = comment_main_wrap.find('p').text.strip()
        reply_user_wrap = comment_main_wrap.find('div', attrs={'class': "reply-quote"})
        if reply_user_wrap:
            reply_userid_wrap = reply_user_wrap.find('span', attrs={'class': "pubdate"})
            reply_userid = reply_userid_wrap.find('a').attrs['href']
            reply_userid = re.sub(r'\D', "", reply_userid)
        else:
            reply_userid = ''
        # like_count_wrap = comment_main_wrap.find('div', attrs={'class': "operation_div"})
        # like_count = like_count_wrap.findAll('a')[1].text
        # like_count = re.sub(r'\D', "", like_count)
        # The like count is not part of the page itself, so it is hard to obtain.
        like_count = None
        reply_count = None
        location_country = 'CN'
        commentList.append(
            Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                    publish_datetime, None, location_country, None, None,
                    user_id, user_name, content, reply_userid,
                    None, like_count, reply_count, dislike_count=None))
    return (commentList, False)  # During testing, article[0][222].content returns the content of the 222nd comment
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    if self.nextCommentUrl is None:
        curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
        curl = curl.replace('#', '%')
    else:
        curl = self.nextCommentUrl
    self.session.randomSleep()
    result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
    jo = json.loads(result)
    paging = jo['paging']
    hasnext = not paging['is_end']
    self.nextCommentUrl = paging['next']
    dataList = jo['data']
    add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    commentList = list()
    for data in dataList:
        #self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
        #self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(data['updated_time']))
        comment = Comment(article.tid, article.channel_id, data['id'],
                          add_datetime, publish_datetime,
                          ip_address=None, location_country=None,
                          location_region=None, location_city=None,
                          author_id=data['author']['id'],
                          author_name=data['author']['name'],
                          content=data['content'],
                          reply_author_id=None, read_count=None,
                          like_count=data['voteup_count'],
                          reply_count=data['comment_count'],
                          dislike_count=None)
        commentList.append(comment)
    return (commentList, hasnext)
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    # return (list(), False)
    commentList = list()
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.com',
        'Referer': '%s' % article.url[:article.url.find('?') + 1],
        'refer_flag': '1001030103_&type=comment',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
    }
    data1 = {'ajwvr': '6', 'id': '%s' % article.tid, 'from': 'singleWeiBo'}
    re_url = 'https://weibo.com/aj/v6/comment/big'
    html = self.session.get(re_url, params=data1, headers=headers,
                            cookies=self.__load_cookies_from_lwp(self.user_name))
    jo = json.loads(html)
    data = jo['data']['html']
    # self.logger.error(data)
    soup = BeautifulSoup(data, 'lxml')
    if soup.find('div', {'class': "WB_empty"}) is None:
        commentbox = soup.find('div', {'class': "list_box"})
        root_commentlist = commentbox.find('div', {'node-type': "comment_list"})
        root_commentlist2 = root_commentlist.find_all('div', {'node-type': "root_comment"})
        count_comment = 1
        self.logger.debug('root_commentlist:%d %s', len(root_commentlist2), article.url)
        for root_comment in root_commentlist2:
            if count_comment > 20:
                break  # only crawl the first 20 comments
            self.logger.debug('count_comment:%d', count_comment)
            comment_id = root_comment.attrs['comment_id']  # id of the top-level comment
            list_con = root_comment.find('div', {'class': "list_con"})
            firstcomentwrap = list_con.find('div', {'class': "WB_text"})
            firstcoment = firstcomentwrap.text.strip()  # content of the top-level comment
            useridwrap = firstcomentwrap.find('a')
            user_id = useridwrap.attrs['usercard']  # id of the commenter
            user_id = re.findall(r'id=(\d+)', user_id)[0]
            user_name = useridwrap.text  # name of the commenter
            if user_name is None:
                user_name = ' '
            publish_timediv = list_con.find('div', {'class': "WB_func clearfix"})
            try:
                publish_time = self.parseDateTime(
                    publish_timediv.findAll('div')[1].text)  # publish time of the comment
            except:
                continue
            # self.logger.error(publish_time)
            # if publish_time < datetime.datetime.now() - datetime.timedelta(hours=48):
            #     break
            like_count_div = publish_timediv.findAll('div')[0]
            try:
                like_count_li = like_count_div.findAll('li')[3]
                like_count = like_count_li.findAll('em')[1].text
            except:
                like_count = 0
            if u'赞' == like_count:
                like_count = 0
            commentList.append(
                Comment(article.tid, self.channel.channel_id, comment_id,
                        add_datetime, publish_time, None, None, None, None,
                        user_id, user_name, firstcoment,
                        None, None, like_count, None, dislike_count=None))
            count_comment += 1  # comment counter
    return (commentList, False)
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    :param article: the article whose comments should be crawled
    :return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    try:
        commentList = list()
        count = 0
        if article.url.find('question') != -1:
            if self.nextCommentUrl is None:
                curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
                curl = curl.replace('#', '%')
            else:
                curl = self.nextCommentUrl
            self.lastCrawlTime = time.time()
            self.randomSleep()
            result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
            jo = json.loads(result.text)
            paging = jo['paging']
            hasnext = not paging['is_end']
            self.nextCommentUrl = paging['next']
            dataList = jo['data']
            add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            for data in dataList:
                publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(data['updated_time']))
                dr = re.compile(r'<[^>]+>', re.S)  # strip HTML tags from the content
                content = r"'%s'" % (str(dr.sub('', data['content'])))
                comment = Comment(article.tid, article.channel_id, data['id'],
                                  add_datetime, publish_datetime,
                                  ip_address=None, location_country=None,
                                  location_region=None, location_city=None,
                                  author_id=data['author']['id'],
                                  author_name=data['author']['name'],
                                  content=content,
                                  reply_author_id=None, read_count=None,
                                  like_count=data['voteup_count'],
                                  reply_count=data['comment_count'],
                                  dislike_count=None)
                count += 1
                commentList.append(comment)
                if count > 500:
                    hasnext = False
                    break
            return (commentList, hasnext)
        if article.url.find('zhuanlan') != -1:
            offset = 0
            comment_url = ('https://zhuanlan.zhihu.com/api/posts/%s/comments?limit=10&offset=%d'
                           % (article.tid, offset))
            self.lastCrawlTime = time.time()
            self.randomSleep()
            response = self.session.get(comment_url, headers=CRAWL_COMMENT_HEADERS)
            if response.text == '[]' or count > 500:
                return (list(), False)
            dataList = json.loads(response.text)
            add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            for data in dataList:
                publish_datetime = str(data['createdTime'])[0:19].replace('T', ' ')
                content = r"'%s'" % str(data['content'])
                comment = Comment(article.tid, article.channel_id, data['id'],
                                  add_datetime, publish_datetime,
                                  ip_address=None, location_country=None,
                                  location_region=None, location_city=None,
                                  author_id=data['author']['uid'],
                                  author_name=data['author']['name'],
                                  content=content,
                                  reply_author_id=None, read_count=None,
                                  like_count=data['likesCount'],
                                  reply_count=None,
                                  dislike_count=data['dislikesCount'])
                commentList.append(comment)
                count += 1
                if count > 500:
                    break
                offset += 10
            return (commentList, False)
    except:
        self.logger.debug(traceback.format_exc())
        # Do not return None here: sacrawler.py would raise "NoneType has no len()".
        return (list(), False)
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    commentList = list()
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    page = 1
    while page <= 30:
        data = {'pn': page}
        html = self.session.download(article.url, encoding='utf-8', data=data,
                                     isJson=False, timeout=10, retry=3, addr=True)
        article_url = article.url
        soup = BeautifulSoup(html['html'])
        try:
            main = soup.find('div', attrs={'class': "left_section"})
            main = main.find('div', attrs={'id': "j_p_postlist"})
        except:
            self.logger.warn(u'Sorry, this post has been deleted. %s', article_url)
            return (commentList, False)
        sectionsite = main.find_all('div', attrs={'class': "l_post"})
        index = 0
        if main:
            com_all = main.find_all('div', attrs={'data-field': True})
            for i in sectionsite[2:]:
                index = index + 1
                if com_all[index].attrs['data-field']:
                    try:
                        data_field = i.attrs['data-field'].strip()
                    except:
                        self.logger.error(u'data-field attribute not found')
                        self.logger.error(article_url)
                        continue
                    data_field = json.loads(data_field)
                    if 'content' in data_field.keys():
                        cid = data_field['content']['post_id']
                        user_id = data_field['author']['user_id']
                        user_name = data_field['author']['user_name']
                        if 'date' in data_field['content'].keys():
                            cpublish_datetime = data_field['content']['date']
                        else:
                            cpublish_datetime = i.findAll('span')
                            cpublish_datetime = cpublish_datetime[-1].text.strip()
                            if u'广告' in cpublish_datetime:
                                continue
                            # Normalize the scraped timestamp into "YYYY-MM-DD HH:MM:SS";
                            # the fifth regex group captures the optional seconds.
                            cpublish_datetime = re.findall(
                                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                                cpublish_datetime)[0]
                            if len(cpublish_datetime[4]) > 1:
                                cpublish_datetime = (cpublish_datetime[0] + '-' + cpublish_datetime[1] +
                                                     '-' + cpublish_datetime[2] + ' ' + cpublish_datetime[3])
                            else:
                                cpublish_datetime = (cpublish_datetime[0] + '-' + cpublish_datetime[1] +
                                                     '-' + cpublish_datetime[2] + ' ' +
                                                     cpublish_datetime[3] + ':00')
                        reply_count = data_field['content']['comment_num']
                        source_url = article_url
                        content = i.find('cc').text.strip()
                        location_country = 'CN'
                        commentList.append(
                            Comment(article.tid, self.channel.channel_id, cid,
                                    add_datetime, cpublish_datetime, None,
                                    location_country, None, None,
                                    user_id, user_name, content,
                                    None, None, None, reply_count, dislike_count=None))
        # Pagination control.
        pages = soup.find('li', attrs={'class': "l_pager"}).find_all('a')
        pageList = []
        for p in pages:
            pa = p.text.strip()
            pageList.append(pa)
        if str(page + 1) in pageList:
            page += 1
        else:
            break
    return (commentList, False)
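The Tieba fallback above rebuilds the timestamp from a regex whose last group only captures the optional ":SS" part, while the fourth group already contains the full time. A small standalone check of that normalization is sketched below; the sample input strings are assumptions for illustration and may not match Tieba's exact rendering.

import re

# Same pattern as in the Tieba crawlComment fallback above.
PATTERN = r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)'

for raw in ['2017-04-19 21:35', '2017-04-19 21:35:40']:
    g = re.findall(PATTERN, raw)[0]
    # g[3] already includes the seconds when present; g[4] is just the ":SS" suffix.
    if len(g[4]) > 1:
        normalized = g[0] + '-' + g[1] + '-' + g[2] + ' ' + g[3]
    else:
        normalized = g[0] + '-' + g[1] + '-' + g[2] + ' ' + g[3] + ':00'
    print normalized  # -> 2017-04-19 21:35:00 / 2017-04-19 21:35:40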
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
    html = self.session.download(article.url, encoding='gbk', data=None, isJson=False,
                                 timeout=10, retry=3, addr=True, cookies=cookies)
    article_url = article.url
    soup = BeautifulSoup(html['html'], 'html.parser')
    try:
        sid = soup.find('meta', attrs={'name': "contentid"}).attrs['content']
    except:
        return (list(), False)
    sid = re.sub(r'\D', '', sid)
    # Build the forum (BBS) thread URL from the article's content id.
    bbs = 'http://bbs1.people.com.cn/postLink.do?nid=' + sid
    commentList = list()
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    html1 = self.session.download(bbs, encoding='gbk', data=None, isJson=False,
                                  timeout=10, retry=3)
    soup1 = BeautifulSoup(html1, 'html.parser')
    id = soup1.find('meta', attrs={'name': "contentid"})
    if id:
        id = id.attrs['content']
        id = re.sub(r'\D', '', id)
        re_url = 'http://bbs1.people.com.cn/api/postApi.do'
        page = 1
        while page < 30:
            data1 = {
                'action': 'postDetailByParentId',
                'replayPostId': id,
                'pageNo': page
            }
            html2 = self.session.download(re_url, encoding='utf-8', data=data1,
                                          isJson=False, timeout=10, retry=3)
            # Unescape the embedded JSON array before parsing.
            html2 = re.sub(r'\\\\\\', '', html2)
            html2 = re.sub(r'"\[\\"', '[', html2)
            html2 = re.sub(r'\\"\]"', ']', html2)
            html2 = re.sub(r'\\",\\"', ',', html2)
            html2 = json.loads(html2)
            totalCount = html2['totalCount']
            if totalCount == 0:
                break
            replayPosts = html2['replayPosts']
            if replayPosts:
                for i in replayPosts:
                    cid = i['id']
                    user_id = i['userId']
                    user_name = i['userNick']
                    user_ip = i['userIP']
                    # ip_address lookup is not implemented yet
                    user_head = ''
                    publish_datetime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(float(i['createTime']) / 1000))
                    reply_userid = i['parentId']
                    like_count = i['vote_yes']
                    unlike_count = i['vote_no']
                    read_count = i['readCount']
                    reply_count = i['replyCount']
                    source_url = article_url
                    content = i['contentText']
                    heat = 0
                    location_country = 'CN'
                    commentList.append(
                        Comment(article.tid, self.channel.channel_id, cid,
                                add_datetime, publish_datetime, user_ip,
                                location_country, None, None,
                                user_id, user_name, content, reply_userid,
                                None, like_count, reply_count, dislike_count=None))
                pageCount = html2['pageCount']  # total number of comment pages
                if pageCount == page:
                    break
                page = page + 1  # move to the next comment page
            else:
                break
    return (commentList, False)
def crawlComment(self, article):
    # Crawl the comments of the given article.
    meta_info = article.meta_info
    if meta_info is None:
        return (list(), False)
    jo = json.loads(meta_info)
    if "commentid" not in jo:
        return (list(), False)
    commentid = jo["commentid"]
    cookies = 'pac_uid=0_58ec8106620c1; gj_mpvid=80515918; ad_play_index=97; dsp_cookiemapping0=1492586667155; pgv_info=ssid=s9259450720; ts_last=news.qq.com/a/20170415/002007.htm; ts_refer=www.baidu.com/link; pgv_pvid=1281052383; ts_uid=1143064466; ptag=www_baidu_com|'
    re_url = 'http://coral.qq.com/article/' + commentid + '/comment'
    commentList = list()
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    data1 = {
        'commentid': self.lastCommentId if self.lastCommentId is not None else '0',
        'reqnum': '50'
    }
    html = self.session.download(re_url, encoding='utf-8', cookies=cookies,
                                 data=data1, timeout=10, retry=3)
    jo = json.loads(html)
    if jo['errCode'] != 0:
        return ([], False)
    if jo['data']['retnum'] == 0:
        return ([], False)
    self.lastCommentId = jo['data']['last']
    for i in jo['data']['commentid']:
        cid = i['id']
        user_id = i['userinfo']['userid']
        user_name = i['userinfo']['nick']
        user_ip = ''
        # The region field is colon-separated (country:province:city); strip common suffixes first.
        location = i['userinfo']['region'].replace(u'市', '').replace(u'自治', '').replace(
            u'新区', '').replace(u'区', '').replace(u'洲', '')
        location_list = location.split(':')
        location_country = location_list[0]
        if len(location_list) > 1:
            location_region = location_list[1]
        else:
            location_region = ''
        if len(location_list) > 2:
            location_city = location_list[2]
        else:
            location_city = ''
        #user_head = i['userinfo']['head']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(i['time']))
        reply_userid = str(i['replyuserid'])
        like_count = i['up']
        reply_count = i['rep']
        content = i['content']
        commentList.append(
            Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                    publish_datetime, user_ip, location_country, location_region,
                    location_city, user_id, user_name, content, reply_userid,
                    None, like_count, reply_count, None))
    return (commentList, jo['data']['hasnext'])
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    #self.logger.debug('Article:%s', article)
    html = self.session.download(article.url, encoding='utf-8', data=None,
                                 timeout=10, retry=3, addr=True, isJson=False)
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    commentList = list()
    page = 1
    while page < 30:
        doc_url = re.findall(r'"commentUrl":"(.*)",', html['html'])
        if doc_url:
            doc_url = doc_url[0]
        else:
            doc_url = article.url
        data1 = {
            'callback': 'newCommentListCallBack',
            'orderby': '',
            'docUrl': doc_url,
            'job': '1',
            'p': page
        }
        re_url = 'http://comment.ifeng.com/get.php'
        html1 = self.session.download(re_url, encoding='gbk', data=data1,
                                      timeout=10, retry=3, isJson=True)
        totalcount = html1['count']  # total number of comments
        if totalcount == 0:
            break
        comments = html1['comments']
        if comments:
            for comment in comments:
                cid = comment['comment_id']
                user_id = comment['user_id']
                user_name = comment['uname']
                user_ip = comment['client_ip']
                # get_ip_address() did not return a value, so fall back to the API's ip_from field.
                try:
                    ip_address = comment['ip_from']
                except:
                    ip_address = None
                user_head = comment['user_url']
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(comment['create_time'])))
                reply_userid = comment['parent']  # the comment this one replies to
                if reply_userid:
                    reply_userid = comment['parent'][0]['user_id']
                else:
                    reply_userid = ''
                like_count = comment['uptimes']
                unlike_count = None
                read_count = None
                reply_count = None
                source_url = article.url
                content = comment['comment_contents']
                heat = 0
                location_country = 'CN'
                if ip_address is None:
                    commentList.append(
                        Comment(article.tid, self.channel.channel_id, cid,
                                add_datetime, publish_datetime, user_ip,
                                None, None, None,  # ip_address not implemented yet
                                user_id, user_name, content, reply_userid,
                                None, like_count, reply_count, dislike_count=None))
                else:
                    try:
                        location_region = ip_address[:ip_address.find(u'省') + 1]
                        location_city = ip_address[ip_address.find(u'省') + 1:]
                    except:
                        location_region = None
                        location_city = None
                    commentList.append(
                        Comment(article.tid, self.channel.channel_id, cid,
                                add_datetime, publish_datetime, user_ip,
                                location_country, location_region, location_city,
                                user_id, user_name, content, reply_userid,
                                None, like_count, reply_count, dislike_count=None))
            page = page + 1
            totalpage = math.ceil(totalcount / 20.0)  # total pages, rounded up (20 comments per page)
            if totalpage < page:
                break
        else:
            break
    return (commentList, False)  # During testing, article[0][222].content returns the content of the 222nd comment
def crawlComment(self, article):
    '''
    :param article: an Article instance
    :return: (commentList, hasnext) tuple; commentList holds the article's comments,
             hasnext indicates whether crawling should continue
    '''
    html = self.session.get(article.url, headers=self.headers, verify=False)
    soup = BeautifulSoup(html, 'html.parser')
    commentList = list()
    add_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        div = soup.find('div', attrs={'class': 'normal-comment-list'})
        divs = div.findAll('div', attrs={'class': 'comment'})
    except:
        self.logger.error(u'The page has been deleted: %s', article.url)
        return (commentList, False)
    for c in divs:
        pre_cid = c.attrs['id']
        cid = re.sub('\D', '', pre_cid)  # comment id
        user_id = None
        name_div = c.find('div', attrs={'class': 'info'})
        user_name = name_div.find('a').text  # commenter's name
        pre_time = name_div.find('div', attrs={'class': 'meta'})
        publish_time = pre_time.find('span').text.strip()  # comment time
        publish_time = publish_time[-16:]
        pre_content = c.find('div', attrs={'class': 'comment-wrap'})
        content = pre_content.find('p').text
        reply_user_wrap = c.find('div', attrs={'class': 'sub-comment-list'})
        if reply_user_wrap:
            pre_reply_user_id = reply_user_wrap.find('div', attrs={'class': 'sub-comment'})
            # Note: this strips digits from the sub-comment's markup; extracting a
            # specific attribute would be more precise.
            reply_user_id = re.sub('\D', '', str(pre_reply_user_id))
        else:
            reply_user_id = ' '
        like_count = None
        reply_count = None
        location_country = 'CN'
        commentList.append(
            Comment(article.tid, self.channel.channel_id, cid, add_time,
                    publish_time, None, location_country, None, None,
                    user_id, user_name, content, reply_user_id,
                    None, like_count, reply_count, dislike_count=None))
    return (commentList, False)  # During testing, article[0][222].content returns the content of the 222nd comment
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
    if len(article.tid) != 16:
        articleid = article.tid[3:len(article.tid) - 2]
    else:
        articleid = article.tid
    re_url = ('http://comment.news.163.com/api/v1/products/'
              'a2869674571f77b5a0867c3d71db5856/threads/' + articleid + '/comments/newList')
    commentList = list()
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    page = 0
    while page < 750:
        data1 = {
            'offset': page,
            'limit': 30,
            'showLevelThreshold': 72,
            'headLimit': 1,
            'tailLimit': 2,
            'ibc': 'newspc'
        }
        try:
            html1 = self.session.download(url=re_url, encoding='utf-8', cookies=cookies,
                                          data=data1, timeout=10, retry=3)
            html1 = json.loads(html1)
            totalcount = html1["newListSize"]
            if totalcount == 0:
                break
            for i in html1['comments'].itervalues():
                cid = i['commentId']
                user_id = i['user']['userId']
                if user_id == 0:
                    user_name = ''
                else:
                    user_name = i['user']['nickname']
                user_ip = ''
                # Normalize the free-form location string into colon-separated parts.
                location = i['user']['location'].replace(u'市', ':').replace(u'自治', ':').replace(
                    u'新区', ':').replace(u'区', ':').replace(u'洲', ':')
                location_list = location.split(':')
                location_country = location_list[0]
                if len(location_list) > 1:
                    location_region = location_list[1]
                else:
                    location_region = ''
                if len(location_list) > 2:
                    location_city = location_list[2]
                else:
                    location_city = ''
                publish_datetime = i['createTime']
                like_count = i['vote']
                unlike_count = i['against']
                dr = re.compile(r'<[^>]+>', re.S)  # strip HTML tags from the content
                content = dr.sub('', i['content'])
                commentList.append(
                    Comment(articleid, self.channel.channel_id, cid, add_datetime,
                            publish_datetime, user_ip, location_country,
                            location_region, location_city, user_id, user_name,
                            content, None, None, like_count, None, unlike_count))
            if page > int(totalcount):
                break
            page = page + 30
        except:
            self.logger.error('[NeteaseComment]url:' + article.url + ', tid:' +
                              article.tid + ', %s', traceback.format_exc())
            return (commentList, False)
    return (commentList, False)
def crawlComment(self, article):
    '''
    Crawl the comments of the given article and return the comment list.
    @return: (commentList, hasnext) tuple; commentList is a list of Comment instances,
             hasnext indicates whether there are more comments to crawl
    '''
    commentList = list()
    add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    html = self.session.download(article.url, encoding='utf-8', data=None,
                                 isJson=False, timeout=10, retry=3)
    article_url = article.url
    soup = BeautifulSoup(html, 'html.parser')
    comments = soup.find_all(
        lambda tag: tag.name == 'div' and tag.get('class') == ['atl-item'])
    for i in comments:
        cid = i.attrs['replyid']
        user_id = i.attrs['_hostid']
        user_name = i.attrs['_host']
        # user_head = i.find('div', attrs={'class': "atl-info"}).find('a').attrs['href']  # thread starter's name
        cpublish_datetime = i.attrs['js_restime']
        reply_userid = ''  # id of the parent comment
        like_count = i.find('a', attrs={'class': "zan"}).attrs['_count']
        reply_count = i.find('div', attrs={'class': "atl-reply"}).find(
            'a', attrs={'title': "插入评论"}).text.strip()
        reply_count = re.findall(r'\d+', reply_count)
        if reply_count:
            reply_count = reply_count[0]
        else:
            reply_count = 0
        content = i.find('div', attrs={'class': "bbs-content"}).text.strip()
        location_country = 'CN'
        commentList.append(
            Comment(article.tid, self.channel.channel_id, cid, add_datetime,
                    cpublish_datetime, None, location_country, None, None,
                    user_id, user_name, content, reply_userid,
                    None, like_count, reply_count, dislike_count=None))
    return (commentList, False)
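All of the variants honor the same (commentList, hasnext) contract described in their docstrings: hasnext tells the caller to keep invoking crawlComment for further pages (only the Zhihu and Tencent variants ever return True). Below is a minimal driver sketch of how that contract could be consumed; crawler, article, and save_comments are hypothetical stand-ins, not names defined in the code above.

# Minimal sketch of driving the (commentList, hasnext) contract.
# `crawler`, `article`, and `save_comments` are hypothetical placeholders.
def crawl_all_comments(crawler, article, save_comments, max_rounds=50):
    rounds = 0
    hasnext = True
    while hasnext and rounds < max_rounds:
        commentList, hasnext = crawler.crawlComment(article)
        save_comments(commentList)  # persist this batch of Comment instances
        rounds += 1                 # guard against endless pagination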