class NewsComment(object):
    """Fetch one page of comments for a Sohu article and store each one.

    Two requests are made to the Sohu topic API: the first resolves the
    article to a ``topic_id``; the second loads the requested comment page
    for that topic.  Every comment is normalised into a flat dict and handed
    to ``self.mongo.put_comment``.
    """

    # Number of attempts made for each HTTP request before giving up.
    _MAX_ATTEMPTS = 5

    # Shared endpoint prefix; the API answers in JSONP for this callback name.
    _TOPIC_API = ('http://apiv2.sohu.com/api/topic/load'
                  '?callback=jQuery1124008187733188312629_1539945526218'
                  '&page_size=10')

    def __init__(self):
        self.mongo = MongoDB()

    @staticmethod
    def _parse_jsonp(payload):
        """Decode a JSONP body of the form ``callback({...});``.

        The JSON document is taken to be everything between the first ``(``
        and the last ``)``, so comment text that itself contains ``);`` does
        not truncate the payload.

        Raises:
            ValueError: if the body has no parenthesised payload or the
                payload is not valid JSON.
        """
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8')
        start = payload.find('(')
        end = payload.rfind(')')
        if start < 0 or end < start:
            raise ValueError('response is not a JSONP payload')
        return json.loads(payload[start + 1:end])

    def _fetch_json(self, url):
        """GET *url* and decode it; return None once every attempt failed."""
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return self._parse_jsonp(
                    requests.get(url, timeout=30).content)
            except Exception as e:
                # Deliberate best-effort boundary: network errors and
                # malformed bodies are both reported and retried.
                print("获取评论错误: %s" % (e,))
        return None

    @staticmethod
    def _build_comment(item, news_url):
        """Map one raw Sohu comment object onto the stored record layout."""
        comment_id = item["comment_id"]
        return {
            # comment text
            'ping_lun_nei_rong': item["content"],
            # comment creation time
            'ping_lun_shi_jian': item["create_time"],
            # reply count
            'hui_fu_shu': item["reply_count"],
            # like count
            'dian_zan_shu': item["support_count"],
            # comment id
            'ping_lun_id': comment_id,
            # user nickname (None when the passport has none)
            'yong_hu_ming': item[u'passport'].get(u'nickname'),
            # gender: not taken from this API
            'xing_bie': None,
            # user level: not taken from this API
            'yong_hu_deng_ji': None,
            # user location
            'yong_hu_sheng_fen': item["ip_location"],
            # crawl timestamp
            'do_time': time.time(),
            # crawled site name
            'zhan_dian': u'搜狐网',
            # primary key: comment id and article URL
            '_id': '%s|_|%s' % (comment_id, news_url),
        }

    def run(self, news_url, _id, page):
        """Store the comments found on page *page* of article *news_url*.

        Args:
            news_url: article URL; its last path segment must look like
                ``<source_id>_<media_id>``.  URLs ending in ``shtml`` are
                skipped.
            _id: value sent as ``topic_source_id`` in the topic lookup.
            page: page number sent as ``page_no`` in the comment request.

        Returns:
            None.  The method returns early when the URL is skipped, when a
            request keeps failing, or when no ``topic_id`` is available.

        Raises:
            IndexError: if the last path segment contains no ``_``.
        """
        if news_url.endswith('shtml'):
            # The ids below cannot be derived from this URL form.
            return

        tow_ids = news_url.split('/')[-1].split('_')
        media_id = tow_ids[1]
        source_id = tow_ids[0]

        lookup_url = (
            self._TOPIC_API +
            '&topic_source_id=%s&page_no=1&media_id=%s&source_id=mp_%s'
        ) % (_id, media_id, source_id)
        topic = self._fetch_json(lookup_url)
        if topic is None:
            return

        if u'topic_id' not in topic[u'jsonObject']:
            print("暂时无法获取topic_id")
            return
        topic_id = topic[u'jsonObject'][u'topic_id']

        page_url = (
            self._TOPIC_API +
            '&topic_id=%s&page_no=%s&media_id=%s&source_id=mp_%s'
        ) % (topic_id, page, media_id, source_id)
        listing = self._fetch_json(page_url)
        if listing is None:
            return

        for item in listing[u'jsonObject'][u'comments']:
            # A fresh dict per comment, so stored records never alias.
            self.mongo.put_comment(self._build_comment(item, news_url))
class MyCommentThread(threading.Thread):
    """Daemon worker that drains a queue of NetEase (163.com) articles.

    Each queue item is a ``(info, article_url)`` pair, where ``info`` is an
    object whose ``group(1)`` and ``group(2)`` give the comment sub-domain
    and the thread id used in the comment API URL.  The thread starts itself
    on construction.
    """

    # Comment-list endpoint: (sub-domain, thread id, offset).
    _COMMENT_API = (
        'http://comment.%s.163.com/api/v1/products/'
        'a2869674571f77b5a0867c3d71db5856/threads/%s/'
        'comments/newList?offset=%d&limit=30&showLevelThreshold=72'
        '&headLimit=1&tailLimit=2')

    # Comments requested per page (matches ``limit=30`` above).
    _PAGE_SIZE = 30

    # Number of attempts made for each HTTP request before giving up.
    _MAX_ATTEMPTS = 5

    def __init__(self, workqueue):
        threading.Thread.__init__(self)
        self.workQueue = workqueue
        self.mongodb = MongoDB()
        self.checkMongoDB = TempMongoDB()
        self.daemon = True
        # Start only after every attribute used by run()/working() exists.
        self.start()

    def run(self):
        """Process queue items until the queue is found empty."""
        try:
            from Queue import Empty  # Python 2
        except ImportError:
            from queue import Empty  # Python 3

        while True:
            # A non-blocking get avoids hanging forever when another worker
            # takes the last item between an empty() check and get().
            try:
                task = self.workQueue.get_nowait()
            except Empty:
                break

            try:
                info, wenzhang_Url = task
                sub_domain, thread_id = info.group(1), info.group(2)

                # The first page also reports how many more pages exist.
                first_url = self._COMMENT_API % (sub_domain, thread_id, 0)
                pages = self.working(wenzhang_Url, first_url, info)

                if pages and pages > 0:
                    for i in range(1, pages + 1):
                        page_url = self._COMMENT_API % (
                            sub_domain, thread_id, i * self._PAGE_SIZE)
                        self.working(wenzhang_Url, page_url, info)
            except Exception as e:
                # Keep the worker alive; one bad item must not stop the rest.
                print("ERROR: Locate in the CommentThread's run method 'while not Queue empty', exception: %s" % e)
                continue

    @staticmethod
    def _build_comment(comment_id, raw, content_url):
        """Map one raw NetEase comment object onto the stored record layout."""
        user = raw.get('user') or {}
        return {
            # comment text
            'ping_lun_nei_rong': raw['content'],
            # comment creation time
            'ping_lun_shi_jian': raw['createTime'],
            # reply count: not taken from this API
            'hui_fu_shu': None,
            # like count
            'dian_zan_shu': raw['vote'],
            # comment id
            'ping_lun_id': comment_id,
            # user nickname (None when absent)
            'yong_hu_ming': user.get('nickname'),
            # gender: not taken from this API
            'xing_bie': None,
            # user level: not taken from this API
            'yong_hu_deng_ji': None,
            # user location (None when absent, instead of aborting the page)
            'yong_hu_sheng_fen': user.get('location'),
            # crawl timestamp
            'do_time': time.time(),
            # crawled site name
            'zhan_dian': u'网易新闻',
            # primary key: comment id followed by the article URL
            '_id': comment_id + content_url,
        }

    def working(self, content_url, the_comment_url, info):
        """Download one comment page, store its comments, report page count.

        Args:
            content_url: article URL, used in record keys.
            the_comment_url: comment API URL to request.
            info: object whose ``group(1)`` is the comment sub-domain.

        Returns:
            ``newListSize // 30`` from the response, or None when the
            request kept failing or the response could not be processed.
        """
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'comment.%s.163.com' % (info.group(1)),
            'Referer': the_comment_url,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/59.0.3071.115 Safari/537.36'
        }

        json_data = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                json_data = json.loads(
                    requests.get(url=the_comment_url, headers=header,
                                 timeout=30).content)
                break
            except Exception as e:
                print("ERROR: Failed to get the comment's json, exception: %s" % e)
        else:
            # Every attempt failed.
            return None

        try:
            # Total comment count for the article, as reported by the API.
            ping_lun_shu = json_data['newListSize']
            # Floor division keeps the page count an int on Python 2 and 3.
            pages = ping_lun_shu // self._PAGE_SIZE

            do_time = None
            for comment_id, raw in json_data['comments'].items():
                comment_dict = self._build_comment(
                    comment_id, raw, content_url)
                do_time = comment_dict['do_time']
                self.mongodb.put_comment(comment_dict)

            if do_time is not None:
                # Record the article-level check entry once per page that
                # actually contained comments.
                self.checkMongoDB.put({
                    '_id': content_url,
                    'do_time': do_time,
                    'ping_lun_shu': ping_lun_shu,
                })
            return pages
        except Exception as e:
            print("ERROR: Locate in the CommentThread's working method for parsing json data, exception: %s,"
                  "and json data is %s" % (e, json_data))
            return None
class NewsComment(object):
    """Fetch one page of comments for a Xinhua article and store each one.

    The comment endpoint answers with a JavaScript assignment
    (``var commentJsonVarStr___={...};``); the JSON object is extracted from
    it and every entry of ``contentAll`` is handed to
    ``self.mongo.put_comment``.
    """

    # Number of attempts made for the HTTP request before giving up.
    _MAX_ATTEMPTS = 5

    # JavaScript assignment prefix wrapped around the JSON document.
    _JS_PREFIX = 'var commentJsonVarStr___='

    def __init__(self):
        self.mongo = MongoDB()

    @classmethod
    def _parse_js_assignment(cls, payload):
        """Strip the JavaScript wrapper from *payload* and decode the JSON.

        Surrounding whitespace and one optional trailing ``;`` are removed,
        so a trailing newline or a missing semicolon no longer corrupts the
        document.

        Raises:
            ValueError: if the remaining text is not valid JSON.
        """
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8')
        body = payload.replace(cls._JS_PREFIX, '').strip()
        if body.endswith(';'):
            body = body[:-1]
        return json.loads(body)

    def _fetch_json(self, url):
        """GET *url* and decode it; return None once every attempt failed."""
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return self._parse_js_assignment(
                    requests.get(url, timeout=30).content)
            except Exception as e:
                # Deliberate best-effort boundary: network errors and
                # malformed bodies are both reported and retried.
                print("获取评论错误: %s" % (e,))
        return None

    @staticmethod
    def _build_comment(item, news_url):
        """Map one raw Xinhua comment object onto the stored record layout."""
        # NOTE(review): the id is read from "userId", so two comments by the
        # same user on one article share a primary key -- confirm whether the
        # API exposes a per-comment id that should be used instead.
        ping_lun_id = item["userId"]
        return {
            # comment text
            'ping_lun_nei_rong': item["content"],
            # comment time
            'ping_lun_shi_jian': item["commentTime"],
            # reply count: not taken from this API
            'hui_fu_shu': None,
            # like count
            'dian_zan_shu': item["upAmount"],
            # comment id
            'ping_lun_id': ping_lun_id,
            # user nickname
            'yong_hu_ming': item["nickName"],
            # gender: not taken from this API
            'xing_bie': None,
            # user level: not taken from this API
            'yong_hu_deng_ji': None,
            # user location
            'yong_hu_sheng_fen': item["ipInfo"],
            # crawl timestamp
            'do_time': time.time(),
            # crawled site name
            'zhan_dian': u'新华网',
            # primary key: id followed by the article URL
            '_id': '%s%s' % (ping_lun_id, news_url),
        }

    def run(self, news_url, page):
        """Store the comments found on page *page* of article *news_url*.

        Args:
            news_url: article URL containing ``c_<news id>.htm``.
            page: integer page number sent as ``pid``.

        Returns:
            None.  Returns early when the request keeps failing.

        Raises:
            IndexError: if *news_url* contains neither ``c_`` nor ``.htm``.
        """
        # The dot is escaped so only a literal ".htm" acts as a separator.
        bu = re.split(r'c_|\.htm', news_url)[1]
        comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?&newsId=1-%s&pid=%d' % (
            bu, page)

        json_object = self._fetch_json(comment_url)
        if json_object is None:
            return

        for item in json_object['contentAll']:
            # A fresh dict per comment, so stored records never alias.
            self.mongo.put_comment(self._build_comment(item, news_url))
class NewsComment(object):
    """Fetch one page of comments for an ifeng article and store each one.

    The response body is treated as a JavaScript assignment
    (``var commentJsonVarStr___={...};``); the JSON object is extracted from
    it and every entry of ``comments`` is handed to
    ``self.mongo.put_comment``.
    """

    # Number of attempts made for the HTTP request before giving up.
    _MAX_ATTEMPTS = 3

    # JavaScript assignment prefix removed from the body before decoding.
    _JS_PREFIX = 'var commentJsonVarStr___='

    def __init__(self):
        self.mongo = MongoDB()

    @classmethod
    def _parse_js_assignment(cls, payload):
        """Strip the JavaScript wrapper from *payload* and decode the JSON.

        Surrounding whitespace and one optional trailing ``;`` are removed,
        so a trailing newline or a missing semicolon no longer corrupts the
        document.

        Raises:
            ValueError: if the remaining text is not valid JSON.
        """
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8')
        body = payload.replace(cls._JS_PREFIX, '').strip()
        if body.endswith(';'):
            body = body[:-1]
        return json.loads(body)

    def _fetch_json(self, url):
        """GET *url* and decode it; return None once every attempt failed."""
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return self._parse_js_assignment(
                    requests.get(url, timeout=30).content)
            except Exception as e:
                # Deliberate best-effort boundary: network errors and
                # malformed bodies are both reported and retried.
                print("获取评论错误: %s" % (e,))
        return None

    @staticmethod
    def _build_comment(item, news_url):
        """Map one raw ifeng comment object onto the stored record layout."""
        ping_lun_id = item["comment_id"]
        return {
            # comment text
            'ping_lun_nei_rong': item["comment_contents"],
            # comment creation time
            'ping_lun_shi_jian': item["create_time"],
            # reply count: not taken from this API
            'hui_fu_shu': None,
            # like count: not taken from this API
            'dian_zan_shu': None,
            # comment id
            'ping_lun_id': ping_lun_id,
            # user name
            'yong_hu_ming': item["uname"],
            # gender: not taken from this API
            'xing_bie': None,
            # user level: not taken from this API
            'yong_hu_deng_ji': None,
            # user location
            'yong_hu_sheng_fen': item["ip_from"],
            # crawl timestamp
            'do_time': time.time(),
            # crawled site name
            'zhan_dian': u'凤凰网',
            # primary key: comment id followed by the article URL; formatting
            # (rather than ``+``) also accepts a numeric comment id
            '_id': '%s%s' % (ping_lun_id, news_url),
        }

    def run(self, news_url, page):
        """Store the comments found on page *page* of article *news_url*.

        Args:
            news_url: article URL, sent as the ``docUrl`` query parameter.
            page: integer page number sent as ``p``.

        Returns:
            None.  Returns early when the request keeps failing.
        """
        # NOTE(review): news_url is interpolated without percent-encoding; an
        # article URL containing '&' or '#' would alter the query string.
        comment_url = 'http://comment.ifeng.com/get.php?docUrl=%s&format=js&job=1&p=%d&pageSize=20' % (
            news_url, page)

        json_object = self._fetch_json(comment_url)
        if json_object is None:
            return

        for item in json_object['comments']:
            # A fresh dict per comment, so stored records never alias.
            self.mongo.put_comment(self._build_comment(item, news_url))