def get_Index(self):
    url_to_get_index1 = self.urls[0]
    for page in range(1, 900):  # renamed from i: the inner loop used to shadow this counter
        response1 = get_response_and_text(url=url_to_get_index1 + str(page) + '&size=20',
                                          headers=self.headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        datajson = json.loads(response_in_function_text)
        this_url_index_list = []  # collected per page so view counts can be fetched in one batch request
        for article in datajson:
            url_index = 'https://m.sohu.com/a/' + str(article['id']) + '_' + str(article['authorId'])
            publish_time = int(article['publicTime']) / 1000  # millisecond timestamp -> seconds
            time_format = '%Y-%m-%d %H:%M:%S'
            publish_time_stamp_9 = time.localtime(float(publish_time))
            publish_time = time.strftime(time_format, publish_time_stamp_9)
            data_index = {
                'publish_user': article['authorName'],
                'title': article['title'],
                'publish_time': publish_time,
                'id': article['id'],
                'url': url_index,
                'cmsid': article['cmsId'],
                'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            this_url_index_list.append(data_index)
        # the view count lives behind a separate batch endpoint: one request per index page
        viewernum_url = 'https://v2.sohu.com/public-api/articles/pv?articleIds=' + \
            ','.join(str(item['id']) for item in this_url_index_list)
        viewernum_info = requests.get(url=viewernum_url, headers=self.headers)
        viewernum_info_json = json.loads(viewernum_info.text)
        for data_index_no_viewer in this_url_index_list:
            noviewer_id = data_index_no_viewer['id']
            print noviewer_id
            data_index_no_viewer['read_count'] = viewernum_info_json[str(noviewer_id)]
        self.content_data_list = self.content_data_list + this_url_index_list
        time.sleep(1)
    self.global_status_num_index = 0
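# Every function in this section calls a shared helper, get_response_and_text,
# that is defined elsewhere in the project. Below is a minimal sketch of the
# contract it is assumed to satisfy (hypothetical implementation; the real
# helper likely adds retries, proxies, and the needupdate/update_info hooks
# used further down). urllib2 is used because call sites read .code/.msg/.url:
import urllib2

def get_response_and_text(url, headers=None, needupdate=False, update_info=None):
    # needupdate/update_info are accepted but ignored in this sketch
    request = urllib2.Request(url, headers=headers or {})
    response = urllib2.urlopen(request, timeout=10)
    return {
        'response_in_function': response,             # raw response (.url, .code, .msg)
        'response_in_function_text': response.read()  # body as a byte string
    }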
def get_comment_comment(data1):  # comments can have nested replies; named data1 to avoid shadowing the caller's data
    id = data1['id']
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    try:
        comment_url = 'http://www.toutiao.com/api/comment/get_reply/?comment_id=' + str(
            id) + '&item_id=' + str(id) + '&offset=0&count=20'
        response1 = get_response_and_text(url=comment_url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        datajson = json.loads(response_in_function_text)
    except Exception as e:
        print e
        return []  # request or parse failed: nothing to report
    reply_nodes = []
    for one_comment in datajson['data']['data']:
        content = one_comment['text']
        like_count = one_comment['digg_count']
        publish_time = one_comment['create_time']
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        publish_user_id = one_comment['user']['user_id']
        publish_user = one_comment['user']['screen_name']
        publish_user_photo = one_comment['user']['avatar_url']
        id = one_comment['id']
        try:
            ancestor_id = data1['ancestor_id']
        except Exception as e:
            print e
            ancestor_id = 'wrong'
        parent_id = data1['id']
        thisnode = {
            'publish_user': publish_user,
            'content': content,
            'like_count': like_count,
            'publish_time': publish_time,
            'publish_user_id': publish_user_id,
            'publish_user_photo': publish_user_photo,
            'id': id,
            'ancestor_id': ancestor_id,
            'parent_id': parent_id,
            # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        reply_nodes.append(thisnode)
    return reply_nodes
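# The create_time -> '%Y-%m-%d %H:%M:%S' conversion above recurs throughout the
# section, sometimes from second-resolution and sometimes from millisecond-
# resolution timestamps. Hypothetical helpers the functions here could share:
def format_epoch_seconds(ts):
    # unix timestamp in seconds -> local-time string
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(ts)))

def format_epoch_millis(ts):
    # Sohu endpoints return milliseconds (see get_Index above)
    return format_epoch_seconds(int(ts) / 1000)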
def get_content_inside_next_page(data):
    url = data['nexturl']
    content = data['content']
    img_urls = data['img_urls']
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    content1 = ''
    for i in datasoup.select(
            'body > div.scrollBox.mt10 > div.article > div.art_co.sau > p'):
        content1 += i.text
    content += content1
    # added 8-3: image scraping
    Re_find_img_url = re.compile(r'src=".*?"/\>')
    content_part_data = datasoup.select('div.article')
    if content_part_data:
        data_find_by_re = Re_find_img_url.findall(str(content_part_data[0]))
        img_urls2 = []
        for url_img_re in data_find_by_re:
            img_urls2.append(url_img_re.split('"')[1])
        for url_without_http in img_urls2:
            if 'http' not in url_without_http:
                url_without_http = 'http:' + url_without_http  # protocol-relative src
            img_urls.append(url_without_http)
    next_page_selector = datasoup.select(
        'body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5')
    if next_page_selector:
        next_page_html = next_page_selector[0].get('href')
        if next_page_html and len(next_page_html) > 3:
            next_url = 'http://m.xilu.com' + next_page_html
            # recurse until there is no next page, accumulating content and images
            content_result = get_content_inside_next_page({
                'content': content,
                'nexturl': next_url,
                'img_urls': img_urls
            })
            return content_result
    # no (valid) next page: return what we have instead of falling off the end
    return {'content': content, 'img_urls': img_urls}
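# get_content_inside_next_page recurses once per page, so a very long article
# risks Python's recursion limit. A loop-based sketch of the same walk, under
# the assumption that the selectors above stay valid (hypothetical function,
# not part of the original crawler; it collects text only):
def walk_article_pages(start_url, headers):
    content, img_urls, url = '', [], start_url
    while url:
        resp = get_response_and_text(url=url, headers=headers)
        soup = BeautifulSoup(resp['response_in_function_text'], 'lxml')
        for p in soup.select('body > div.scrollBox.mt10 > div.article > div.art_co.sau > p'):
            content += p.text
        nxt = soup.select('body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5')
        href = nxt[0].get('href') if nxt else None
        url = 'http://m.xilu.com' + href if href and len(href) > 3 else None
    return {'content': content, 'img_urls': img_urls}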
def get_content_inside(data):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    url_for_debug = data['url']
    response1 = get_response_and_text(url=url_for_debug, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    img_urls = []
    content = ''
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    if datasoup.select('#articleContent > div.display-content > p'):
        for i in datasoup.select('#articleContent > div.display-content > p'):
            content += i.text
    else:
        for i in datasoup.select('#articleContent > div.display-content'):
            content += i.text
    try:
        content_data = str(datasoup.select('#articleContent')[0])
    except Exception as e:
        print e
        return
    Re_find_img = re.compile(r'src=".*?"')
    imgs_find_by_re = Re_find_img.findall(content_data)
    for img_url in imgs_find_by_re:
        img_url = img_url.split('"')[1]
        if 'http' not in img_url:
            img_url = 'https:' + img_url  # protocol-relative src
        img_urls.append(img_url)
    data['content'] = content
    data['img_urls'] = img_urls
    data['reply_nodes'] = []
    data['spider_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    self.comments_url_list.append(data)
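# The "'http' not in img_url" test above only patches protocol-relative URLs
# ('//img...') and misfires on srcs that merely contain 'http' elsewhere. The
# standard library's urlparse.urljoin resolves relative, absolute, and
# protocol-relative srcs in one call -- a sketch:
import urlparse

def absolutize(page_url, src):
    # urljoin('https://example.com/a', '//cdn.example.com/1.jpg')
    #   -> 'https://cdn.example.com/1.jpg'
    return urlparse.urljoin(page_url, src)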
def get_Index(self):
    while True:
        for url_to_get_index in self.urls:
            try:
                response1 = get_response_and_text(url=url_to_get_index)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1['response_in_function_text']
                response_text = response_in_function_text.decode('utf-8')
                datajson = json.loads(response_text)
                datajson_index_data = datajson['data']
                for one_index in datajson_index_data:
                    try:
                        title = one_index['title']
                    except:
                        title = ''
                    try:
                        reply_count = int(one_index['comments_count'])
                    except:
                        reply_count = 0
                    url = 'http://www.toutiao.com' + one_index['source_url']
                    try:
                        publish_user = one_index['source']  # publisher
                    except:
                        publish_user = ''
                    try:
                        publish_user_photo = one_index['media_avatar_url']
                        if 'http' not in publish_user_photo:
                            publish_user_photo = 'http:' + publish_user_photo
                    except:
                        publish_user_photo = ''
                    try:
                        video_id = one_index['video_id']
                    except:
                        video_id = None
                    try:
                        is_ad = one_index['label']
                    except:
                        is_ad = False
                    if video_id:
                        continue  # skip video posts
                    if is_ad == u'广告':
                        continue  # skip ads
                    id = one_index['group_id']
                    dict1 = {
                        'id': id,
                        'url': url,
                        'reply_count': reply_count,
                        'title': title,
                        'publish_user': publish_user,
                        'publish_user_photo': publish_user_photo,
                        'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    self.content_data_list.append(dict1)
            except Exception as e:
                pass  # a bad index page must not kill the polling loop
def get_comment_inside(data):
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    while True:  # retry until the request succeeds
        try:
            print data
            comment_url = 'http://www.toutiao.com/api/comment/list/?group_id=' + str(
                data['id']) + '&item_id=' + str(
                    data['item_id']) + '&offset=0&count=20'
            response1 = get_response_and_text(url=comment_url)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            break
        except Exception as e:
            print e, 'mark1'
            if 'item_id' in str(e):
                message = {'msg': e.message}
                logger_toutiao.log(msg=message, level=logging.WARNING)
    comments_data = []
    try:
        data_json = json.loads(response_in_function_text)
    except Exception as e:
        print e
        return  # unparseable response: nothing to collect
    for one_comment in data_json['data']['comments']:
        content = one_comment['text']
        like_count = one_comment['digg_count']
        publish_time = one_comment['create_time']
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        publish_user_photo = one_comment['user']['avatar_url']
        publish_user_id = one_comment['user']['user_id']
        publish_user = one_comment['user']['name']  # changed 8-17
        id = one_comment['id']
        reply_count = one_comment['reply_count']
        parent_id = data['id']
        ancestor_id = data['id']
        if reply_count > 0:
            reply_nodes = get_comment_comment({
                'id': id,
                'ancestor_id': data['id']
            })
        else:
            reply_nodes = []
        thisnode = {
            'content': content,
            'like_count': like_count,
            'publish_time': publish_time,
            'publish_user_photo': publish_user_photo,
            'publish_user_id': publish_user_id,
            'publish_user': publish_user,
            'id': id,
            'reply_count': reply_count,
            'reply_nodes': reply_nodes,
            'parent_id': parent_id,
            'ancestor_id': ancestor_id,
            # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        comments_data.append(thisnode)
    # the API returns at most 20 comments, so no pagination here or for nested replies
    data['reply_nodes'] = comments_data
    while len(self.result_list) > 600:
        time.sleep(1)
        print 'waiting for result_list to drain below 600'
    self.result_list.append(data)
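# The len(self.result_list) > 600 busy-wait above implements backpressure by
# polling with sleep(1). Python 2's Queue module gives the same bounded-buffer
# behaviour with blocking calls and no polling loop -- a sketch, assuming the
# consumer thread is adapted to read from the queue:
import Queue

result_queue = Queue.Queue(maxsize=600)  # put() blocks once 600 items are waiting

# producer thread:  result_queue.put(data)     # blocks instead of polling
# consumer thread:  data = result_queue.get()  # blocks until data is ready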
def get_content_in_wenda_comments_more(id_replynodes, data=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    try:
        if not id_replynodes['next_comment_url']:
            url_comments_more = 'https://www.wukong.com/wenda/web/question/loadmorev1/?count=10&qid=' + \
                id_replynodes['id'] + '&offset=10&req_type=1'
            response1 = get_response_and_text(url=url_comments_more, headers=headers)
        else:
            response1 = get_response_and_text(
                url=id_replynodes['next_comment_url'], headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
    except Exception as e:
        print e
        return id_replynodes['reply_nodes']  # request failed: return what we have
    datajson = json.loads(response_in_function_text)
    for one_comment in datajson['data']['ans_list']:
        datasoup_content = BeautifulSoup(one_comment['content'], 'lxml')
        content = datasoup_content.text
        img_urls = []
        Re_find_img = re.compile(r'src=".*?"')
        img_urls_find_by_re = Re_find_img.findall(one_comment['content'])
        for img_url in img_urls_find_by_re:
            img_urls.append(img_url.split('"')[1])
        like_count = one_comment['digg_count']
        id = one_comment['ansid']
        publish_time = one_comment['create_time']  # unix timestamp
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        reply_count = one_comment['comment_count']
        publish_user_photo = one_comment['user']['avatar_url']
        publish_user = one_comment['user']['uname']
        publish_user_id = one_comment['user']['user_id']
        reply_nodes = get_content_in_wenda_comments_comments({
            'id': id,
            'reply_nodes': [],
            'next_comment_url': None
        })
        parent_id = id_replynodes['id']
        ancestor_id = data['id']
        this_node = {
            'publish_time': publish_time,
            'content': content,
            'like_count': like_count,
            'id': id,
            'reply_count': reply_count,
            'publish_user_photo': publish_user_photo,
            'publish_user': publish_user,
            'publish_user_id': publish_user_id,
            'reply_nodes': reply_nodes,
            'ancestor_id': ancestor_id,
            'parent_id': parent_id,
            # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        id_replynodes['reply_nodes'].append(this_node)
    if datajson['data']['has_more']:
        url_offset = response_in_function.url.split('&offset=')
        offset = int(url_offset[1].split('&')[0]) + 10
        url = url_offset[0] + '&offset=' + str(offset)
        id_replynodes['next_comment_url'] = url
        # pass data through the recursion, otherwise ancestor_id lookups fail on later pages
        reply_nodes2 = get_content_in_wenda_comments_more(id_replynodes, data)
        return reply_nodes2
    else:
        return id_replynodes['reply_nodes']
def get_content_in_wenda_comments_comments(id_replynodes, data=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    try:
        if not id_replynodes['next_comment_url']:
            url_comments_more = 'https://www.wukong.com/wenda/web/comment/brow/?ansid=' + \
                id_replynodes['id'] + '&count=10&offset=0'
            response1 = get_response_and_text(url=url_comments_more, headers=headers)
        else:
            response1 = get_response_and_text(
                url=id_replynodes['next_comment_url'], headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
    except Exception as e:
        print e
        return id_replynodes['reply_nodes']  # request failed: return what we have
    datajson_comment2 = json.loads(response_in_function_text)
    if 'comments' not in datajson_comment2:
        print 'no comments field in response'
        return id_replynodes['reply_nodes']
    for comment2 in datajson_comment2['comments']:
        id = comment2['comment_id']
        like_count = comment2['digg_count']
        content = comment2['content']
        publish_user_id = comment2['user_info']['user_id']
        publish_user = comment2['user_info']['uname']
        publish_user_photo = comment2['user_info']['avatar_url']
        publish_time = comment2['create_time']
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        # callers do not always supply ancestor_id; fall back to the answer id
        ancestor_id = id_replynodes.get('ancestor_id', id_replynodes['id'])
        parent_id = id_replynodes['id']
        thisnode = {
            'id': id,
            'like_count': like_count,
            'content': content,
            'publish_user_id': publish_user_id,
            'publish_user': publish_user,
            'publish_user_photo': publish_user_photo,
            'publish_time': publish_time,
            'parent_id': parent_id,
            'ancestor_id': ancestor_id,
            # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        id_replynodes['reply_nodes'].append(thisnode)
    if datajson_comment2['has_more']:
        url_offset = response_in_function.url.split('&offset=')
        offset = int(url_offset[1].split('&')[0]) + 10
        url = url_offset[0] + '&offset=' + str(offset)
        id_replynodes['next_comment_url'] = url
        reply_nodes2 = get_content_in_wenda_comments_comments(id_replynodes)
        return reply_nodes2
    else:
        return id_replynodes['reply_nodes']
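# Both wenda walkers above advance pagination by splitting the previous request
# URL on '&offset=' and adding 10. The same string surgery as a shared helper
# (hypothetical name):
def bump_offset(url, step=10):
    base, rest = url.split('&offset=', 1)
    offset = int(rest.split('&')[0]) + step
    return base + '&offset=' + str(offset)

# bump_offset('https://host/api?count=10&offset=10&req_type=1')
#   -> 'https://host/api?count=10&offset=20'
# note: like the original code, anything after the offset parameter is dropped,
# which happens to be harmless for these endpoints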
def get_content_inside(data):
    url = data['url']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    real_url = response_in_function.url
    if 'toutiao' not in real_url:
        logger_toutiao.log(level=logging.WARNING,
                           msg='toutiao was not in thisurl---------' + real_url)
        return
    elif 'http://www.toutiao.com/api/pc/subject/' in real_url:
        logger_toutiao.log(
            level=logging.WARNING,
            msg='http://www.toutiao.com/api/pc/subject/ was in thisurl----------' + real_url)
        return
    else:
        url = real_url
    # the page category is embedded in an inline script as chineseTag: '...'
    Re_find_chineseTag = re.compile(r"chineseTag: '.*?'")
    chineseTag = Re_find_chineseTag.findall(response_in_function_text)
    if chineseTag:
        try:
            chineseTag = chineseTag[0].split("'")[1]
            if chineseTag == '图片':
                content_time_img = get_content_picture({
                    'response_in_function': response_in_function,
                    'response_in_function_text': response_in_function_text
                })
            elif chineseTag == '问答':
                # Q&A pages are handled entirely by get_content_wenda
                get_content_wenda(htmldata={
                    'response_in_function': response_in_function,
                    'response_in_function_text': response_in_function_text,
                    'data': data
                }, data=data)
                return
            else:
                content_time_img = get_content_news({
                    'response_in_function': response_in_function,
                    'response_in_function_text': response_in_function_text
                })
        except Exception as e:
            print e, 'error while dispatching on the picture/Q&A/news category'
            logger_toutiao.log(level=logging.WARNING,
                               msg={
                                   'where': 'failed while locating the category section',
                                   'content': e.message
                               })
    else:
        logger_toutiao.log(level=logging.WARNING, msg=chineseTag)
        print chineseTag
        return
    # non-Q&A pages continue here and need an item id for the comment API
    Re_find_itmeId = re.compile(r'itemId: \'.*?\'')  # regular articles
    Re_find_itme_Id = re.compile(r'item_id:\'.*?\'')  # picture galleries
    if Re_find_itmeId.findall(response_in_function_text):
        try:
            item_id = Re_find_itmeId.findall(response_in_function_text)[0].split("'")[1]
        except Exception as e:
            logger_toutiao.log(level=logging.WARNING,
                               msg={
                                   'where': 'itemId was found by the regex but split failed',
                                   'content': Re_find_itmeId.findall(response_in_function_text)[0]
                               })
            print e, 'itemId was found by the regex but split failed'
    else:
        try:
            item_id = Re_find_itme_Id.findall(response_in_function_text)[0].split("'")[1]
        except Exception as e:
            print e, 'no item_id found (picture page)'
            msg = {
                'errormsg': e.message + ' no item_id found (picture page)',
                'htmldata': response_in_function_text,
                'url': response_in_function.url,
                'code': response_in_function.code,
                'msg': response_in_function.msg
            }
            logger_toutiao.log(level=logging.WARNING, msg=msg)
            return
    try:
        data['img_urls'] = content_time_img['img_urls']
        data['content'] = content_time_img['content']
        if len(content_time_img['publish_time']) < 12:
            # date-only string, pad to a full datetime
            data['publish_time'] = content_time_img['publish_time'] + ' 00:00:00'
        else:
            data['publish_time'] = content_time_img['publish_time']
        data['item_id'] = item_id
        data['reply_nodes'] = []
    except Exception as e:
        print e, 'error while assembling the data record'
    self.comments_url_list.append(data)
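# A quick illustration of the chineseTag extraction above against a made-up
# fragment of the inline script markup (the sample string is hypothetical;
# real pages embed it inside a larger JS object):
def demo_chinese_tag():
    sample = "var pageData = { chineseTag: '图片', itemId: '6451234567890' };"
    tag = re.compile(r"chineseTag: '.*?'").findall(sample)[0].split("'")[1]
    return tag  # '图片' -> the page would be routed to get_content_picture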
def get_comment_inside(data):
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    comment_url = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cysYw3AKM&page_size=30&hot_size=10&topic_source_id=' + \
        data['id']
    response1 = get_response_and_text(url=comment_url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    comments_data = []
    data_json = json.loads(response_in_function_text)
    reply_count_outside = data_json['cmt_sum']
    if data_json['comments']:
        data_json_comments = data_json['comments']
        for someone_comment in data_json_comments:
            content = someone_comment['content']
            id = someone_comment['comment_id']
            publish_user_photo = someone_comment['passport']['img_url']
            publish_user = someone_comment['passport']['nickname']
            publish_user_id = someone_comment['passport']['user_id']
            create_time = someone_comment['create_time']  # publish_time
            spider_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            parent_id = data['id']
            ancestor_id = data['id']
            comments = someone_comment['comments']
            reply_count = someone_comment['reply_count']
            like_count = someone_comment['support_count']
            dislike_count = someone_comment['oppose_count']
            if comments:
                # a nested comment replies to the first comment in the thread
                parent_id = comments[0]['comment_id']
            thiscomments = {
                'content': content,
                'id': id,
                'publish_user_photo': publish_user_photo,
                'publish_user': publish_user,
                'publish_user_id': publish_user_id,
                'create_time': create_time,
                'spider_time': spider_time,
                'parent_id': parent_id,
                'ancestor_id': ancestor_id,
                'reply_count': reply_count,
                'like_count': like_count,
                'dislike_count': dislike_count
            }
            comments_data.append(thiscomments)
    data['reply_nodes'] = comments_data
    data['reply_count'] = reply_count_outside
    while len(self.result_list) > 600:
        time.sleep(1)
        print 'waiting for result_list to drain below 600'
    self.result_list.append(data)
def get_content_inside(data):
    # runs in a worker thread (thread safety matters, hence no shared session);
    # fetches the first page of an article, while the similarly named
    # get_content_inside_next_page walks the remaining pages
    url = data['url']
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    Re_find_isimgpage = re.compile(r'\<ul class\=\"piclist\"\>')
    to_charge_is_picture = Re_find_isimgpage.findall(response_in_function_text)
    if to_charge_is_picture:
        # picture page: hand off to the picture handler
        print to_charge_is_picture, 'is in deal_picture and the url is ---', data['url']
        content_and_img_urls = get_content_inside_picture(datasoup)
        img_urls = content_and_img_urls['img_urls']
        contentall = content_and_img_urls['content']
    else:
        content = ''
        img_urls = []
        for i in datasoup.select(
                'body > div.scrollBox.mt10 > div.article > div.art_co.sau > p'):
            content += i.text
        # added 8-3: image scraping
        Re_find_img_url = re.compile(r'src=".*?"/\>')
        content_part_data = datasoup.select('div.article')
        if content_part_data:
            data_find_by_re = Re_find_img_url.findall(str(content_part_data[0]))
            for url_img_re in data_find_by_re:
                img_urls.append(url_img_re.split('"')[1])
        next_page_selector = datasoup.select(
            'body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5')
        contentall = content  # keep page-one content even if the next link is unusable
        if next_page_selector:
            next_page_html = next_page_selector[0].get('href')
            if next_page_html and len(next_page_html) > 3:
                next_url = 'http://m.xilu.com' + next_page_html
                data['url'] = next_url
                content_and_img_urls2 = get_content_inside_next_page({
                    'content': content,
                    'nexturl': next_url,
                    'img_urls': img_urls
                })
                contentall = content_and_img_urls2['content']
                # the returned list already contains this page's images,
                # so replace rather than re-append (the old copy loop duplicated every entry)
                img_urls = content_and_img_urls2['img_urls']
    publish_time = data['publish_time']
    if publish_time == u'刚刚':
        publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    elif u'小时前' in publish_time:
        time_pass = int(publish_time.replace(u'小时前', ''))
        publish_time = (datetime.datetime.now() -
                        timedelta(hours=time_pass)).strftime('%Y-%m-%d %H:%M:%S')
    elif u'分钟前' in publish_time:
        time_pass = int(publish_time.replace(u'分钟前', ''))
        publish_time = (datetime.datetime.now() -
                        timedelta(minutes=time_pass)).strftime('%Y-%m-%d %H:%M:%S')
    elif '-' in publish_time and len(publish_time) == 5:
        # 'MM-DD' style dates lack a year; assume the current crawl year
        publish_time = '2017-' + publish_time + ' 00:00:00'
    data['content'] = contentall
    data['img_urls'] = img_urls
    data['publish_time'] = publish_time
    data['url'] = url  # restore the original URL after paging overwrote it
    while len(self.comments_url_list) > 600:  # back off if the downstream queue grows too long
        time.sleep(1)
        print 'waiting for comments_url_list to drain below 600'
    self.comments_url_list.append(data)
def get_content_inside(data):
    # without dedup this crawl would never stop; first-pass implementation only
    url = data['url']
    page_num = url.split('/')[-1]
    response1 = get_response_and_text(url=url,
                                      needupdate=True,
                                      update_info={'page_num': page_num})
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    Re_find_sid = re.compile(r'sid=".*?"')  # non-greedy: stop at the first closing quote
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except Exception as e:
        print e
        return
    if ('class="swiper-container"' not in response_in_function_text) and (
            'class="content"' in response_in_function_text):
        # text news page
        sid = Re_find_sid.findall(response_in_function_text)[0].split('"')[1]
        data['sid'] = sid
        title = ''
        publish_time = ''
        publish_user = ''
        for i in datasoup.select('body > div.content > div.neirong > h2'):
            title = i.text
        for j in datasoup.select(
                'body > div.content > div.neirong > p > span:nth-of-type(4)'):
            publish_time = j.text
        for k in datasoup.select(
                'body > div.content > div.neirong > p > span:nth-of-type(3)'):
            publish_user = k.text.replace(' ', '').replace('\t', '').replace(
                '\n', '').replace('\r', '').replace(u'来源:', '')
        content = ''
        for l in datasoup.select('body > div.content > div.neirong > article > p'):
            content += l.text
        img_urls = []
        neirong_content = str(datasoup.select('body > div.content > div.neirong'))
        Re_find_img_url = re.compile(r'src=".*?"')
        img_find_by_re = Re_find_img_url.findall(neirong_content)
        for i in img_find_by_re:
            img_urls.append(i.split('"')[1])
        try:
            publish_time += ':00'
        except Exception as e:
            print e
        data['title'] = title
        data['content'] = content
        data['publish_time'] = publish_time
        data['publish_user'] = publish_user
        data['reply_nodes'] = []
        data['spider_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        data['img_urls'] = img_urls
    elif 'class="swiper-container"' in response_in_function_text:
        # picture news page
        content = ''
        img_urls = []
        for title_for in datasoup.select('body > div.content > h2'):
            title = title_for.text
        for publish_time_for in datasoup.select(
                'body > div.content > p.jieshao > span:nth-of-type(4)'):
            publish_time = publish_time_for.text + ':00'
        for publish_user_for in datasoup.select(
                'body > div.content > p.jieshao > span:nth-of-type(3) > a'):
            publish_user = publish_user_for.text.replace(' ', '').replace(
                '\t', '').replace('\n', '').replace('\r', '').replace(u'来源:', '')
        for content_for in datasoup.select('body > div.content > p.zongjie'):
            content += content_for.text
        for img_url in datasoup.select(
                'div.swiper-container > div.swiper-wrapper > div.swiper-slide > div.imgdiv > img'):
            img_urls.append(img_url.get('src'))
        try:
            data['title'] = title
            data['content'] = content
            data['publish_time'] = publish_time
            data['publish_user'] = publish_user
            data['reply_nodes'] = []
            data['spider_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            data['img_urls'] = img_urls
        except Exception as e:
            print e
            return
    else:
        print url, '-----not in text or picture deal module'
        return
    while len(self.comments_url_list) > LEN_COMMENT_LIST:
        time.sleep(1)
    print data
    self.comments_url_list.append(data)
def get_comment_inside(data):
    # two-stage design: the first request resolves the topic id,
    # later requests page through the comments with it
    topicid = None
    cmspage_totalnum = 1
    comments_data = []
    cmspagenum = 1  # extra counter
    request_num = 1
    while True:
        if not topicid:
            comment_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx&page_size=30&hot_size=5&topic_source_id='
            comment_url = comment_url_without_id + data['sid']
        else:
            comment_url = 'http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx&page_size=30&topic_id=' + str(
                topicid) + '&page_no=' + str(request_num)
        response1 = get_response_and_text(url=comment_url)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        try:
            data_json = json.loads(response_in_function_text)
        except Exception as e:
            print e
            return
        if data_json['comments']:
            data_json_comments = data_json['comments']
            cmspage_totalnum = data_json['cmt_sum']
            topicid = data_json['topic_id']
            for someone_comment in data_json_comments:
                content = someone_comment['content']
                id = someone_comment['comment_id']
                publish_user_photo = someone_comment['passport']['img_url']
                publish_user = someone_comment['passport']['nickname']
                publish_user_id = someone_comment['passport']['user_id']
                create_time = someone_comment['create_time']  # millisecond timestamp
                create_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(int(create_time) / 1000))
                spider_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                like_count = someone_comment['support_count']
                parent_id = data['id']  # TODO: what should these two fields really hold?
                ancestor_id = data['id']
                this_comments = someone_comment['comments']
                if this_comments:
                    # 8-16: nested comment threads of this type could be flattened with a stack
                    parent_id = this_comments[0]['comment_id']
                cmspagenum += 1
                thiscomments = {
                    'content': content,
                    'id': id,
                    'publish_user_photo': publish_user_photo,
                    'publish_user': publish_user,
                    'publish_user_id': publish_user_id,
                    'publish_time': create_time,
                    'spider_time': spider_time,
                    'like_count': like_count,
                    'parent_id': parent_id,
                    'ancestor_id': ancestor_id,
                }
                comments_data.append(thiscomments)
        if cmspagenum >= cmspage_totalnum - 1:
            break
        request_num += 1
    data['reply_nodes'] = comments_data
    data['reply_count'] = cmspage_totalnum
    while len(self.result_list) > 600:
        time.sleep(1)
        print 'waiting for result_list to drain below 600'
    # final cleanup: drop the field that was only needed to fetch comments
    del data['sid']
    self.result_list.append(data)
def get_comment_inside(data):
    # pagination: incrementing cmspagenum is all the next-page request needs
    topicid = None
    cmspage_totalnum = 0
    comments_data = []
    cmspagenum = 1
    while True:
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
        }
        if not topicid:
            comment_url = 'https://apiv2.sohu.com/api/topic/load?page_size=10&topic_source_id=' + \
                str(data['cmsid']) + '&page_no=10'
        else:
            comment_url = 'https://apiv2.sohu.com/api/comment/list?page_size=10&topic_id=' + str(
                topicid) + '&page_no=' + str(cmspagenum)
        response1 = get_response_and_text(url=comment_url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        data_json = json.loads(response_in_function_text)
        if cmspagenum == 1:
            try:
                cmspage_totalnum = data_json['jsonObject']['cmt_sum']
                data['reply_count'] = cmspage_totalnum
            except:
                try:
                    cmspage_totalnum = data_json['jsonObject']['outer_cmt_sum']
                except:
                    cmspage_totalnum = 0  # this response variant carries no count at all
        for one_comment in data_json['jsonObject']['comments']:
            id = one_comment['comment_id']
            content = one_comment['content']
            url = response_in_function.url
            publish_time = int(one_comment['create_time']) / 1000  # ms -> s
            time_format = '%Y-%m-%d %H:%M:%S'
            publish_time_stamp_9 = time.localtime(float(publish_time))
            publish_time = time.strftime(time_format, publish_time_stamp_9)
            publish_user_id = one_comment['user_id']
            like_count = one_comment['support_count']
            reply_count = one_comment['reply_count']
            try:
                publish_user = one_comment['passport']['nickname']
            except Exception as e:
                publish_user = '******'
            publish_user_photo = one_comment['passport']['img_url']
            ancestor_id = data['id']
            parent_id = data['id']
            if one_comment['comments']:
                parent_id = one_comment['comments'][0]['comment_id']
            thisnode = {
                'id': id,
                'content': content,
                'url': url,
                'publish_time': publish_time,
                'publish_user_id': publish_user_id,
                'like_count': like_count,
                'reply_count': reply_count,
                'publish_user': publish_user,
                'publish_user_photo': publish_user_photo,
                'ancestor_id': ancestor_id,
                'parent_id': parent_id
            }
            comments_data.append(thisnode)
        cmspagenum += 1
        if cmspagenum <= int(cmspage_totalnum / 10) + 1:  # 10 results per page
            if not topicid:
                topicid = data_json['jsonObject']['topic_id']
        else:
            data['reply_nodes'] = comments_data
            del data['cmsid']  # drop the id that was only needed to fetch comments
            self.result_list.append(data)
            break
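# Page arithmetic used above: the loop keeps fetching while
# cmspagenum <= cmspage_totalnum / 10 + 1, which can request one page more than
# strictly needed when the total is an exact multiple of 10. Ceiling division
# avoids that -- a hypothetical helper:
def page_count(total_comments, page_size=10):
    # 25 comments -> 3 pages, 30 comments -> 3 pages
    return (int(total_comments) + page_size - 1) / page_size  # Py2 integer division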