def get_content_inside(data):
    url_debug = data['url']
    response1 = get_response_and_text(url_debug)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except Exception as e:
        return
    img_list = []  # the try block below may raise, so initialise these outside it
    result_read_count = 0
    video_urls = []
    try:
        content_raw = datasoup.select('#artContent')
        content = content_raw[0].text.strip()
        img_list2 = Re_find_img_url.findall(str(content_raw[0]))
        for img_url_raw in img_list2:
            if img_url_raw not in [
                'http://image21.360doc.com/DownloadImg/2010/12/2413/7923021_1.gif'
            ]:
                if 'swf' not in img_url_raw:
                    img_list.append(img_url_raw)
                else:
                    video_urls.append(img_url_raw)
    except Exception as e:
        # print e
        content = ''
    try:
        url_debug2 = ('http://webservice.360doc.com/GetArtInfo20130912NewV.ashx?UID=-100,'
                      + data['publish_user_id'] + ',GetBookTwo,' + data['id']
                      + ',0,0@cg@0&jsoncallback=jsonp')
        response2 = get_response_and_text(url_debug2)
        response_in_function_text2 = response2['response_in_function_text']
        result_read_count = response_in_function_text2.split(
            u'@c@g@tl@c@g@t')[1].split(u'l@c@g@t')[0]
    except Exception as e:
        # print e
        pass
    data['content'] = content
    data['read_count'] = int(result_read_count)
    data['img_urls'] = img_list
    data['video_urls'] = video_urls
    self.comments_data_list.put(data)

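
# Every function in this file calls a shared get_response_and_text helper that
# is not shown. Below is a minimal hypothetical reconstruction: only the shape
# of the returned dict ('response_in_function'/'response_in_function_text') is
# taken from the call sites above; the requests-based body, the timeout, and
# the ignored needproxy/charset handling are assumptions, not the repo's code.
import requests

def get_response_and_text(url, headers=None, charset=None, needproxy=False):
    try:
        # proxy selection is not visible in the original, so it is omitted here
        response = requests.get(url, headers=headers, timeout=30)
        if charset:
            response.encoding = charset
        return {
            'response_in_function': response,
            'response_in_function_text': response.text
        }
    except Exception:
        # callers check 'response_in_function' for truthiness before parsing
        return {
            'response_in_function': None,
            'response_in_function_text': ''
        }
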
def get_index2(index_queue):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    index_page_1_url = 'http://www.ibeitun.net/xinxi/s0_a0_m0_p1.html'
    response1 = get_response_and_text(url=index_page_1_url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    if not response_in_function:
        return
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    url_list_div = datasoup.select('div.indexMessBox')
    Re_find_all_url = re.compile(r'\<a href\=\"(\/\d{4,5}.html)"')
    url_list = Re_find_all_url.findall(str(url_list_div))
    for i in url_list:
        url = 'http://www.ibeitun.net' + str(i)
        index_queue.put({
            'url': url,
            'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'id': url.split('/')[-1].split('.')[0]
        })

def get_index(url, content_queue):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'connection': 'close',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except Exception as e:
        return
    for i in datasoup.select('body > div.content > div.shishiimportantnews > div.left > ul > li'):
        publish_time = i.select('span')[0].text
        url = i.select('a')[0].get('href')
        title = i.select('a')[0].get('title')
        datadict = {
            'publish_time': publish_time,
            'url': url,
            'title': title,
            'id': url.split('id=')[-1],  # the id is the part after 'id=', not before it
        }
        content_queue.put(datadict)

def get_content(data, result_queue):
    try:
        url = data['url']
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        }
        if 'nssbt' not in url:
            return
        response1 = get_response_and_text(url=url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function:
            return
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        # title = datasoup.select('body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.title')[0].text
        try:
            publish_user = datasoup.select(
                'body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.bingtuanxinxi'
            )[0].text
        except Exception as e:
            print url
        try:
            # keep only the author name that follows the '作者:' label
            publish_user = publish_user.split(u'作者:')[1].split(' ')[0]
        except Exception as e:
            print e
        try:
            print publish_user
        except:
            pass
        source = datasoup.select(
            'body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.bingtuanxinxi > a'
        )[0].text
        content = ''
        for content_p in datasoup.select(
                'body > div.content > div.shishiimportantnews > div.left > div.bingtuan > div.networkinformation > p'
        ):
            content += content_p.text
        data['publish_user'] = publish_user
        data['source'] = source
        data['content'] = content
        result_queue.put(data)
        # print data
        # print 'send to result_queue one!!!!!!!'
    except Exception as e:
        # traceback.extract_stack()
        print e

def get_index(contentqueue):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    urls = ['http://bts.gov.cn/xw/gjgn/']  # national and international news
    url2 = ['http://bts.gov.cn/xw/zdxw/']  # division news
    url3 = ['http://www.bts.gov.cn/zcms/']  # department updates
    url4 = ['http://bts.gov.cn/xw/zsjg/']  # directly affiliated units
    url5 = ['http://bts.gov.cn/xw/gjgn/']
    url6 = ['http://bts.gov.cn/xw/qt/']  # other
    url7 = ['http://www.bts.gov.cn/gk/tzgg/']  # notices and announcements
    url9 = ['http://www.bts.gov.cn/gk/rsxx/']  # personnel information
    url10 = ['http://www.bts.gov.cn/gk/ywgz/']  # operational work
    url11 = ['http://www.bts.gov.cn/gk/wjzc/']  # documents and policies
    url12 = ['http://www.bts.gov.cn/gk/zcjd1/']  # policy interpretation
    url13 = ['http://www.bts.gov.cn/gk/tjxx/']  # statistics
    urls_all = urls + url2 + url3 + url4 + url5 + url6 + url7 + url9 + url10 + url11 + url12 + url13
    for one_url in urls_all:
        response1 = get_response_and_text(url=one_url, headers=headers, charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            continue
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        try:
            for one_li in datasoup.select(
                    'body > div.mainBg > div.listMain.pageWidth.clearself > div.ListRightContent.clearself > div.docuContent.listul > ul > li'):
                # print one_li.text
                url_raw = one_li.select('a')[0].get('href')
                title = one_li.select('a')[0].text.strip()
                url_end = urljoin(basic_url, url_raw)
                id = url_end.split('/')[-1].split('.')[0]
                if 'bts.gov.cn' in url_end:
                    print url_end
                    print title
                    # print one_li.select('a')[1].text  # publish_time, e.g. 2017-04-04
                    index_dict = {
                        'title': title,
                        'url': url_end,
                        'id': id,
                        'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'parent_id': id,
                    }
                    contentqueue.put(index_dict)
        except Exception as e:
            print e
            print one_url

def get_index(queue):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    urls1 = ['http://www.altxw.com/news/system/count//0012002/000000000000/000/001/c0012002000000000000_0000012{}.shtml'.format(str(i))
             for i in range(55, 58)]
    urls2 = ['http://www.altxw.com/news/node_2031.htm']
    urls3 = urls1 + urls2
    urls3.append('http://www.altxw.com/gblw/index.shtml')
    for url in urls3:
        print url
        response1 = get_response_and_text(url=url, headers=headers, charset='utf-8')
        if not response1['response_in_function']:
            continue
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        for one_url_div in datasoup.select('div.bd > ul li'):
            url = one_url_div.select('a')[0].get('href')
            if 'com' not in url:
                url = 'http://www.altxw.com/news/' + url
            if 'altxw.com/news/' not in url:
                continue
            title = one_url_div.select('a')[0].text.strip()
            # publish_time = '20' + one_url_div.select('span')[0].text.strip() + ':00'
            publish_time = one_url_div.select('span')[0].text.strip()
            # pad partial timestamps out to the full '%Y-%m-%d %H:%M:%S' form
            if len(publish_time.split('-')[0]) < 4:
                publish_time = '20' + publish_time
            if len(publish_time) < 11:
                publish_time += ' 00:00:00'
            elif 4 < len(publish_time.split(u' ')[1]) < 8:
                publish_time = publish_time + ':00'
            if len(publish_time) < 18:
                print publish_time
            spider_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            id = url.split('/')[-1].split('.')[0]
            index_data_dict = {
                'url': url,
                'title': title,
                'publish_time': publish_time,
                'spider_time': spider_time,
                'id': id,
                'parent_id': id,
                'publish_user': ''
            }
            queue.put(index_data_dict)

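
# The timestamp padding above recurs in several of these index parsers. Below
# is a minimal standalone sketch of the same logic; the helper name
# normalize_publish_time is illustrative and not part of the original code.
def normalize_publish_time(raw):
    """Pad partial stamps like '17-11-02' or '2017-11-02 09:30'
    out to the full '%Y-%m-%d %H:%M:%S' form."""
    ts = raw.strip()
    if len(ts.split('-')[0]) < 4:        # two-digit year -> assume 20xx
        ts = '20' + ts
    if len(ts) < 11:                     # date only, no time part
        ts += ' 00:00:00'
    elif 4 < len(ts.split(' ')[1]) < 8:  # 'HH:MM' -> add seconds
        ts += ':00'
    return ts

# e.g. normalize_publish_time('17-11-02')         -> '2017-11-02 00:00:00'
#      normalize_publish_time('2017-11-02 09:30') -> '2017-11-02 09:30:00'
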
def get_content(data, result_queue):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    url = data['url']
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    if not response_in_function:
        return
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    # print str(datasoup)
    try:
        title = datasoup.select('div.main div.infobox h2.bt1')[0].text  # title
        publish_time = datasoup.select('div.main div.infobox div.infoDate')[
            0].text.split('\n')[0].split(u':')[1].strip()  # publish_time
        read_count = datasoup.select('div.main div.infobox div.infoDate')[
            0].text.split('\n')[1].split(u':')[1].strip()  # read_count
        content = datasoup.select(
            '#infobox > div.infoLeft > div.infoContent div.textwrap')[0].text
        img_urls = []
        for picurl in datasoup.select(
                '#infobox > div.infoLeft > div.infoContent div.picwrap a'):
            img_urls.append('http://www.ibeitun.net' + picurl.get('href'))
        # news_content = {
        #     'title': title,
        #     'publish_time': publish_time,
        #     'read_count': read_count,
        #     'content': content,
        #     'img_urls': img_urls
        # }
        data['content'] = content
        data['title'] = title
        data['read_count'] = read_count
        data['img_urls'] = img_urls
        data['publish_time'] = str(publish_time) + ' 00:00:00'
        result_queue.put(data)
    except Exception as e:
        # print 'the error pages url is ------>', url
        # traceback.print_exc()
        return

def get_content(data, comment_queue):
    Re_find_img = re.compile(r'src\=\"(.*?)\"')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://www.altxw.com/news/content/'
    url = data['url']
    response1 = get_response_and_text(url=url, headers=headers, charset='utf-8')
    response_in_function = response1['response_in_function']
    if not response_in_function:
        return
    try:
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        # title = datasoup.select('body > div.body > div > div.main.l > div > div > h1')[0].text()
        source = datasoup.select(
            'body > div.body > div > div.main.l > div > div > div > li:nth-of-type(2)'
        )[0].text.split(u':')[1]
        content = ''
        for i in datasoup.select(
                'body > div.body > div > div.main.l > div > div > ul > p'):
            content += i.text
        content_div = datasoup.select('div > div.main.l > div > div > ul')[0]
        img_urls = Re_find_img.findall(str(content_div))
        img_urls2 = []
        for one_img_url in img_urls:
            one_img_url = urljoin(basic_url, one_img_url.strip('../'))
            img_urls2.append(one_img_url)
        print img_urls2
        data['source'] = source
        data['content'] = content
        data['img_urls'] = img_urls2
        comment_queue.put(data)
    except Exception as e:
        print e

def get_content_inside_movie(data):
    url_for_debug = data['url']
    response1 = get_response_and_text(url=url_for_debug)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    if not response_in_function:
        return
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    Re_find_content = re.compile(r'desc: \'(.*)\'')
    content_data = Re_find_content.findall(response_in_function_text)
    # 8-30
    like_count = datasoup.select('#news_praise')
    if like_count:
        like_count_value = int(like_count[0].text.strip())
    else:
        like_count_value = 0
    vedio = datasoup.select('video > source')
    if vedio:
        vedio_urls = []
        for vedio1 in vedio:
            vedio_urls.append(vedio1.get('src'))
    else:
        vedio_urls = []
    # 8-30
    try:
        content = content_data[0]
    except Exception as e:
        print e  # this occasionally fails because no desc content was found
        content = ''
    try:
        source = datasoup.select('#v3cont_id > div.news_content > div > br')[0].text.split(u'来源:')[1]
    except:
        source = ''
    # publish_time = datasoup.select('#v3cont_id > div.news_content > div:nth-of-type(3)')[0][0:16]
    data['content'] = content
    data['like_count'] = like_count_value
    data['video_urls'] = vedio_urls
    data['source'] = source
    self.comments_url_list.append(data)

def get_content(data, result_queue):
    Re_find_img = re.compile(r'img .*? src="(.*?)"')  # used below, so it must stay defined
    Re_find_movie = re.compile(r'\<video.*?src="(.*?)"')
    url = data['url']
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'connection': 'close',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    response1 = get_response_and_text(url=url, headers=headers)
    response_in_function = response1['response_in_function']
    if not response_in_function:
        return
    response_in_function_text = response1['response_in_function_text']
    try:
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    except:
        return
    content_str = str(datasoup.select('div.content_main')[0])
    img_urls = Re_find_img.findall(content_str)
    video_url = Re_find_movie.findall(content_str)
    img_urls2 = []
    video_url2 = []
    for one_img_url in img_urls:
        img_urls2.append('http://www.xjbtssbtszhdj.com/' + one_img_url)
    for one_video_url in video_url:
        video_url2.append('http://www.xjbtssbtszhdj.com' + one_video_url)
    # print img_urls2
    # print video_url2
    data['img_urls'] = img_urls2
    data['video_urls'] = video_url2
    result_queue.put(data)

def get_index(content_queue):
    # url = data['url']
    outside_url = [
        'http://186t.ibeitun.net/news.aspx?s=0&p={}'.format(str(i))
        for i in range(1, 5)
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    for url in outside_url:
        response1 = get_response_and_text(url=url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        # response1 = requests.get(url=url, headers=headers)
        # response_in_function_text = response1.text
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        for one_title in datasoup.select(
                'div.main.fixed > div.right.wow.fadeInUp > ul > li'):
            url = 'http://186t.ibeitun.net' + one_title.select('a')[0].get('href')
            publish_time = one_title.select('a span')[0].text.strip() + ' 00:00:00'
            title = one_title.select('a div')[0].text.strip()
            one_dict = {
                'url': url,
                'publish_time': publish_time,
                'title': title,
                'id': url.split('mid=')[1],
                'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            content_queue.put(one_dict)

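
# The index producers and content consumers in this file communicate through
# queues. A minimal hypothetical wiring sketch with the standard library:
# the worker count is an arbitrary choice, and get_index/get_content stand in
# for any matching producer/consumer pair defined above.
import threading
from Queue import Queue

def run_spider():
    content_queue = Queue()  # index entries waiting to be fetched
    result_queue = Queue()   # finished article dicts

    producer = threading.Thread(target=get_index, args=(content_queue,))
    producer.start()

    def content_worker():
        while True:
            data = content_queue.get()
            get_content(data, result_queue)
            content_queue.task_done()

    for _ in range(4):  # worker count picked for the sketch only
        w = threading.Thread(target=content_worker)
        w.setDaemon(True)
        w.start()

    producer.join()
    content_queue.join()  # wait until every queued item has been processed
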
def get_content_inside(data):
    url = data['url']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    needproxy = int(random.randint(1, 10) / 7)
    response1 = get_response_and_text(url=url, headers=headers, needproxy=needproxy)
    response_in_function = response1['response_in_function']
    response_in_function_text = response1['response_in_function_text']
    if not response_in_function:
        return
    real_url = response_in_function.url
    if 'toutiao' not in real_url:
        # logger_toutiao.log(level=logging.WARNING, msg='toutiao was not in thisurl---------' + real_url)
        return
    elif 'http://www.toutiao.com/api/pc/subject/' in real_url:
        # logger_toutiao.log(level=logging.WARNING, msg='http://www.toutiao.com/api/pc/subject/ was in thisurl----------' + real_url)
        return
    else:
        url = real_url
    Re_find_chineseTag = re.compile(r"chineseTag: '.*?'")
    #######################################################
    chineseTag = Re_find_chineseTag.findall(response_in_function_text)
    if chineseTag:
        try:
            # print 'the length of response-------', len(response_in_function_text)
            chineseTag = chineseTag[0].split("'")[1]
            if chineseTag == u'图片' or chineseTag == '图片':
                content_time_img = get_content_picture({
                    'response_in_function': response_in_function,
                    'response_in_function_text': response_in_function_text
                })
            elif chineseTag == u'问答' or chineseTag == '问答':
                content_time_img = get_content_wenda(htmldata={
                    'response_in_function': response_in_function,
                    'response_in_function_text': response_in_function_text,
                    'data': data
                }, data=data)
                return
            else:
                # print chineseTag, 'is going to get_content_news'
                content_time_img = get_content_news({
                    'response_in_function': response_in_function,
                    'response_in_function_text': response_in_function_text
                })
        except Exception as e:
            # print e, 'error while deciding whether this is a picture or Q&A page'
            pass
    else:
        # print chineseTag
        return
    # anything that is not a Q&A page falls through to here
    Re_find_itmeId = re.compile(r'itemId: \'.*?\'')   # regular articles
    Re_find_itme_Id = re.compile(r'item_id:\'.*?\'')  # picture galleries
    if Re_find_itmeId.findall(response_in_function_text):
        try:
            item_id = Re_find_itmeId.findall(
                response_in_function_text)[0].split("'")[1]
        except Exception as e:
            # logger_toutiao.log(level=logging.WARNING, msg={'where': 'splitting the itemId failed', 'contetn':
            #                    Re_find_itmeId.findall(response_in_function_text)[0]})
            # print e, 'the itemId matched the regex but splitting it failed'
            pass
    else:
        try:
            item_id = Re_find_itme_Id.findall(
                response_in_function_text)[0].split("'")[1]
        except Exception as e:
            pass
            # print e, 'no value found for item_id (the picture-gallery item_id)'
            # msg = {'errormsg': e.message + ' no value found for item_id (the picture-gallery item_id)',
            #        'htmldata': response_in_function_text,
            #        'url': response_in_function.url,
            #        'code': response_in_function.code,
            #        'msg': response_in_function.msg}
            # logger_toutiao.log(level=logging.WARNING, msg=msg)
            return
    try:
        data['img_urls'] = content_time_img['img_urls']
        data['content'] = content_time_img['content']
        if len(content_time_img['publish_time']) < 12:
            data['publish_time'] = content_time_img['publish_time'] + ' 00:00:00'
        else:
            data['publish_time'] = content_time_img['publish_time']
        data['item_id'] = item_id
        data['reply_nodes'] = []
    except Exception as e:
        # print e, 'problem while assembling data'
        pass
    self.comments_url_list.append(data)

def get_content_inside(data):
    url_for_debug = data['url']
    is_first = 1
    reply_nodes = []
    error_times = 5
    while True:
        response1 = get_response_and_text(url=url_for_debug, headers=self.headers, charset='utf-8')
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function:
            # 10-9: responses often come back empty when the network is flaky
            return
        # response_in_function_text = response_in_function_text.decode('utf-8').encode('utf-8')
        img_may_no_user = [
            'http://image.gfan.com/static/image/common/rright.gif',
            'http://image.gfan.com/static/image/common/none.gif',
            'http://image.gfan.com/static/image/common/rleft.gif'
        ]
        result_text = handle_content(response_in_function_text)
        Re_find_img_url = re.compile(r'file="(.*?)"')
        datasoup = BeautifulSoup(result_text, 'lxml')
        if is_first == 1:
            try:
                main_div = datasoup.select('table[id]')[0]
                main_content = datasoup.select('.plc .pct .pcb')[0].text.strip()
                main_content_div = datasoup.select('.plc .pct .pcb')[0]
                main_read_count = datasoup.select(
                    ' tr td.pls.ptm.pbm > div > span:nth-of-type(2)')[0].text
                main_reply_count = datasoup.select(
                    'tr > td.pls.ptm.pbm > div > span:nth-of-type(5)')[0].text
                main_img_urls = Re_find_img_url.findall(str(main_content_div))
                main_img_urls_list2 = []
                for img_url_raw in main_img_urls:
                    img_url_dealed1 = img_url_raw.replace('.thumb.jpg', '')
                    main_img_urls_list2.append(img_url_dealed1)
            except Exception as e:
                # print e
                return
            try:
                main_publish_user_photo = main_div.select(
                    'div.avatar a img')[0].get('src')  # the account may have been deleted
            except Exception as e:
                main_publish_user_photo = ''
            main_publish_user = main_div.select('.pls .pi .authi a')[0].text
            data['read_count'] = main_read_count
            data['img_urls'] = list(set(main_img_urls_list2))
            data['reply_count'] = main_reply_count
            data['content'] = main_content
            data['publish_user_photo'] = main_publish_user_photo
            data['publish_user'] = main_publish_user
        # 9-19
        try:
            datasoup.select('.plc .pct .pcb')[0].text.strip()
        except Exception as e:
            # print e
            # network problems can leave a next page only partially fetched
            error_times -= 1
            if error_times > 1:
                continue
            else:
                return
        for one_div in datasoup.select('#postlist > div[id]')[is_first:-1]:
            img_list = []
            # print one_div.select('div.authi a')[0].text.strip()  # publish_user
            # print one_div.text.strip()
            try:
                maybe_url_list = Re_find_img_url.findall(
                    str(one_div.select('.plc .pct .pcb')[0]))
                for url_img_one in maybe_url_list:
                    if url_img_one not in img_may_no_user and 'http://bit.ly/' not in url_img_one:
                        img_list.append(url_img_one)
                img_urls = img_list  # img_list
                content = one_div.select('.plc .pct .pcb')[0].text.strip()  # content; the inner markup differs between pages
                publish_time = one_div.select('.authi em')[0].text.replace(
                    u'发表于 ', '').replace('\n', '').strip() + ':00'  # publish_time
                try:
                    publish_user_photo = one_div.select(
                        'div.avatar a img')[0].get('src')  # publish_user_photo; deleted accounts have no avatar
                except Exception as e:
                    # print e
                    publish_user_photo = ''
                # if data['publish_user_photo'] == 'None':  # conflicts with the logic above, but works
                #     data['publish_user_photo'] = publish_user_photo
                # else:
                #     data['publish_user_photo'] = ''
                url = one_div.select('.plc .pi strong a')[0].get('href').replace(';', '&')  # url
                id = one_div.select('.plc .pi strong a')[0].get('id')  # id
                publish_user_id = one_div.select('.plc .pi strong a')[0].get(
                    'id').replace('postnum', '')  # publish_user_id
                publish_user = one_div.select('.pls .pi .authi a')[0].text
                this_comment_info = {
                    'img_urls': img_urls,
                    'content': content,
                    'publish_time': time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.strptime(publish_time, '%Y-%m-%d %H:%M:%S')),
                    'publish_user_photo': publish_user_photo,
                    'url': url,
                    'id': id,
                    'publish_user_id': publish_user_id,
                    'parent_id': data['id'],
                    'ancestor_id': data['id'],
                    'publish_user': publish_user
                }
                reply_nodes.append(this_comment_info)
                # print one_div.select('.t_fsz')[0].text
            except Exception as e:
                # print e, 'this floor was deleted'
                pass
        next_page_url_raw = datasoup.select('.nxt')
        if next_page_url_raw:
            is_first = 0
            next_url = next_page_url_raw[0]
            url_for_debug = next_url.get('href')
            # print 'is going to deal next page-------------', url_for_debug
        else:
            data['reply_nodes'] = reply_nodes
            self.result_data_list.append(data)
            break

def get_index_inside(url_get_index):
    next_page_num = 0
    next_page_num_error = 0  # network errors can break the next-page check, so this counts retries
    while True:
        response1 = get_response_and_text(url=url_get_index, headers=self.headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function:
            continue
        try:
            datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        except Exception as e:
            print e
        for one_forum in datasoup.select('#moderate tbody')[1:]:
            try:
                title = one_forum.select('th > a')[0].text  # title
                publish_user = one_forum.select('td.by a')[0].text  # publish_user
                # print one_forum.select('td.by a')[0].get('href')  # publish_user_url
                publish_time = one_forum.select('td.by em span')[0].text.strip() + ':00'  # publish_time
                reply_count = one_forum.select('td.num a')[0].text  # reply_count
                read_count = one_forum.select('td.num em')[0].text  # view_num
                url = one_forum.select('th > a')[0].get('href')  # url
                time_secends = time.strptime(publish_time, '%Y-%m-%d %H:%M:%S')
                this_reply_node = {
                    'title': title,
                    'publish_user': publish_user,
                    'publish_time': time.strftime('%Y-%m-%d %H:%M:%S', time_secends),
                    'reply_count': reply_count,
                    'read_count': read_count,
                    'url': url,
                    # 'publish_user': None,
                    'id': url.split('-')[1],
                    'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'publish_user_photo': 'None',
                    'content': 'None'
                }
                self.content_data_list.append(this_reply_node)
                next_page_num_error += 3  # the list parsed, so the page loaded fine; raise the budget so the last page is not re-fetched too often
            except Exception as e:
                pass
                # print e
        next_page_url = datasoup.select('.nxt')
        if next_page_url:
            next_page_num += 1
            next_url = next_page_url[0].get('href')
            url_get_index = next_url
            next_page_num_error = 0  # this reset is essential
        else:
            # pages can appear to end early when a flaky fetch hides the
            # next-page link, so retry a few times instead of stopping at once
            if next_page_num_error > 5:
                break
            else:
                # print response_in_function.status_code
                next_page_num_error += 1

def get_Index(self):
    while True:
        for url_to_get_index in self.urls:
            for i in range(10):
                try:
                    needproxy = int(random.randint(1, 10) / 7)
                    response1 = get_response_and_text(url=url_to_get_index, needproxy=needproxy)
                    response_in_function = response1['response_in_function']
                    response_in_function_text = response1['response_in_function_text']
                    response_text = response_in_function_text.decode('utf-8')
                    datajson = json.loads(response_text)
                    datajson_index_data = datajson['data']
                    for one_index in datajson_index_data:
                        try:
                            title = one_index['title']
                        except:
                            title = ''
                        try:
                            reply_count = int(one_index['comments_count'])
                        except:
                            reply_count = 0
                        url = 'http://www.toutiao.com' + one_index['source_url']
                        try:
                            publish_user = one_index['source']  # publisher
                        except:
                            publish_user = ''
                        try:
                            publish_user_photo = one_index['media_avatar_url']
                            if 'http' not in publish_user_photo:
                                publish_user_photo = 'http:' + publish_user_photo
                        except:
                            publish_user_photo = ''
                        try:
                            vedio_id = one_index['video_id']
                        except:
                            vedio_id = None
                        try:
                            is_ad = one_index['label']
                        except:
                            is_ad = False
                        if vedio_id:
                            continue  # drop videos outright
                        if is_ad == u'广告':
                            continue
                        id = one_index['group_id']
                        dict1 = {
                            'id': id,
                            'url': url,
                            'reply_count': reply_count,
                            'title': title,
                            'publish_user': publish_user,
                            'publish_user_photo': publish_user_photo,
                            'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        }
                        self.content_data_list.append(dict1)
                except Exception as e:
                    pass
        print 'taking a break: entering the 600-second wait'
        time.sleep(600)

def get_comment_comment(data1):
    # comments can themselves have comments; the argument is named data1 so it does not shadow data
    id = data1['id']
    error_time = 3
    while True:
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
            # 'Upgrade-Insecure-Requests': '1',
            # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # 'Cache-Control': 'max-age=0',
            'Connection': 'close'
        }
        while True:
            try:
                comment_url = ('http://www.toutiao.com/api/comment/get_reply/?comment_id='
                               + str(id) + '&item_id=' + str(id) + '&offset=0&count=20')
                response1 = get_response_and_text(url=comment_url, headers=headers)
                response_in_function = response1['response_in_function']
                response_in_function_text = response1['response_in_function_text']
                datajson = json.loads(response_in_function_text)
                break
            except Exception as e:
                # print e, 'mark2'
                error_time -= 1
                if error_time < 1:
                    return
        reply_nodes = []
        # datajson = json.loads(response_in_function.text)
        try:
            datajson = json.loads(response_in_function_text)
            # raised "ValueError: No JSON object could be decoded" very often on 8-28
        except Exception as e:
            # print e
            pass
        try:
            datajson['data']['data']
            # sometimes this fails: the response returned is not what you need! 9-7
        except Exception as e:
            # print e
            error_time -= 1
            if error_time < 1:
                # print 'wrong time too much'
                break
            continue
        for one_comment in datajson['data']['data']:
            content = one_comment['text']
            like_count = one_comment['digg_count']
            publish_time = one_comment['create_time']
            publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(int(publish_time)))
            publish_user_id = one_comment['user']['user_id']
            publish_user = one_comment['user']['screen_name']
            publish_user_photo = one_comment['user']['avatar_url']
            id = one_comment['id']
            try:
                ancestor_id = data1['ancestor_id']
            except Exception as e:
                # print e, 'mark3'
                ancestor_id = 'wrong'
            parent_id = data1['id']
            thisnode = {
                'publish_user': publish_user,
                'content': content,
                'like_count': like_count,
                'publish_time': publish_time,
                'publish_user_id': publish_user_id,
                'publish_user_photo': publish_user_photo,
                'id': id,
                'ancestor_id': ancestor_id,
                'parent_id': parent_id,
                # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            reply_nodes.append(thisnode)
        return reply_nodes

def get_comment_inside(data):
    # session1 = requests.session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    while True:  # keep requesting until it succeeds
        try:
            # print data
            comment_url = ('http://www.toutiao.com/api/comment/list/?group_id='
                           + str(data['id']) + '&item_id=' + str(data['item_id'])
                           + '&offset=0&count=20')
            needproxy = int(random.randint(1, 10) / 7)
            response1 = get_response_and_text(url=comment_url, needproxy=needproxy)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
            break
        except Exception as e:
            # print e, 'mark1'
            if 'item_id' in str(e):
                messege = {'msg': e.message}
                # logger_toutiao.log(msg=messege, level=logging.WARNING)
    comments_data = []
    try:
        data_json = json.loads(response_in_function_text)
        data_json['data']['comments']
    except Exception as e:
        # print e, 'mark1'
        # this should be plain JSON, but sometimes an inexplicable location-redirect
        # page comes back instead, so bail out: better to miss it than to mis-scrape it
        return
    for one_comment in data_json['data']['comments']:
        content = one_comment['text']
        like_count = one_comment['digg_count']
        publish_time = one_comment['create_time']
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        publish_user_photo = one_comment['user']['avatar_url']
        publish_user_id = one_comment['user']['user_id']
        publish_user = one_comment['user']['name']  # changed 8-17
        id = one_comment['id']
        reply_count = one_comment['reply_count']
        parent_id = data['id']
        ancestor_id = data['id']
        if reply_count > 0:
            reply_nodes = get_comment_comment({
                'id': id,
                'ancestor_id': data['id']
            })
        else:
            reply_nodes = []
        thisnode = {
            'content': content,
            'like_count': like_count,
            'publish_time': publish_time,
            'publish_user_photo': publish_user_photo,
            'publish_user_id': publish_user_id,
            'publish_user': publish_user,
            'id': id,
            'reply_count': reply_count,
            'reply_nodes': reply_nodes,
            'parent_id': parent_id,
            'ancestor_id': ancestor_id,
            # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        # data['reply_nodes'].append(thisnode)
        comments_data.append(thisnode)
    # the API only returns up to 20 comments here, so no next page is fetched;
    # the same goes for replies to comments
    data['reply_nodes'] = comments_data
    while len(self.result_list) > 600:
        time.sleep(1)
        print 'result_list has grown past 600 entries; waiting for the writer to drain it...'
    self.result_list.append(data)

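
# Together, get_comment_inside and get_comment_comment build a two-level
# comment tree. A hypothetical example of one entry in data['reply_nodes'];
# every value below is invented purely to illustrate the shape.
example_comment_node = {
    'content': u'...',
    'like_count': 3,
    'publish_time': '2017-09-07 12:34:56',
    'publish_user': u'...',
    'publish_user_id': 1234567890,
    'publish_user_photo': 'http://p3.pstatp.com/...',
    'id': 9876543210,
    'reply_count': 1,
    'parent_id': '6461234567890123456',   # the article's group_id
    'ancestor_id': '6461234567890123456',
    'reply_nodes': [                      # filled by get_comment_comment
        {
            'content': u'...',
            'like_count': 0,
            'publish_time': '2017-09-07 13:00:00',
            'publish_user': u'...',
            'publish_user_id': 111222333,
            'publish_user_photo': 'http://p3.pstatp.com/...',
            'id': 5555555555,
            'parent_id': 9876543210,      # the comment being replied to
            'ancestor_id': '6461234567890123456',
        }
    ],
}
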
def get_content_in_wenda_comments_more(id_replynodes, data=None):
    error_time = 5
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    try:
        if not id_replynodes['next_comment_url']:
            url_comments_more = ('https://www.wukong.com/wenda/web/question/loadmorev1/?count=10&qid='
                                 + id_replynodes['id'] + '&offset=10&req_type=1')
            response1 = get_response_and_text(url=url_comments_more, headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
        else:
            response1 = get_response_and_text(
                url=id_replynodes['next_comment_url'], headers=headers)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
    except Exception as e:
        print e
    datajson = json.loads(response_in_function_text)
    for one_comment in datajson['data']['ans_list']:
        datasoup_content = BeautifulSoup(one_comment['content'], 'lxml')
        content = datasoup_content.text
        img_urls = []
        Re_find_img = re.compile(r'src=".*?"')
        img_urls_find_by_re = Re_find_img.findall(one_comment['content'])
        for img_url in img_urls_find_by_re:
            img_url_split = img_url.split('"')[1]
            img_urls.append(img_url_split)
        like_count = one_comment['digg_count']
        id = one_comment['ansid']
        publish_time = one_comment['create_time']  # a Unix timestamp
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        reply_count = one_comment['comment_count']
        publish_user_photo = one_comment['user']['avatar_url']
        publish_user = one_comment['user']['uname']
        publish_user_id = one_comment['user']['user_id']
        try:
            reply_nodes = get_content_in_wenda_comments_comments({
                'id': id,
                'reply_nodes': [],
                'next_comment_url': None
            })
        except Exception as e:
            # print e
            reply_nodes = []
        parent_id = id_replynodes['id']
        ancestor_id = data['id']
        try:
            this_node = {
                'publish_time': publish_time,
                'content': content,
                'like_count': like_count,
                'id': id,
                'reply_count': reply_count,
                'publish_user_photo': publish_user_photo,
                'publish_user': publish_user,
                'publish_user_id': publish_user_id,
                'reply_nodes': reply_nodes,
                'ancestor_id': ancestor_id,
                'parent_id': parent_id,
                # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            id_replynodes['reply_nodes'].append(this_node)
        except Exception as e:
            print e
    if datajson['data']['has_more']:
        url_offset = response_in_function.url.split('&offset=')
        offset = int(url_offset[1].split('&')[0]) + 10
        url = url_offset[0] + '&offset=' + str(offset)
        id_replynodes['next_comment_url'] = url
        reply_nodes2 = get_content_in_wenda_comments_more(id_replynodes)
        return reply_nodes2
    else:
        return id_replynodes['reply_nodes']

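
# Both wukong pagers above recurse once per page, which risks hitting Python's
# recursion limit on very long threads. Below is a minimal iterative sketch of
# the same offset-paging pattern; page_all, fetch_page, and the 'items' key
# are placeholders of ours, not part of the original code (the wukong
# responses do expose the 'has_more' flag used here).
def page_all(base_url, fetch_page, page_size=10):
    offset = 0
    collected = []
    while True:
        # fetch_page stands in for get_response_and_text + json.loads
        page = fetch_page(base_url + '&offset=' + str(offset))
        collected.extend(page['items'])  # placeholder key for the page's entries
        if not page['has_more']:
            return collected
        offset += page_size
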
def get_comment_inside(data):  # this approach may be problematic
    data['source'] = data['source'].strip()
    isFirst_req = True
    start_id = 0
    comments_list = []
    while True:
        if isFirst_req == True:
            comment_req = 'http://www.thepaper.cn/load_moreFloorComment.jsp?contid=' + data['id']
        else:
            comment_req = ('http://www.thepaper.cn/load_moreFloorComment.jsp?contid='
                           + data['id'] + '&startId=' + start_id)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
        }
        response1 = get_response_and_text(url=comment_req, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function:
            return
        Re_find_startid = re.compile(r'startId="(.*?)"')
        data_re = Re_find_startid.findall(response_in_function_text)
        start_id = 0  # added 8-24
        try:
            start_id = data_re[0]
        except Exception as e:
            # print e
            pass
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        for one_div in datasoup.select('div.comment_que'):
            # the mobile and desktop pages render differently and need separate handling
            try:
                publish_user_photo = one_div.select('div.aqwleft > div > a > img')[0].get('src')
            except Exception as e:
                # print e
                pass
            publish_user_photo = 'http://www.thepaper.cn/img/headerimg_bg50.png'  # may be missing; use the default for now and revisit if it causes trouble (mark1)
            publish_user = one_div.select('div.aqwright > h3 > a')[0].text
            try:
                id_1 = str(one_div.select('div.aqwright > h3 > a')[0]).split('commentId=')
                # id = one_div.select('div.aqwright > h3 > a')[0].split('commentId=')[1]
                id = id_1[1].split('"')[0]
            except Exception as e:
                print e
            publish_user_id = str(one_div.select('div.aqwright > h3 > a')[0]).split('userId=')[1].split('&')[0]
            publish_time = one_div.select('div.aqwright > h3 > span')[0].text
            content = ''
            for content_in_for in one_div.select('div > a[href^="javascript:replyFloor"]'):  # problematic
                if u'回复' not in content_in_for:
                    content += content_in_for.text
            # content = content.replace(u'回复', u'')
            try:
                like_count = int(one_div.select(
                    'div.aqwright > div.ansright_time > a[href^="javascript:priseCommtFloor"]')[0].text)
            except:
                like_count = 0
            if u'小时前' in publish_time:
                publish_time_num = int(publish_time.replace(u'小时前', ''))
                publish_time = (datetime.now() - timedelta(hours=publish_time_num)).strftime('%Y-%m-%d %H:%M:%S')
            elif u'天前' in publish_time:
                publish_time_num = int(publish_time.replace(u'天前', ''))
                publish_time = (datetime.now() - timedelta(days=publish_time_num)).strftime('%Y-%m-%d %H:%M:%S')
            elif u'分钟前' in publish_time:
                publish_time_num = int(publish_time.replace(u'分钟前', ''))
                publish_time = (datetime.now() - timedelta(minutes=publish_time_num)).strftime('%Y-%m-%d %H:%M:%S')
            else:
                publish_time = publish_time
            # parent_id handling added 8-25
            Re_find_publish_user = re.compile(ur'回复@(.*)\:')
            has_at_re = Re_find_publish_user.match(content)
            has_at = ''
            if has_at_re:
                has_at = has_at_re.group(1)
            thisdata = {
                'publish_user_photo': publish_user_photo,
                'publish_user': publish_user,
                'id': id,
                'publish_user_id': publish_user_id,
                'publish_time': publish_time,
                'content': content,
                'like_count': like_count,
                'ancestor_id': data['id'],
                'parent_id': data['id'],  # provisional: later, scan content for @-mentions and derive the real parent
                # 'reply_nodes': [],
                # 'has_at': has_at
            }
            comments_list.append(thisdata)
        if int(start_id) == 0:
            # parent_id handling added 8-25
            # comments_list2 = comments_list[:]
            # for comment_one_data in comments_list2:
            #     if comment_one_data['has_at']:
            #         def merge_comment(comment_one_data):
            #             # when there is an @, use the name after it to find the parent_id
            #             for num in range(len(comments_list2)):
            #                 if comments_list2[num]['publish_user'] == comment_one_data['publish_user']:
            #                     _ = copy.deepcopy(comment_one_data)
            #                     del (comment_one_data['has_at'])
            #                     comments_list2[num]['reply_nodes'].append(comment_one_data)
            #                     comments_list2.remove(_)  # the structure is still incomplete this way
            #                 if comments_list2[num]['has_at']:  # order matters here: Python lists are shared references, not new copies
            #                     new_child_comment = copy.deepcopy(comments_list2[num])
            data['reply_nodes'] = comments_list
            data['reply_count'] = len(comments_list)
            try:
                data['publish_time'] = data['publish_time'].replace(u' ', u'').encode('utf-8')
                data['publish_time'] = data['publish_time'].split(' ')[0] + ' ' + data['publish_time'].split(' ')[1]
            except Exception as e:
                # print e
                pass
            self.result_list.append(data)
            break

def get_Index(self):
    # while True:
    thisurls_list = []
    for url in self.urls:
        response1 = get_response_and_text(url=url, needproxy=False)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function_text:
            return
        Re_pattern = re.compile(r'data : \"(.*?)\".*?Math\.random\b')
        datare = Re_pattern.findall(response_in_function_text)
        try:
            url_in_content = datare[0]
        except Exception as e:
            continue
        if 'http://m.thepaper.cn/channel_26916' in url:
            # the mobile side turned out to yield more data; the desktop endpoint is http://m.thepaper.cn/load_channel.jsp?
            nexturl = 'http://www.thepaper.cn/load_index.jsp?' + url_in_content
        else:
            nexturl = 'http://m.thepaper.cn/load_channel.jsp?' + url_in_content
        thisurls_list.append(nexturl)
    for url_to_visit in thisurls_list:
        for i in range(10):
            self.index_url_list.append(url_to_visit + str(i))

    def get_index_inside_movie(url):
        response2 = get_response_and_text(url=url)
        response_in_function = response2['response_in_function']
        response_in_function_text = response2['response_in_function_text']
        if len(response_in_function_text) < 10:
            return
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        for one_url in datasoup.select('body > div'):
            thisurl = one_url.select('h2 > a')[0].get('href')
            publish_user = one_url.select('a')[2].text
            title = one_url.select('a')[1].text
            try:
                publish_time = one_url.select('a > span')[0].text
            except:
                publish_time = '00:00:00'  # these pages are irregularly formatted
            try:
                publish_time_date = one_url.select('span')[1].text
                if u'天前' in publish_time_date:
                    publish_time_date = publish_time_date.replace(u'天前', '')
                    date_now = datetime.now()
                    date_now2 = date_now - timedelta(days=int(publish_time_date))
                    publish_time_date = date_now2
                    publish_time_date = str(publish_time_date.strftime('%Y-%m-%d %H:%M'))
                elif u'小时前' in publish_time_date:
                    publish_time_date = publish_time_date.replace(u'小时前', '')
                    date_now = datetime.now()
                    date_now2 = date_now - timedelta(hours=int(publish_time_date))
                    publish_time_date = date_now2
                    publish_time_date = str(publish_time_date.strftime('%Y-%m-%d %H:%M:%S'))
                elif u'分钟前' in publish_time_date:
                    publish_time_date = publish_time_date.replace(u'分钟前', '')
                    date_now = datetime.now()
                    date_now2 = date_now - timedelta(minutes=int(publish_time_date))
                    publish_time_date = date_now2
                    publish_time_date = str(publish_time_date.strftime('%Y-%m-%d %H:%M:%S'))
            except Exception as e:
                # print e
                pass
            try:
                publish_time_date = one_url.select('span')[0].text
            except Exception as e:
                # print e, 'publish_time_date not found either time, in the video index handler'
                pass
            try:
                if len(one_url.select('span')[0].text) == 10:
                    publish_time_date = one_url.select('span')[0].text
                else:
                    continue
            except:
                continue
            publish_time = publish_time_date + ' ' + publish_time + ':00'
            id = one_url.select('h2 > a')[0].get('id')
            try:
                reply_count = one_url.select('span.trbszan')[0].text
                if 'k' in reply_count:
                    reply_count = float(reply_count) * 1000
            except:
                reply_count = 0
            video_urls = []
            try:
                video_urls1 = datasoup.select('video source')
                for i in video_urls1:
                    video_urls.append(i.get('src'))
            except Exception as e:
                # print e
                pass
            data_index = {
                'url': 'http://m.thepaper.cn/' + thisurl,
                'publish_user': publish_user,
                'title': title,
                'publish_time': publish_time,
                'id': id,
                'reply_count': reply_count,
                'is_movie': True,
                'spider_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'video_urls': video_urls
            }
            self.content_data_list.append(data_index)

    def get_index_inside_wenben(url):
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
        headers = {
            'User-Agent': user_agent
        }
        response1 = get_response_and_text(url=url, headers=headers)
        response_in_function = response1['response_in_function']
        response_in_function_text = response1['response_in_function_text']
        if not response_in_function:
            return
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        for div_content in datasoup.select('body > div'):
            try:
                try:
                    reply_count = div_content.select('span.reply')[0].text
                except Exception as e:
                    # print e
                    # some news items show no comment info in the index response
                    reply_count = 0
                url = 'http://m.thepaper.cn/' + div_content.select('div > a')[0].get('href')  # url
                publish_time = div_content.select('p > span')[0].text  # publish_time; does this need further processing?
                title = div_content.select('div > p > a')[1].text  # title
                publish_user = div_content.select('div > p > a')[0].text  # publish_user
                # print div_content
                if u'分钟' in publish_time:
                    minulate = publish_time.replace(u'分钟前', '')
                    time_b = datetime.now() - timedelta(minutes=int(minulate))
                    print time_b
                    time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                    publish_time = time_c
                elif u'小时前' in publish_time:
                    hourse = publish_time.replace(u'小时前', '')
                    time_b = datetime.now() - timedelta(hours=int(hourse))
                    time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                    publish_time = time_c
                elif u'天前' in publish_time:
                    days = publish_time.replace(u'天前', '')
                    time_b = datetime.now() - timedelta(days=int(days))
                    time_c = time_b.strftime('%Y-%m-%d %H:%M:%S')
                    publish_time = time_c
                print '\n\n\n'
            except Exception as e:
                # print e
                pass
            id = url.split('_')[-1]
            this_dict = {
                'id': id,
                'url': url,
                'publish_time': publish_time,
                'title': title,
                'publish_user': publish_user,
                'is_movie': False,
                'reply_count': reply_count,
                'spider_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            self.content_data_list.append(this_dict)

    threadlist = []
    # self.index_url_list = self.index_url_list.reverse()
    the_index_url_list_here = self.index_url_list  # each time we get here, start over from self.index_url_list
    while the_index_url_list_here:
        # stop once the index tasks are done and content_url_list is empty
        while the_index_url_list_here or threadlist:
            for threadi in threadlist:
                if not threadi.is_alive():
                    threadlist.remove(threadi)
            while len(threadlist) < CONTENT_THREADING_NUM and the_index_url_list_here:
                data_in_while = the_index_url_list_here.pop()
                if 'http://www.thepaper.cn/load_index.jsp?' in data_in_while:
                    thread_in_while = threading.Thread(target=get_index_inside_movie, args=(data_in_while,))
                else:
                    thread_in_while = threading.Thread(target=get_index_inside_wenben, args=(data_in_while,))
                thread_in_while.start()
                threadlist.append(thread_in_while)
            for childthread in threadlist:
                childthread.join(600)
        # print 'waiting those 600 seconds inside the index step'
        # time.sleep(1800)
        time.sleep(5 * 60)
        self.global_status_num_content = 0
        while True:
            self.global_status_num_content = 0
            time.sleep(5)
            if self.global_status_num_content == 0:
                break

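
# The relative-timestamp branches (u'N分钟前' = N minutes ago, u'N小时前' =
# N hours ago, u'N天前' = N days ago) are inlined several times across these
# spiders. A minimal consolidated sketch; the helper name relative_to_absolute
# is ours, not the repo's.
from datetime import datetime, timedelta

def relative_to_absolute(publish_time):
    """Turn strings like u'5分钟前', u'3小时前' or u'2天前' into the
    '%Y-%m-%d %H:%M:%S' form; pass anything else through unchanged."""
    for suffix, unit in ((u'分钟前', 'minutes'), (u'小时前', 'hours'), (u'天前', 'days')):
        if suffix in publish_time:
            amount = int(publish_time.replace(suffix, ''))
            absolute = datetime.now() - timedelta(**{unit: amount})
            return absolute.strftime('%Y-%m-%d %H:%M:%S')
    return publish_time
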
def get_content(data, result_queue):
    url = data['url']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://bts.gov.cn/'
    Re_sub_javascript2 = re.compile(r'<script[\S|\s]*?>[\s|\S]*?<\/script\>')
    Re_find_time = re.compile(r'(\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2}\:\d{2})')
    Re_find_img = re.compile(r'src\=\"(.*?)\"')
    Re_find_cource = re.compile(ur'来源:(.*?) ')
    try:
        response1 = get_response_and_text(url=url, headers=headers, charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            return
        response_in_function_text = response1['response_in_function_text']
        response_in_function_text_dealed = Re_sub_javascript2.sub('', response_in_function_text)
        datasoup = BeautifulSoup(response_in_function_text_dealed, 'lxml')
        title = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h1')[0].text
        content = ''
        for i in datasoup.select(
                'body > div > div.detailMain.pageWidth > div.pargraph div.detailPar p'):
            content += i.text
        # print content
        source = Re_find_cource.findall(response_in_function_text_dealed)
        if source:
            source = source[0]
        else:
            source = ''
        content_str = datasoup.select(
            'body > div.mainBg > div.detailMain.pageWidth > div.pargraph > div.detailPar')[0]
        content_str2 = str(content_str)
        img_urls = Re_find_img.findall(content_str2)
        img_urls2 = []
        for one_img_url in img_urls:
            img_url_dealed = urljoin(basic_url, one_img_url)
            img_urls2.append(img_url_dealed)
        publish_div = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h6')[0].text
        publish_time = Re_find_time.findall(publish_div)[0]
        data['content'] = content
        data['publish_time'] = publish_time
        data['img_urls'] = img_urls2
        data['source'] = source
        data['publish_user'] = ''
        result_queue.put(data)
    except Exception as e:
        print e

def get_content_in_wenda_comments_comments(id_replynodes, data=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    try:
        if not id_replynodes['next_comment_url']:
            url_comments_more = ('https://www.wukong.com/wenda/web/comment/brow/?ansid='
                                 + id_replynodes['id'] + '&count=10&offset=0')
            needproxy = int(random.randint(1, 10) / 7)
            response1 = get_response_and_text(url=url_comments_more, needproxy=needproxy)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
        else:
            needproxy = int(random.randint(1, 10) / 7)
            response1 = get_response_and_text(
                url=id_replynodes['next_comment_url'], headers=headers, needproxy=needproxy)
            response_in_function = response1['response_in_function']
            response_in_function_text = response1['response_in_function_text']
        # break
    except Exception as e:
        print e
    datajson_comment2 = json.loads(response_in_function_text)
    try:
        datajson_comment2['comments']
    except Exception as e:
        print e
    for comment2 in datajson_comment2['comments']:
        id = comment2['comment_id']
        like_count = comment2['digg_count']
        content = comment2['content']
        publish_user_id = comment2['user_info']['user_id']
        publish_user = comment2['user_info']['uname']
        publish_user_photo = comment2['user_info']['avatar_url']
        publish_time = comment2['create_time']
        publish_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(int(publish_time)))
        try:
            ancestor_id = id_replynodes['ancestor_id']
        except Exception as e:
            print e
        parent_id = id_replynodes['id']
        thisnode = {
            'id': id,
            'like_count': like_count,
            'content': content,
            'publish_user_id': publish_user_id,
            'publish_user': publish_user,
            'publish_user_photo': publish_user_photo,
            'publish_time': publish_time,  # publication time
            'parent_id': parent_id,
            'ancestor_id': ancestor_id,
            # 'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        id_replynodes['reply_nodes'].append(thisnode)
    if datajson_comment2['has_more']:
        url_offset = response_in_function.url.split('&offset=')
        offset = int(url_offset[1].split('&')[0]) + 10
        url = url_offset[0] + '&offset=' + str(offset)
        id_replynodes['next_comment_url'] = url
        reply_nodes2 = get_content_in_wenda_comments_comments(id_replynodes)
        return reply_nodes2
    else:
        return id_replynodes['reply_nodes']

def get_content_inside_no_movie(data):
    url_for_debug = data['url']
    vedio_list = []
    respons1 = get_response_and_text(url=url_for_debug)
    response_in_function = respons1['response_in_function']
    response_in_function_text = respons1['response_in_function_text']
    if not response_in_function:
        return
    Re_find_img = re.compile(r'src=".*?"')
    datasoup = BeautifulSoup(response_in_function_text, 'lxml')
    content = ''
    img_urls = []
    for content_in_soup in datasoup.select('#v3cont_id > div.news_content > div.news_part'):
        content += content_in_soup.text
    for content_in_soup in datasoup.select(
            '#v3cont_id > div.news_content > div.news_part_father > div.news_part.news_part_limit > div'):
        content += content_in_soup.text
    try:
        title = datasoup.select('#v3cont_id > div.news_content > h1')[0].text
    except:
        # print response_in_function.url
        # title = ''
        return  # this occasionally redirects to some unrelated page
    try:
        publish_user = datasoup.select('#v3cont_id > div.news_content > p.about_news')[0].text
    except Exception as e:
        # print e
        pass
    try:
        source = datasoup.select('#v3cont_id > div.news_content > p.about_news')[1].text.split(u'来源:')[1]
    except:
        source = ''
    try:
        publish_time = datasoup.select('.news_content .about_news')[1].text.split(u'\xa0')[0] + ':00'
        data['publish_time'] = publish_time
    except Exception as e:
        # print e
        pass
    for i in datasoup.select('source'):
        url_vedio = i.get('src')
        vedio_list.append(url_vedio)
    # publish_time = datasoup.select('#v3cont_id > div.news_content > p.about_news')[1].text.strip()  # still contains one stray space
    try:
        datasoup_content = datasoup.select('#v3cont_id > div.news_content')[0]
    except Exception as e:
        # print e
        pass
    img_urls_original = Re_find_img.findall(str(datasoup_content))
    img_urls_selected_by_doup = datasoup_content.select('img')
    for url in img_urls_selected_by_doup:
        print url.get('src')
    for url in img_urls_original:
        url_split = url.split('"')[1]
        img_urls.append(url_split)
    # if len(publish_time) > 17:
    #     publish_time = publish_time.split('\n')[0]
    # 8-30
    like_count = datasoup.select('#news_praise')
    if like_count:
        like_count_value = int(like_count[0].text.strip())
    else:
        like_count_value = 0
    # 8-30
    data['like_count'] = like_count_value
    data['publish_user'] = publish_user
    data['img_urls'] = img_urls
    data['content'] = content
    # data['publish_user'] = publish_user
    # data['publish_time'] = publish_time
    data['title'] = title
    data['source'] = source
    data['video_urls'] = vedio_list
    self.comments_url_list.append(data)
