def deal_content_from_news(self,response): # print response.body data_TCPI= gather_all_funtion.get_result_you_need(response) content=data_TCPI[1] title=data_TCPI[0] publish_time=data_TCPI[2] img_urls=data_TCPI[3] print content print title print publish_time print img_urls
def deal_content2(self, response): # print response.url # Save_org_file(plantform='sohu', date_time=response.meta['publish_time'], urlOruid=response.url, # newsidOrtid=response.meta['id'], datatype='news', full_data=response.body) # Save_zip(plantform='sohu', date_time=response.meta['publish_time'], urlOruid=response.url, # newsidOrtid=response.meta['id'], datatype='news') data_TCPI = gather_all_funtion.get_result_you_need(response) content = data_TCPI[1] # publish_time=data_TCPI[2] img_urls = data_TCPI[3] # time_format = '%Y-%m-%d' # spider_time = time.strftime(time_format, time.localtime()) # publish_time=time.strftime(time_format,time.localtime(float(response.meta['publish_time']))) # print response.body data = response.meta['data'] data['content'] = content data['reply_nodes'] = [] data['img_urls'] = img_urls Re_find_comment_id = re.compile(r'cms_id: \'.*?\'') try: comment_id = Re_find_comment_id.findall(response.body) print content print '\n' print data_TCPI[0] comment_id_find_by_re = comment_id[0] comment_id_find_by_re = comment_id_find_by_re.split("\'")[1] #https://apiv2.sohu.com/api/comment/list?page_size=10&topic_id=3500748995&page_no=2 url_to_comments = 'https://apiv2.sohu.com/api/comment/list?page_size=10&topic_id=' + str( comment_id_find_by_re) + '&page_no=2' yield scrapy.Request(url=url_to_comments, headers=response.headers, meta={ 'plant_form': 'None', 'data': data, 'download_timeout': 3, 'isIndex_request': True }) except Exception as e: print e
def deal_content(self, response): #台海网有下一页,此功能代码还没有设计 # print response data = gather_all_funtion.get_result_you_need(response) # for element in gather_all_funtion content = data[1] # imglist=data[1] # publish_user=response.xpath('/html/body/div[3]/span[2]/span/a')#没有抓取到发帖人,后边单独写一个模块 time_format = '%Y-%m-%d' spider_time = time.strftime(time_format, time.localtime()) publish_time = data[2] img_urls = data[3] if len(publish_time.split(':')) == 2: publish_time += ':00' else: print publish_time publish_time = '2211-11-11 11:11:11' data = {} data['url'] = response.url data['title'] = response.meta['data']['title'] data['id'] = response.meta['data']['id'] data['url'] = response.meta['data']['url'] data['spider_time'] = spider_time data['img_urls'] = img_urls data['publish_time'] = publish_time # data['publish_user']=publish_user data['content'] = content print '\n\n' persionalSetting.Save_org_file('taihainet', date_time=data['publish_time'], urlOruid=data['url'], newsidOrtid=data['id'], datatype='news', full_data=response.body) persionalSetting.Save_result(plantform='taihainet', date_time=data['publish_time'], urlOruid=data['url'], newsidOrtid=data['id'], datatype='news', full_data={'data': data})
def deal_content(self, response): if response.status == 404: return #这样设计靠谱吗? if response.request.cookies: cookies = response.request.cookies else: cookies = {} headers = response.request.headers if 'Set-Cookie' in headers.keys(): print response.headers['Set-Cookie'] for headers_key in response.headers.keys(): if 'Set-Cookie' in headers_key: set_cookie = response.headers[headers_key] cookies_name = set_cookie.split(';')[0].split('=') cookies[cookies_name[0]] = cookies_name[1] else: headers[headers_key] = response.headers[headers_key] data_TCPI = gather_all_funtion.get_result_you_need(response) title = data_TCPI[0] content = data_TCPI[1] img_urls = data_TCPI[3] time_format = '%Y-%m-%d' spider_time = time.strftime(time_format, time.localtime()) # try: # publish_time=response.xpath('/html/body/div[4]/div[2]/p/span[4]') # print publish_time # except Exception as e: # print e # print 'time wrong' publish_time = data_TCPI[2] id = str(response.url.split('/')[-1]) data = { 'url': response.url, 'content': content, 'title': title, 'publish_time': publish_time, 'img_urls': img_urls, 'id': id, 'spider_time': spider_time, 'reply_node': [] } # cmt_url='http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx&page_size=30&topic_id=630645353&page_no=1' Re_find_sid = re.compile(r'sid="\d*?"') sid = Re_find_sid.findall(response.body) #为了找到评论 print sid try: sidnum = sid[0].split('"')[1] cmt_url_with_out_num = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx&page_size=30&hot_size=5&topic_source_id=' cmt_url_to_visit = cmt_url_with_out_num + sidnum yield scrapy.Request(url=cmt_url_to_visit, headers=headers, cookies=cookies, meta=data) # Save_result(plantform='chengdu',date_time=publish_time,urlOruid=response.url,newsidOrtid=id,datatype='news',full_data=data) print data #有可能同样的正则表达式同样的网页但是就是没有找到对应的sid except Exception as e: print e
def deal_content(self, response):
    # Parse one page of an m.xilu.com article. On the first page
    # (isNextPage == False) the item skeleton arrives in
    # response.meta['data'] and its content/publish_time are filled in;
    # follow-up pages only append more body text. When there is no further
    # page, follow to the Changyan comment API for this article.
    # NOTE(review): indentation reconstructed from a flattened source line;
    # statement grouping should be confirmed against version history.
    if response.meta['isNextPage'] == False:
        data_TCPI = gather_all_funtion.get_result_you_need(response)
        print data_TCPI
        content = data_TCPI[1]
        data = response.meta
        # data['content']=content
        data['data']['content'] = content
        # publish_time can be a relative phrase: u'刚刚' (just now),
        # u'N小时前' (N hours ago), u'N分钟前' (N minutes ago), or a short
        # 'MM-DD' date; normalise to an epoch float or a full date string.
        publish_time = response.meta['data']['publish_time']
        if publish_time == u'刚刚':
            publish_time = time.time()
        elif u'小时前' in publish_time:
            time_pass = int(publish_time.replace(u'小时前', '')) * 60 * 60
            publish_time = time.time() - time_pass
        elif u'分钟前' in publish_time:
            time_pass = int(publish_time.replace(u'分钟前', '')) * 60
            publish_time = time.time() - time_pass
        elif '-' in publish_time and len(publish_time) == 5:
            # 'MM-DD' -> hard-coded year 2017.
            publish_time = '2017-' + publish_time
        # NOTE(review): write-back placed after the whole chain so every
        # normalised value is stored -- assumed intent; confirm it was not
        # meant to sit inside the 'MM-DD' branch only.
        response.meta['data']['publish_time'] = publish_time
    else:
        # Follow-up page: append its text to the already-collected content.
        data_TCPI = gather_all_funtion.get_result_you_need(response)
        content1 = data_TCPI[1]
        response.meta['data']['content'] += content1
        pass
    #body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5
    next_page_selector = response.css(
        'body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5'
    )
    if next_page_selector:
        # .get('href') returns the first matched element's html, or the
        # literal default 'href' when empty; the guard below checks the
        # element actually carries a usable href attribute value.
        next_page_html = next_page_selector.get('href')
        if next_page_html and 'href' in next_page_html and len(
                next_page_html.split('"')[1]) > 3:
            #<a href="/v/1000010001000802_2.html" style="color: #069700;" class="page-next ml5">下一页</a>
            if response.request.cookies:
                cookies = response.request.cookies
            else:
                cookies = {}
            headers = response.request.headers
            if 'Set-Cookie' in response.headers.keys():
                print response.headers['Set-Cookie']
            # Merge Set-Cookie response headers into the cookie jar.
            # NOTE(review): loop placement reconstructed -- assumed to run
            # unconditionally, matching the sibling spiders in this file.
            for headers_key in response.headers.keys():
                if 'Set-Cookie' in headers_key:
                    set_cookie = response.headers[headers_key]
                    cookies_name = set_cookie.split(';')[0].split('=')
                    cookies[cookies_name[0]] = cookies_name[1]
                else:
                    headers[headers_key] = response.headers[headers_key]
            next_page_url = next_page_html.split('"')[1]
            next_url = 'http://m.xilu.com' + next_page_url
            print next_url
            response.meta['isNextPage'] = True
            yield scrapy.Request(url=next_url,
                                 headers=headers,
                                 meta=response.meta,
                                 cookies=cookies,
                                 priority=2)
        else:
            # If the next-page link is empty, do not follow it; fetch the
            # comments instead (same code as the outer else below).
            cmt_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cysYw3AKM&page_size=30&hot_size=10&topic_source_id='
            this_page_id = response.url.split('/')[-1].split('.')[0]
            cmt_url = cmt_url_without_id + this_page_id
            yield scrapy.Request(url=cmt_url,
                                 headers=response.headers,
                                 meta=response.meta,
                                 priority=2)
    else:
        # No next-page anchor at all: go straight to the comment API.
        cmt_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cysYw3AKM&page_size=30&hot_size=10&topic_source_id='
        this_page_id = response.url.split('/')[-1].split('.')[0]
        cmt_url = cmt_url_without_id + this_page_id
        yield scrapy.Request(url=cmt_url,
                             headers=response.headers,
                             meta=response.meta,
                             priority=2)
def deal_content(self, response):
    # Parse a Toutiao article page. Gallery ("picture") pages embed their
    # data as a ``var gallery = {...}`` JSON blob; regular articles go
    # through the shared extractor. Finally follow to the comment-list API
    # when an ``item_id`` can be extracted from the page body.
    # NOTE(review): indentation reconstructed from a flattened source line;
    # statement grouping should be confirmed against version history.
    if '<div riot-tag="player"></div>' in response.body and '<div riot-tag="abstract"></div>' in response.body and '<div riot-tag="hotvideo"></div>' in response.body:
        return  # video page -- nothing to extract here
    if 'toutiao' not in response.url:
        return
    # TODO(original): add a proper picture-page predicate; pages from other
    # boards without pictures may also arrive as picture modules.
    Re_find_pattern1 = re.compile(r'\bvar gallery =.*?\]\}')
    # Carry request cookies forward and merge any Set-Cookie response
    # headers into them for the follow-up request.
    if response.request.cookies:
        cookies = response.request.cookies
    else:
        cookies = {}
    headers = response.request.headers
    if 'Set-Cookie' in headers.keys():
        print response.headers['Set-Cookie']
    for headers_key in response.headers.keys():
        if 'Set-Cookie' in headers_key:
            set_cookie = response.headers[headers_key]
            cookies_name = set_cookie.split(';')[0].split('=')
            cookies[cookies_name[0]] = cookies_name[1]
        else:
            headers[headers_key] = response.headers[headers_key]
    thismeta = response.meta
    print response.headers
    # Separate xpath/redis handling for the /i and /a page variants
    # (left disabled in the original):
    # Re_find_urlwithnum=re.compile(r'toutiao.com/\d')
    # if 'toutiao.com/i' in response.url:
    #     response.meta['plant_form']='toutiao_i'
    # elif 'toutiao.com/a' in response.url:
    #     response.meta['plant_form']='toutiao_a'
    # elif Re_find_urlwithnum.findall(response.url):
    #     response.meta['plant_form']='toutiao'
    #---------------------------- content-handling module added 7-19
    # Picture-module detection:
    img_urls = []
    content = ''
    pictureinfo = Re_find_pattern1.findall(response.body)
    if pictureinfo:
        # Gallery page: parse the embedded JSON blob directly, which also
        # spares the xpath extractor.
        picture_data = pictureinfo[0]
        picture_data_json_original = picture_data.split('=')[1]
        datajson = json.loads(picture_data_json_original)
        for picture_info in datajson['sub_images']:
            img_url_in_for = picture_info['url']
            img_urls.append(img_url_in_for)
        for content_info in datajson['sub_abstracts']:
            content += content_info
        title = datajson['sub_titles'][0]
        # Re_find_publish_time=re.compile(r'')
        Re_find_time = re.compile(r'publish_time:.*?\,')
        publish_time = Re_find_time.findall(
            response.body)[0].split("'")[1].replace('/', '-')
    else:
        # Regular article: use the shared extractor.
        data_TCPI = get_result_you_need(response)
        content = data_TCPI[1]
        img_urls = data_TCPI[3]
        publish_time = data_TCPI[2]
        # title=response.xpath('//*[@id="article-main"]/h1').extract()
        # content=''
        # for i in response.xpath('//*[@id="article-main"]/div[2]/div/p[2]').extract():
        #     content+=i
        # publish_time=response.xpath('//*[@id="article-main"]/div[1]/span[@class="time"]').extract()
    if response.meta['special_key'] == 'is_picture':
        # NOTE(review): the findall result is discarded in the original --
        # this branch currently has no effect.
        Re_find_content_in_html = re.compile(
            r'\bgallery: .*?siblingList\b')
        Re_find_content_in_html.findall(response.body)
    if len(content) < 10:
        # NOTE(review): compiled but never applied in the original --
        # presumably an unfinished fallback for near-empty content.
        Re_find_content_in_html = re.compile(
            r'\bgallery: .*?siblingList\b')
    # response.meta['plant_form']='toutiao'
    # NOTE(review): 'img_urls' is stored as an empty list even though
    # img_urls was collected above -- looks unintended; confirm.
    data = {
        'id': thismeta['id'],
        'url': thismeta['url'],
        'reply_count': thismeta['reply_count'],
        'title': thismeta['title'],
        'publish_user': thismeta['publish_user'],
        'spider_time': thismeta['spider_time'],
        'publish_user_photo': thismeta['publish_user_photo'],
        'content': content,
        'img_urls': [],
        'video_urls': [],
        'publish_time': publish_time,
        'reply_nodes': [],
    }
    #http://www.toutiao.com/api/comment/list/?group_id=6438917736949612802&item_id=6438920814917059074&offset=5&count=15
    if '.toutiao.com' in response.url:
        print response.body
        xpath_data = response.xpath('//div/article/div[1]/h1')
        print response
        # Extract the ids needed to reach the comment API.
        Re_content_item_id = re.compile(r'item_id: \'.*?\'')
        Re_content_qid = re.compile(r'qid : \".*?\"')
        # print response.body  # plain html document
        item_id_re = Re_content_item_id.findall(response.body)
        print item_id_re
        if not item_id_re:
            qid_re = Re_content_qid.findall(response.body)
            print qid_re[0].split('"')[1]
            # Finds the page's qid; the follow-up request for this case was
            # never implemented in the original.
            # yield scrapy.Request()
        else:
            print item_id_re[0].split("'")[1]
            thisurl = response.url.split('com/a')
            nexturl = 'http://www.toutiao.com/api/comment/list/?group_id=' + thisurl[
                1].replace('/', '') + '&item_id=' + str(
                    item_id_re[0].split("'")[1]) + '&offset=0&count=20'
            yield scrapy.Request(url=nexturl,
                                 cookies=cookies,
                                 headers=headers,
                                 meta={
                                     'data': data,
                                     'plant_form': 'toutiao',
                                     'isIndex_request': False
                                 })