def parse_article(self, response):
    logger.info("{} Url {}".format(get_function_name(), response.url))
    scripts = response.xpath('//script/text()').extract()
    for script in scripts:
        script = script.lstrip()
        # 1. Find the script that holds the page data and extract its content field
        if script.startswith('var BASE_DATA'):
            script = script.replace("\n", "")
            script = re.sub(self.match_space, "", script)
            start_tag = "content:'"
            start_idx = script.find(start_tag)
            end_idx = script.find("',", start_idx)
            if start_idx != -1 and end_idx != -1:
                script = script[start_idx + len(start_tag):end_idx]
                # 2. Unescape the content and extract the paragraph text
                data = html.unescape(script)
                soup = BeautifulSoup(data, 'lxml')
                texts = "\r\n".join([p.text for p in soup.select('p')])
                if len(texts.strip()) == 0:
                    continue
                # 3. Record the text
                item = ToutiaoItem()
                item['url'] = response.url
                item['field'] = time.strftime(
                    "%Y%m%d") + os.sep + response.meta['field']
                item['title'] = time.strftime("%Y%m%d_%H%M%S_") + str(
                    random.randint(0, 1000))
                item['content'] = texts
                print('dump item: ', item['url'])
                yield item
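# A minimal sketch of the ToutiaoItem definition this parse_article callback assumes
# (only the url/field/title/content fields used above); the real items.py may declare more.
import scrapy


class ToutiaoItem(scrapy.Item):
    url = scrapy.Field()      # source article URL
    field = scrapy.Field()    # date-based sub-directory, e.g. "<YYYYMMDD>/<channel>"
    title = scrapy.Field()    # generated name: timestamp plus a random suffix
    content = scrapy.Field()  # plain text extracted from the article body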
def parse(self, response):
    list_selector = response.xpath("//div[@class='wcommonFeed']/ul/li")
    for li in list_selector:
        try:
            item = ToutiaoItem()
            # Title
            title = li.xpath(".//a[@class='link title']/text()").extract()
            # Strip full-width spaces
            title = title[0].strip("　")
            # Source
            source = li.xpath(
                ".//a[@class='lbtn source']/text()").extract()
            # Strip the separator dot and full-width space
            source = source[0].strip("⋅").strip("　")
            # Comment count
            comment = li.xpath(".//a[@class='lbtn comment']/text()")
            # Strip the trailing "评论" text and collapse whitespace (including &#160;)
            comment = comment.re("(.*?)评论")[0]
            comment = "".join(comment.split())
            item["title"] = title      # title
            item["source"] = source    # source
            item["comment"] = comment  # comment count
            yield item
        except Exception:
            continue
def second(self, response):
    item = ToutiaoItem()
    # print(response.url)
    # Article title
    title_re = re.compile('<title>(.*?)</title>', re.S)
    article_title = re.findall(title_re, response.text)
    if article_title:
        article_title = ''.join(article_title)
    # Article content
    content_re = re.compile(",\n content: \'(.*?)',\n groupId", re.S)
    article_content = re.findall(content_re, response.text)
    if article_content:
        article_content = article_content[0]
        # Decode the HTML entities embedded in the content field
        article_content = article_content.replace('&lt;', '<').replace(
            '&gt;', '>').replace('&#x3D;', '=').replace('&quot;', '"')
    # doc = pq(article_content)
    # article_content = doc('p').text()
    # https://www.cnblogs.com/lei0213/p/7676254.html
    # print(article_content)
    # with open('detail.txt', 'a', encoding='utf-8') as f:
    #     f.write(article_content + '\n')
    item['article_url'] = response.url
    item['article_title'] = article_title
    item['article_content'] = article_content
    yield item
def parse(self, response):
    # Parse the JSON response into Python objects
    data = json.loads(response.text)['data']
    for c in data:
        item = ToutiaoItem()
        each = c['comment']
        # Commenter's user name
        item['name'] = each['user_name']
        # Comment text
        item['content'] = each['text']
        # Number of likes
        item['digg_count'] = each['digg_count']
        # Number of replies
        item['reply_count'] = each['reply_count']
        yield item
    # Advance the offset (20 comments per page) and request the next page
    self.page += 20
    yield scrapy.Request(self.url + str(self.page), callback=self.parse)
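# A minimal sketch of the spider state the pagination above relies on. The comment-API
# endpoint, the 20-per-page offset and the spider name are assumptions, not the original code;
# parse() above is assumed to be a method of this class.
import scrapy


class ToutiaoCommentSpider(scrapy.Spider):
    name = 'toutiao_comment'
    # Hypothetical comment API prefix; the real spider builds its own self.url
    url = 'https://www.toutiao.com/article/v2/tab_comments/?offset='
    page = 0

    def start_requests(self):
        # First page of comments; parse() then keeps adding 20 to self.page
        yield scrapy.Request(self.url + str(self.page), callback=self.parse)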
def parse(self, response):
    # self.log(response.text)
    title = response.css('.article-title::text').extract_first()
    date = response.css('.article-sub span+ span').extract_first()
    content = ''.join(response.css('.article-content p::text').extract())
    abstract = content[:140]
    if title is not None and date is not None and abstract is not None:
        title = title.strip()
        date = date.strip()
        date = self.reg_filter.sub('', date)
        abstract = abstract.strip()
        if title != '' and date != '' and abstract != '':
            item = ToutiaoItem()
            item["title"] = title
            item["abstract"] = abstract
            item["url"] = response.url
            item["date"] = date
            item["content"] = content
            item["mediaName"] = self.mediaName
            item["keyword"] = response.meta["keyword"]
            yield item
def parse(self, response):
    result = json.loads(response.text)
    print('start')
    headers = self.get_headers()
    if result.get('data'):
        for content in result.get('data'):
            item = ToutiaoItem()
            # Pull out the Abstract field of this feed entry
            re_compile = re.compile('"Abstract":"(.*?)","abstract"')
            match = re.findall(re_compile, str(content))
            # Only keep entries whose abstract mentions "壁纸" (wallpaper)
            if match and match[0].find('壁纸') != -1:
                re_compile = re.compile(
                    '.*?"url_list":.{"url":"(http://sf\d-ttcdn-tos.pstatp.com/img/pgc-image/.*?~400x400_c5.webp)"},{"url":"'
                )
                findall = re.findall(re_compile, str(content))
                # 1. Download the images one at a time
                # if len(findall) <= 4:
                #     for url in findall:
                #         item['name'] = match[0]
                #         item['img_url'] = url
                #         yield item
                # 2. Download one article (image group) at a time
                if len(findall) <= 4:
                    item['name'] = match[0]
                    item['img_url'] = findall
                    yield item
        # Request the next page of the feed
        offset = result.get('offset')
        yield Request(url=self.url.format(offset=offset),
                      callback=self.parse,
                      headers=headers,
                      dont_filter=True)
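# A minimal sketch of an ImagesPipeline that could consume the img_url list yielded
# above; the pipeline class name is an assumption, and IMAGES_STORE must be set in
# settings.py for it to write files.
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class WallpaperImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # item['img_url'] holds up to four image URLs for one article
        for url in item['img_url']:
            yield scrapy.Request(url)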
def parse(self, response):
    # Parse the JSON response
    data = json.loads(response.text)['data']
    for i in data:
        # Create one Item instance per comment
        item = ToutiaoItem()
        comment = i['comment']
        item['Num'] = self.Num
        item['text'] = comment['text']
        item['name'] = comment['user_name']
        item['like'] = comment['digg_count']
        item['reply'] = comment['reply_count']
        # Print the fields as a quick sanity check
        print(u'\n')
        print(u'No.:', item['Num'])
        print(u'Comment:', item['text'])
        print(u'Name:', item['name'])
        print(u'Likes:', item['like'])
        print(u'Replies:', item['reply'])
        # Num is kept on the spider so progress is easy to track
        self.Num += 1
        # Yield (not return) so every comment in the response is emitted
        yield item
def parse(self, response):
    # Is the response here the HtmlResponse returned by the downloader middleware?
    # Testing shows that the HtmlResponse the middleware returns is indeed passed to this
    # parse callback. Earlier failures were caused by a wrong XPath in the middleware:
    # it only ran print('error message') and returned None, so the request was not
    # intercepted and response was still the original page source. That page is loaded
    # via AJAX, which is why the elements could never be found.
    selector_list = response.xpath('//div[@class="wcommonFeed"]/ul/li')
    # print(selector_list, '*' * 100)
    # print(len(response.text), '*' * 100)
    for li in selector_list:
        try:
            item = ToutiaoItem()
            item["title"] = li.xpath(
                './/a[@class="link title"]/text()').extract()[0].strip()
            # The separator dot is hard to strip; try copying the exact character from the page source
            item["source"] = li.xpath(
                './/a[@class="lbtn source"]/text()').extract()[0].strip(
                    '·').strip()
            comment = li.xpath(
                './/a[@class="lbtn comment"]/text()').extract()[0].strip()
            item["comment"] = re.findall('(\d+?)评论', comment)[0]
            yield item
        except Exception:
            # On error, skip this item and continue with the next one
            continue
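# A minimal sketch of the kind of downloader middleware the comments above refer to:
# it renders the AJAX page with Selenium and returns an HtmlResponse, so parse() receives
# the rendered HTML. The class name and the fixed wait are assumptions, not the original code.
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(3)  # crude wait for the AJAX feed to load
        # Returning an HtmlResponse short-circuits the download and feeds parse() directly
        return HtmlResponse(url=request.url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)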
def parse(self, response):
    list_selectors = response.xpath("//div[@class='wcommonFeed']/ul/li")
    for li in list_selectors:
        try:
            item = ToutiaoItem()
            title = li.xpath(".//a[@class='link title']/text()").extract()
            title = title[0].strip("　")
            print("title:", title)
            source = li.xpath(".//a[@class='lbtn source']/text()").extract()
            source = source[0].strip("⋅").strip("　")
            print("source:", source)
            comment = li.xpath(".//a[@class='lbtn comment']/text()")
            comment = comment.re("(.*?)评论")[0]
            comment = "".join(comment.split())
            print("comment:", comment)
            item['title'] = title
            item['source'] = source
            item['comment'] = comment
            yield item
        except Exception:
            continue
def parse(self, response):
    js = json.loads(response.body.decode('utf-8'))
    data = js['data']
    # cookie = response.headers.getlist('Set-Cookie')[0].split(';')[0]
    # print(cookie)
    for d in data:
        image_list = d.get('image_list')
        if image_list:
            for item in image_list:
                toutiao_item = ToutiaoItem()
                url = item.get('url')
                # Rewrite the thumbnail URL into the full-size image URL
                new_image_url = url.replace('list', 'large').replace(
                    '/190x124', '')
                pic_id = hashlib.md5(
                    new_image_url.encode("utf8")).hexdigest()
                # Skip images that are already stored in MongoDB
                result = collection.find_one({"pic_id": pic_id})
                if result:
                    print("continue")
                    continue
                else:
                    toutiao_item["pic_id"] = pic_id
                    toutiao_item["pic_link"] = new_image_url
                    toutiao_item["pic_desc"] = self.getPicDesc()
                    yield toutiao_item
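# A minimal sketch of the MongoDB handle that collection.find_one above assumes;
# the host, database name and collection name are placeholders.
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['toutiao']['pictures']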
def parse(self, response): item = ToutiaoItem() list_selector = response.xpath("//div[@class='single-mode-rbox-inner']") for div in list_selector: try: # 标题 title = div.xpath("./div[@class='title-box']/a/text()").extract() # 去除标题空格 title = title[0].strip(" ") # 来源 source = div.xpath("./div[@class='footer-bar']/div[1]/a[2]/text()").extract() # 去除点号与全角空格 source = source[1].strip("·").strip(" ") # 评论数 comment = div.xpath("./div[@class='footer-bar']/div[1]/a[3]/text()").extract() comment = comment[1] item["title"] = title item["source"] = source item["comment"] = comment print("item:" + item) yield item except: continue
def parse1(self, response):
    items = ToutiaoItem()
    # Category
    items['class_id'] = '1'
    # Article ID
    items['article_id'] = str(response.meta['article_id'])
    # Title
    items['title'] = response.xpath(
        '//*[@id="app"]/div[1]/div/div[1]/div/div[1]/h2').xpath(
            'string(.)').extract_first()
    # Author
    items['writer'] = response.xpath(
        '//*[@id="app"]/div[1]/div/div[1]/div/div[2]/a').xpath(
            'string(.)').extract_first()
    # Publish time
    items['publish_time'] = str(response.meta['publish_time'])
    # View count
    items['read_number'] = str(response.meta['read_number'])
    # Summary
    items['summary'] = str(response.meta['summary'])
    # Content
    if response.css('div.js-article-detail p').extract():
        article = response.css('div.js-article-detail p').extract()
        article = ''.join(article)
        # Keywords
        try:
            tags = response.xpath('//div[@class="tags"]/a').xpath(
                'string(.)').extract()
            items['tags'] = ','.join(tags)
        except Exception:
            pass
        time.sleep(1)
        # Image download: create a folder named after the publish date
        file_day = items['publish_time'].split('/')[0]
        try:
            os.mkdir('img/tt_img/' + file_day)
        except OSError:
            pass
        time.sleep(1)
        # Walk every date folder under the img directory
        path = r'E:\安培斯通\amber-spider\\toutiao\img\\tt_img'
        # path = r'/home/amber-spider/toutiao/img/tt_img'
        date_files = os.listdir(path)
        for file in date_files:
            fi = os.path.join(path, file)
            fil = os.path.join(path, file).split('/')[-1]
            if fil == file_day:
                try:
                    os.mkdir(fi + '/' + items['article_id'])
                except OSError:
                    pass
                if response.css(
                        'div.js-article-detail p img::attr(src)').extract():
                    img_url = response.css(
                        'div.js-article-detail p img::attr(src)').extract()
                    time.sleep(1)
                    try:
                        for i in img_url:
                            # Take the original image URL (strip the size suffix)
                            url = i.split('_')[0]
                            request.urlretrieve(
                                url, fi + '/' + items['article_id'] + '/' +
                                url.split("/")[-1] + '.jpg')
                            time.sleep(1)
                    except Exception:
                        pass
                # Build the OSS upload path
                oss_url = ('https://capital-future-imgs.oss-cn-beijing.aliyuncs.com/tt_img/'
                           + fil + '/' + items['article_id'] + '/')
                if 'https://img.jinse.com/' in article and '_image3.png' in article:
                    items['article'] = article.replace(
                        'https://img.jinse.com/', oss_url).replace(
                            '_image3.png',
                            '.jpg?x-oss-process=image/resize,l_500')
                elif 'https://img.jinse.com/' in article and '_watermarknone.png' in article:
                    items['article'] = article.replace(
                        'https://img.jinse.com/', oss_url).replace(
                            '_watermarknone.png',
                            '.jpg?x-oss-process=image/resize,l_500')
                else:
                    items['article'] = article
                # Image address inside the content
                if 'https://capital-future-imgs.oss-cn-beijing.aliyuncs.com/' in items['article']:
                    oss_url1 = items['article'].split('src=')[1].split(
                        '.jpg')[0].replace(
                            '"', '') + '.jpg?x-oss-process=image/resize,l_500'
                    items['oss_url1'] = oss_url1
                else:
                    items['oss_url1'] = (
                        'https://capital-future-imgs.oss-cn-beijing.aliyuncs.com/'
                        'images/new_default.png?x-oss-process=image/resize,l_500')
                yield self._clean_str(items)
    # Stop the crawler based on the article's publish time
    array_time1 = time.strptime(items['publish_time'], "%Y-%m-%d/%H:%M:%S")
    crawl_time = time.mktime(array_time1)
    # Current time minus five days (1 day = 86400 seconds)
    now = datetime.datetime.now()
    sched_timer = str(
        datetime.datetime(now.year, now.month, now.day, now.hour, now.minute,
                          now.second) - datetime.timedelta(seconds=86400 * 5))
    array_time = time.strptime(sched_timer, "%Y-%m-%d %H:%M:%S")
    now_time = time.mktime(array_time)
    if crawl_time < now_time:
        self.crawler.stop()
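# parse1() passes the item through self._clean_str before yielding it. The original helper
# is not shown here; this is a hypothetical sketch that strips surrounding whitespace from
# every string field, which is one plausible reading of the name.
def _clean_str(self, items):
    for key, value in items.items():
        if isinstance(value, str):
            items[key] = value.strip()
    return items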
def sub_nav(self, response):
    page = Selector(response)
    # print(response.text)
    # URLs of all channel (sub-navigation) tabs
    sub_nav_tips1 = page.xpath(
        '//div[@class="channel"]/ul/li/a/@href').extract()
    del sub_nav_tips1[:2], sub_nav_tips1[-1], sub_nav_tips1[1]
    sub_nav_tips2 = page.xpath(
        '//div[@class="channel-more-layer"]/ul/li/a/@href').extract()
    sub_nav_tips = sub_nav_tips1 + sub_nav_tips2
    # print(sub_nav_tips)
    # Names of the channel tabs
    sub_names1 = page.xpath(
        '//div[@class="channel"]/ul/li/a/span/text()').extract()
    del sub_names1[:2], sub_names1[-1], sub_names1[1]
    sub_names2 = page.xpath(
        '//div[@class="channel-more-layer"]/ul/li/a/span/text()').extract()
    sub_names = sub_names1 + sub_names2
    # print(sub_names)
    # Visit every channel
    for i in range(0, len(sub_nav_tips)):
        # Load the channel page in the browser
        self.brower.get('https://www.toutiao.com' + sub_nav_tips[i])
        # Current timestamp in seconds
        now = round(time.time())
        # Compute the _signature value via the page's TAC.sign() JavaScript
        signature = self.brower.execute_script('return TAC.sign(' + str(now) + ')')
        # print(signature)
        # Collect the browser cookies
        cookie = self.brower.get_cookies()
        cookie = [item['name'] + "=" + item['value'] for item in cookie]
        cookiestr = '; '.join(item for item in cookie)
        # print(cookiestr)
        header1 = {
            'Host': 'www.toutiao.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/68.0.3440.106 Safari/537.36',
            # 'Referer': 'https://www.toutiao.com/ch/news_hot/',
            "Cookie": cookiestr
        }
        send_data = {
            'category': sub_nav_tips[i][4:-1],
            'utm_source': 'toutiao',
            'widen': '1',
            'max_behot_time': now,
            '_signature': signature
        }
        # Build the AJAX URL
        url = self.ajax_url_base + urlencode(send_data)
        # print(url)
        html = requests.get(url, headers=header1, verify=False)
        # Parse the returned JSON feed data
        json_datas = json.loads(html.text)['data']
        # print(json_datas)
        for json_data in json_datas:
            item = ToutiaoItem()
            item['title'] = json_data['title']
            # Some fields may be missing
            try:
                item['source_url'] = ('https://www.toutiao.com/a' +
                                      json_data['source_url'][7:])
            except KeyError:
                item['source_url'] = ''
            try:
                item['abstract'] = json_data['abstract']
            except KeyError:
                item['abstract'] = ''
            try:
                item['source'] = json_data['source']
            except KeyError:
                item['source'] = ''
            try:
                item['tag'] = json_data['tag']
            except KeyError:
                item['tag'] = ''
            try:
                item['chinese_tag'] = json_data['chinese_tag']
            except KeyError:
                item['chinese_tag'] = '无标签类别'  # default: "no tag category"
            item['news_class'] = sub_names[i]
            yield item
    self.brower.quit()
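# A minimal sketch of the spider state sub_nav() relies on: a Selenium browser
# (self.brower) used to evaluate TAC.sign(), and the feed API prefix self.ajax_url_base.
# Both attribute names come from the code above; the concrete values are assumptions.
from selenium import webdriver


class ToutiaoNavSpider(object):
    def __init__(self):
        # Headless Chrome so TAC.sign() can run in a real JavaScript environment
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.brower = webdriver.Chrome(options=options)
        # Assumed feed endpoint; sub_nav() appends the urlencoded query string
        self.ajax_url_base = 'https://www.toutiao.com/api/pc/feed/?'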