def parse_by_selector1(self, response):
    """Parse a movie list page using scrapy's Selector directly.

    For every ``div.hd`` block, extract the title and the detail-page link,
    store them on a MoviespiderItem, and follow the link with the item
    passed along in ``meta`` for the detail callback.
    """
    print(response.url)
    movies = Selector(response=response).xpath('//div[@class="hd"]')
    for movie in movies:
        # Extract the plain values once and reuse them (the original called
        # extract_first() on the link twice); removed the unused `items` list.
        title = movie.xpath('./a/span/text()').extract_first()
        link = movie.xpath('./a/@href').extract_first()
        movie_item = MoviespiderItem()
        movie_item['title'] = title
        movie_item['link'] = link
        yield scrapy.Request(url=link,
                             callback=self.parse_detail_by_selector,
                             meta={'item': movie_item})
def parse(self, response):
    """Parse a movie list page with BeautifulSoup.

    Finds every ``div.hd`` block, pulls the movie title and detail-page
    link, and yields a Request to the detail page with the partially
    filled item in ``meta``.
    """
    # Removed the unused `items` list and the dead commented-out append.
    soup = BeautifulSoup(response.text, 'html.parser')
    title_list = soup.find_all('div', attrs={'class': 'hd'})
    for entry in title_list:
        movie_item = MoviespiderItem()
        movie_item['title'] = entry.find('a').find('span').text
        link = entry.find('a').get('href')
        movie_item['link'] = link
        yield scrapy.Request(url=link,
                             meta={'item': movie_item},
                             callback=self.parse_detail)
def parse(self, response):
    """Parse one page of the movie list and follow detail pages + pagination.

    Each ``div.info`` block yields one MoviespiderItem (title, alt title,
    link, credits, rating, quote) plus a Request to its detail page; the
    "next" pagination link, when present, re-enters this callback.
    """
    movie_list_group = response.xpath('//div[@class="info"]')
    for movie_list in movie_list_group:
        item = MoviespiderItem()
        item['movie_title'] = movie_list.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()').extract()
        item['movie_other_title'] = movie_list.xpath(
            'div[@class="hd"]/a/span[@class="other"]/text()').extract()
        # BUG FIX: the original used the absolute path '//div[@class="hd"]/a/@href'
        # inside the loop, which always matched the FIRST movie on the whole
        # page — every item got the same link. Use a path relative to the
        # current info block, and extract the URL once for both uses.
        url = movie_list.xpath('div[@class="hd"]/a/@href')[0].extract()
        item['movie_link'] = url
        item['movie_director_actor'] = movie_list.xpath(
            'div[@class="bd"]/p[@class=""]/text()').extract()
        item['movie_star'] = movie_list.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0].extract()
        item['movie_quote'] = movie_list.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        yield scrapy.Request(url, callback=self.parse_movie_list_detail,
                             dont_filter=True, meta={'item': item})
    next_page = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_page:
        print(next_page)
        yield scrapy.Request(self.main_url + next_page[0], callback=self.parse)
def parse_item(self, response):
    """Extract a single article page into a MoviespiderItem and yield it."""
    article = response.css('div.article_container')
    item = MoviespiderItem()
    item['link'] = response.url
    item['name'] = article.css('h1::text').extract_first()
    item['image'] = article.css('.context img::attr(src)').extract_first()
    item['ctime'] = article.css('.article_info .info_date::text').extract_first()
    item['category'] = article.css(
        '.article_info .info_category a::text').extract_first()
    item['description'] = article.css('div[id=post_content]').extract_first()
    # 'pan' is taken from the LAST .context h2 on the whole page.
    item['pan'] = response.css('.context h2').extract()[-1]
    yield item
def parselist(self, response):
    """Parse the movie list page: one item per <li> in div.listBox.

    Extracts name, type and detail-page href per movie, then follows the
    href with the item in ``meta`` for ``parsedetail``.
    """
    li_list = response.xpath('//div[@class="listBox"]//ul/li')
    for li in li_list:
        # One item per movie entry.
        item = MoviespiderItem()
        name = li.xpath('.//h3/a/text()').extract_first("")
        # Type line looks like "<label>:<value>" (full-width colon);
        # renamed the local from `type`, which shadowed the builtin.
        movie_type = li.xpath(".//p[2]/text()").extract_first("")
        # Detail page URL for this movie.
        href = li.xpath('.//h3/a/@href').extract_first("")
        item["name"] = name
        item["type"] = movie_type.split(":")[1]
        item["href"] = href
        yield scrapy.Request(url=href, callback=self.parsedetail,
                             meta={"item": item})
def parse(self, response):
    """Parse the list page: one item per movie table, then fetch its detail page.

    Fills title/date/info from the list table and issues a Request for the
    movie's detail URL with the item carried in ``meta``.
    """
    # Every movie on the list page sits in its own <table> under co_content8.
    movie_list = response.xpath("//div[@class='co_content8']//table")
    print(movie_list)
    for movie in movie_list:
        item = MoviespiderItem()
        # First-level (list page) fields.
        item["title"] = movie.xpath(".//a/text()").extract_first()
        item["date"] = movie.xpath(".//font/text()").extract_first()
        item["info"] = movie.xpath(".//tr[last()]/td/text()").extract_first()
        # Relative detail link -> absolute URL on the site host.
        detail_url = "http://www.dytt8.net" + movie.xpath(
            ".//a/@href").extract_first()
        # Hand off to the second-level callback; the item travels via meta.
        yield scrapy.Request(url=detail_url, callback=self.parse_next,
                             meta={"movie_item": item})
def parse_1(self, response):
    """Parse a movie detail page into a MoviespiderItem.

    The site puts all metadata lines ("导演:…", "主演:…", …) inside a single
    <p> of the article; the following sibling <p> tags hold the synopsis.
    Whether that metadata <p> is p[1] or p[2] varies per page, so p[1] is
    probed first and p[2] used as the fallback.
    """
    item = MoviespiderItem()
    base = "//div[@class='article-container post clearfix']/article"
    # Probe p[1]; if it has no text the metadata lives in p[2] instead.
    meta_idx = 1 if response.xpath(base + "/p[1]/text()").extract() else 2
    tag_p = response.xpath(base + "/p[%d]/text()" % meta_idx).extract()
    intro_list = response.xpath(
        base + "/p[%d]/following-sibling::p/text()" % meta_idx).extract()
    # Map the Chinese label on each metadata line to the item field.
    # NOTE: 'lenght' is the existing (misspelled) item field name — kept
    # as-is because it is part of the item's external interface.
    field_by_label = {
        '导演': 'director',
        '编剧': 'writer',
        '主演': 'actor',
        '制片国家/地区': 'area',
        '语言': 'language',
        '又名': 'other_name',
        '上映日期': 'release_date',
        '片长': 'lenght',
        '首播': 'debut',
        '集数': 'cd',
        '单集片长': 'min',
        '类型': 'type',
    }
    for line in tag_p:
        # Split only on the FIRST full-width colon: the original split on
        # every colon and kept s[1], silently truncating values that contain
        # another colon. Lines without a colon are skipped instead of
        # raising IndexError.
        parts = line.split(':', 1)
        if len(parts) != 2:
            continue
        field = field_by_label.get(parts[0].strip())
        if field:
            item[field] = parts[1].strip()
    # Film title.
    item['title'] = response.xpath(
        "//div[@class='article-details']/h1/text()").extract()[0]
    # Rating: second text node of post-ratings, 3 chars after the '('
    # (format preserved from the original scraper — site-specific).
    item['grade'] = response.xpath(
        "//div[@class='post-ratings']/text()").extract()[1].split('(')[1].strip()[0:3]
    # The two poster images: first <a> of each article <p>.
    item['pic'] = ','.join(response.xpath(base + "/p/a[1]/@href").extract())
    item['intro'] = ','.join(intro_list)
    yield item