def parst_detail(self, response):
    """Yield a ``MovieproItem`` carrying the title found on a detail page.

    Args:
        response: Object exposing ``.xpath()`` (presumably a Scrapy response).

    Yields:
        A single ``MovieproItem`` whose ``name`` is the first text node of the
        page's ``h1`` heading (whatever ``extract_first()`` returns).
    """
    # NOTE(review): the method name looks like a typo of ``parse_detail``;
    # it is kept as-is because callers may reference it by this name.
    heading_text = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/h1/text()')
    movie = MovieproItem()
    movie['name'] = heading_text.extract_first()
    # A 'describe' field (p[5]/span[3] text, joined into one string) used to
    # be collected here as well; that extraction is currently disabled.
    yield movie
def parse_detail(self, response):
    """Yield a ``MovieproItem`` whose ``name`` is read from the detail page.

    The extracted name is also printed to stdout before the item is yielded.

    Args:
        response: Object exposing ``.xpath()`` (presumably a Scrapy response).

    Yields:
        A single ``MovieproItem`` with its ``name`` field set.
    """
    # NOTE(review): the step `metaname="robots"content="noarchive"` does not
    # look like a valid XPath location step (it resembles a mangled
    # <meta name="robots" content="noarchive"> tag). Confirm the intended
    # path against the target page; the expression is left untouched here.
    name_nodes = response.xpath(
        '/html/body/metaname="robots"content="noarchive"/div[3]/div[1]/div[2]/dl/dt[1]/text()'
    )
    movie = MovieproItem()
    movie['name'] = name_nodes.extract_first()
    print(movie['name'])
    yield movie
def parse_detail(self, response):
    """Yield a ``MovieproItem`` with the movie's name and description.

    Args:
        response: Object exposing ``.xpath()`` (presumably a Scrapy response).

    Yields:
        A single ``MovieproItem`` where ``name`` is the first ``h1`` text node
        and ``desc`` is every text node under ``p[5]/span[2]`` concatenated
        with no separator.
    """
    title_nodes = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/h1/text()')
    # `//text()` collects text from nested elements too, so the description
    # arrives as a list of fragments that are glued back together below.
    desc_fragments = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
    ).extract()

    movie = MovieproItem()
    movie['name'] = title_nodes.extract_first()
    movie['desc'] = ''.join(desc_fragments)
    yield movie
def parse_detail(self, response):
    """Yield a ``MovieproItem`` with the movie's name and type.

    Args:
        response: Object exposing ``.xpath()`` (presumably a Scrapy response).

    Yields:
        A single ``MovieproItem`` where ``name`` is the first ``h1`` text node
        and ``m_type`` is the text of the first link in the first paragraph.
    """
    title_nodes = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/h1/text()')
    type_nodes = response.xpath(
        '/html/body/div[1]/div/div/div/div[2]/p[1]/a[1]/text()'
    )

    movie = MovieproItem()
    movie['name'] = title_nodes.extract_first()
    movie['m_type'] = type_nodes.extract_first()
    yield movie
def parse(self, response):
    """Yield one detail-page request per movie entry on a listing page.

    For every ``<li class="col-md-6 col-sm-4 col-xs-3">`` the link's ``title``
    attribute is stored as the item's ``name`` and the link's ``href`` is
    appended to ``https://www.4567tv.tv`` to form the detail URL.

    Entries without an ``href`` are skipped; previously the string
    concatenation raised ``TypeError`` on ``None`` and aborted the rest of
    the page.

    Args:
        response: The listing-page response.

    Yields:
        ``scrapy.Request`` objects targeting ``self.parse_detail``, each
        carrying its partially filled item in ``meta['item']``.
    """
    for li in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]'):
        href = li.xpath('./div/a/@href').extract_first()
        if href is None:
            # Nothing to follow for this entry.
            continue

        item = MovieproItem()
        item['name'] = li.xpath('./div/a/@title').extract_first()

        # Request passing: the meta dict is handed over to the callback
        # through its response argument.
        yield scrapy.Request(
            url='https://www.4567tv.tv' + href,
            callback=self.parse_detail,
            meta={'item': item},
        )
def parse(self, response):
    """Yield a second-level page request for every movie card on the listing.

    Each card's link (``./div/a/@href``) is appended to
    ``http://www.id97.vip`` and requested with ``self.parseBySecondPage`` as
    the callback. A new, still empty ``MovieproItem`` travels along in
    ``meta['item']``.

    Cards without an ``href`` are skipped; previously the string
    concatenation raised ``TypeError`` on ``None`` and aborted the rest of
    the page.

    Args:
        response: The listing-page response.

    Yields:
        ``scrapy.Request`` objects, one per card that has a link.
    """
    cards = response.xpath(
        '//div[@class="container"]/div[@class="row"]/div[@class="col-md-2 col-sm-6 col-xs-6 movie-item-out"]'
    )
    for div in cards:
        href = div.xpath('./div/a/@href').extract_first()
        if href is None:
            # Nothing to follow for this card.
            continue

        # NOTE(review): the card also exposes a title (./div/a/@title) and a
        # text value under .//em/strong/span, but neither is stored on the
        # item. The item's field names are not visible from here -- TODO
        # confirm them and populate the item before sending it on.
        item = MovieproItem()
        yield scrapy.Request(
            url='http://www.id97.vip' + href,
            callback=self.parseBySecondPage,
            meta={'item': item},
        )
def parse(self, response):
    """Yield one detail-page request per movie card, with name and score set.

    For every ``<div class="col-xs-1-5 movie-item">`` the link text becomes
    the item's ``name`` and the ``<em>`` text becomes its ``score``; a missing
    score is replaced by the string ``'0'``. The link's ``href`` is prefixed
    with ``https:`` to form the detail URL.

    Cards without an ``href`` are skipped; previously the string
    concatenation raised ``TypeError`` on ``None`` and aborted the rest of
    the page.

    Args:
        response: The listing-page response.

    Yields:
        ``scrapy.Request`` objects targeting ``self.getDetailPage``, each
        carrying its partially filled item in ``meta['item']``.
    """
    for div in response.xpath('//div[@class="col-xs-1-5 movie-item"]'):
        href = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
        if href is None:
            # Nothing to follow for this card.
            continue

        item = MovieproItem()
        item['name'] = div.xpath(
            './/div[@class="meta"]/h1/a/text()').extract_first()

        score = div.xpath('.//div[@class="meta"]/h1/em/text()').extract_first()
        # Identity comparison is the correct test for None.
        item['score'] = '0' if score is None else score

        # Request the detail page; meta passes the item on to the callback.
        yield scrapy.Request(
            url='https:' + href,
            callback=self.getDetailPage,
            meta={'item': item},
        )
def parse(self, response):
    """Yield detail-page requests for a listing page, then the next page.

    For every ``<li>`` in the listing the link's ``title`` attribute is stored
    as the item's ``title`` and the link's ``href`` is appended to
    ``https://www.4567kan.com`` to form the detail URL.

    Entries without an ``href`` are skipped; previously the string
    concatenation raised ``TypeError`` on ``None`` and aborted the rest of
    the page, including the pagination step.

    After the entries, while ``self.page_num`` is below 3, the next listing
    URL is built as ``self.url % self.page_num``, the counter is incremented,
    and that page is requested with this same method as callback.

    Args:
        response: The listing-page response.

    Yields:
        ``scrapy.Request`` objects for detail pages (callback
        ``self.parse_detail``, item in ``meta['item']``) and at most one
        request for the next listing page.
    """
    for li in response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li'):
        href = li.xpath('./div/a/@href').extract_first()
        if href is None:
            # Nothing to follow for this entry.
            continue

        item = MovieproItem()
        item['title'] = li.xpath('./div/a/@title').extract_first()

        # Request the detail page; meta hands the item to the callback.
        yield scrapy.Request(
            url='https://www.4567kan.com' + href,
            callback=self.parse_detail,
            meta={'item': item},
        )

    # NOTE(review): self.url and self.page_num are defined outside this
    # method; self.url is assumed to be a %-style template -- confirm.
    if self.page_num < 3:
        new_url = self.url % self.page_num
        self.page_num += 1
        yield scrapy.Request(url=new_url, callback=self.parse)
def parse_detail(self, response):
    """Yield a ``MovieproItem`` built from the page's info panel.

    All values come from ``<div class="col-md-6 infos">``. The heading text is
    used for both ``serial_number`` and ``video_name``. Author names and tags
    are each collapsed into one comma-separated string. The collected values
    are printed to stdout before the item is yielded.

    Args:
        response: Object exposing ``.xpath()`` (presumably a Scrapy response).

    Yields:
        A single ``MovieproItem`` with ``serial_number``, ``video_name``,
        ``author_name``, ``video_tag``, ``publish_date`` and ``video_ticket``.
    """
    heading = response.xpath(
        '//div[@class="col-md-6 infos"]/h2/text()').extract_first()
    authors = response.xpath(
        '//div[@class="col-md-6 infos"]/h4[2]/a/text()').extract()
    tags = response.xpath(
        '//div[@class="col-md-6 infos"]/h4[3]/a/text()').extract()
    published = response.xpath(
        '//div[@class="col-md-6 infos"]/h4[1]/a/text()').extract_first()
    ticket = response.xpath(
        '//div[@class="col-md-6 infos"]/p/span/b/text()').extract_first()

    # Joining a one-element list gives back that element, so a single join
    # covers the empty, single and multiple cases alike.
    joined_authors = ','.join(authors)
    joined_tags = ','.join(tags)

    print(heading, joined_authors, joined_tags, ticket, published)

    record = MovieproItem()
    field_values = (
        ('serial_number', heading),
        ('video_name', heading),
        ('author_name', joined_authors),
        ('video_tag', joined_tags),
        ('publish_date', published),
        ('video_ticket', ticket),
    )
    for field, value in field_values:
        record[field] = value
    yield record
def parse(self, response):
    """Yield detail-page requests for a listing page, then the next page.

    For every ``<li>`` in the listing the link's ``title`` attribute is stored
    as the item's ``name`` and the link's ``href`` is appended to
    ``https://www.4567kan.com`` to form the detail URL.

    Entries without an ``href`` are skipped; previously the string
    concatenation raised ``TypeError`` on ``None`` and aborted the rest of
    the page, including the pagination step.

    After the entries, while ``self.page`` is below 6, the current page
    number is printed, the next listing URL is built from it, the counter is
    incremented, and that page is requested with this same method as
    callback.

    Args:
        response: The listing-page response.

    Yields:
        ``scrapy.Request`` objects for detail pages (callback
        ``self.parse_detail``, item in ``meta['item']``) and at most one
        request for the next listing page.
    """
    # The page counter must not be initialised inside this method: it would
    # be reset on every call and never advance. It lives on the instance.
    for li in response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li'):
        # Detail-page link of this entry.
        href = li.xpath('./div/a/@href').extract_first()
        if href is None:
            # Nothing to follow for this entry.
            continue

        item = MovieproItem()
        item['name'] = li.xpath('./div/a/@title').extract_first()

        # Manually request the detail page. Request passing: the item is
        # handed to the callback through the meta dict.
        yield scrapy.Request(
            'https://www.4567kan.com' + href,
            callback=self.parse_detail,
            meta={'item': item},
        )

    # Pagination. A for loop over page numbers must not be used here, since
    # each response would schedule every page again and recurse endlessly;
    # a single step per call, driven by the instance counter, avoids that.
    if self.page < 6:
        print(self.page)
        new_url = f'https://www.4567kan.com/frim/index7-{self.page}.html'
        self.page += 1
        yield scrapy.Request(new_url, callback=self.parse)