Example #1
 def parse_detail(self, response):
     item = MovieproItem()
     item['name'] = response.xpath(
         '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
     # item['describe'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[3]/text()').extract_first()
     # item['describe'] = ''.join(item['describe'])
     yield item
Example #2
 def parse_detail(self, response):
     item = MovieproItem()
     item['name'] = response.xpath(
         '/html/body/div[3]/div[1]/div[2]/dl/dt[1]/text()'
     ).extract_first()
     print(item['name'])
     yield item
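The snippets on this page all assume a MovieproItem declared in the project's items.py, which is not shown here. A minimal sketch with the field names used in these examples (the exact set varies from example to example, so treat it as illustrative) might look like:

    # items.py -- illustrative sketch; declare only the fields your spider assigns
    import scrapy

    class MovieproItem(scrapy.Item):
        name = scrapy.Field()
        desc = scrapy.Field()
        m_type = scrapy.Field()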
Example #3
 def parse_detail(self, response):
     name = response.xpath(
         '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
     desc = response.xpath(
         '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
     ).extract()
     desc = ''.join(desc)
     item = MovieproItem()
     item['name'] = name
     item['desc'] = desc
     yield item
Example #4
    def parse_detail(self, response):
        name = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        m_type = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[1]/a[1]/text()'
        ).extract_first()
        item = MovieproItem()
        item['name'] = name
        item['m_type'] = m_type

        yield item
Example #5
 def parse(self, response):
     li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
     for li in li_list:
         item = MovieproItem()
         name = li.xpath('./div/a/@title').extract_first()
         detail_url = 'https://www.4567tv.tv' + li.xpath(
             './div/a/@href').extract_first()
         item['name'] = name
         # meta parameter: request passing -- the meta dict is handed to the callback via response.meta
         yield scrapy.Request(url=detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
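The parse_detail callback referenced above is not included in this snippet. A minimal sketch, assuming the item also declares a desc field, would read the partially filled item back out of response.meta and complete it:

    def parse_detail(self, response):
        # retrieve the item that parse() attached to the request via meta
        item = response.meta['item']
        item['desc'] = ''.join(response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()').extract())
        yield item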
Example #6
    def parse(self, response):
        div_list = response.xpath(
            '//div[@class="container"]/div[@class="row"]/div[@class="col-md-2 col-sm-6 col-xs-6 movie-item-out"]'
        )
        for div in div_list:
            title = div.xpath('./div/a/@title').extract_first()
            time = div.xpath('.//em/strong/span/text()').extract_first()
            url = 'http://www.id97.vip' + div.xpath(
                './div/a/@href').extract_first()

            item = MovieproItem()

            yield scrapy.Request(url=url,
                                 callback=self.parseBySecondPage,
                                 meta={'item': item})
Example #7
    def parse(self, response):
        div_list = response.xpath('//div[@class="col-xs-1-5 movie-item"]')
        for div in div_list:
            item = MovieproItem()
            item['name'] = div.xpath(
                './/div[@class="meta"]/h1/a/text()').extract_first()
            item['score'] = div.xpath(
                './/div[@class="meta"]/h1/em/text()').extract_first()
            if item['score'] is None:
                item['score'] = '0'
            detail_url = 'https:' + div.xpath(
                './/div[@class="meta"]/h1/a/@href').extract_first()

            # send a request to the detail-page url
            # use the meta parameter to pass the item to the callback
            yield scrapy.Request(url=detail_url,
                                 callback=self.getDetailPage,
                                 meta={'item': item})
Example #8
    def parse(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567kan.com' + li.xpath(
                './div/a/@href').extract_first()
            item = MovieproItem()
            item['title'] = title

            # send a request to the detail-page url
            # meta: the meta dict is passed along to the callback
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num < 3:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
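This snippet references self.url and self.page_num without showing where they are defined. A hedged sketch of the surrounding spider class, with an assumed page-URL template modelled on the other examples for the same site, could look like:

    # hypothetical class header for the parse() method above
    import scrapy
    from moviePro.items import MovieproItem  # assumed project/module name

    class MovieSpider(scrapy.Spider):
        name = 'movie'
        start_urls = ['https://www.4567kan.com/frim/index1.html']  # assumed start page
        url = 'https://www.4567kan.com/frim/index1-%d.html'        # assumed template used by the pagination branch
        page_num = 2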
Example #9
    def parse_detail(self, response):
        serial_number = response.xpath('//div[@class="col-md-6 infos"]/h2/text()').extract_first()
        video_name = serial_number
        author_name = response.xpath('//div[@class="col-md-6 infos"]/h4[2]/a/text()').extract()
        author_name = author_name[0] if len(author_name) == 1 else ','.join(author_name)
        video_tag = response.xpath('//div[@class="col-md-6 infos"]/h4[3]/a/text()').extract()
        video_tag = video_tag[0] if len(video_tag) == 1 else ','.join(video_tag)
        publish_date = response.xpath('//div[@class="col-md-6 infos"]/h4[1]/a/text()').extract_first()
        video_ticket = response.xpath('//div[@class="col-md-6 infos"]/p/span/b/text()').extract_first()

        print(serial_number, author_name, video_tag, video_ticket, publish_date)

        item = MovieproItem()
        item['serial_number'] = serial_number
        item['video_name'] = video_name
        item['author_name'] = author_name
        item['video_tag'] = video_tag
        item['publish_date'] = publish_date
        item['video_ticket'] = video_ticket

        yield item
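Items yielded like this are handed to whatever item pipelines the project enables; none are shown on this page. A minimal pipeline sketch that simply appends each record to a JSON-lines file, assuming it is registered in ITEM_PIPELINES in settings.py, could be:

    # pipelines.py -- illustrative only
    import json

    class MovieproPipeline:
        def open_spider(self, spider):
            self.fp = open('movies.jsonl', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            self.fp.close()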
Example #10
 def parse(self, response):
     # page must not be set here: it would be reset to the same value on
     # every call to parse() and would never increase!
     # self.page = 2
     li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
     for li in li_list:
         name = li.xpath('./div/a/@title').extract_first()
         item = MovieproItem()
         item['name'] = name
         # get the detail-page url
         detail_url = 'https://www.4567kan.com' + li.xpath(
             './div/a/@href').extract_first()
         # manually send a request to the detail-page url
         # request passing: let the Request carry the item (a dict-like object) to the callback
         yield scrapy.Request(detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
     # paginated crawling: don't use a for loop here, it would recurse endlessly
     # for i in range(2,6):
     if self.page < 6:
         print(self.page)
         new_url = f'https://www.4567kan.com/frim/index7-{self.page}.html'
         self.page += 1
         yield scrapy.Request(new_url, callback=self.parse)