Example #1
0
 def parse_by_selector1(self, response):
     """Parse a movie list page using Scrapy Selectors.

     For each movie entry (div.hd) extract the title and detail-page
     link, then request the detail page, carrying the partially filled
     item along in the request meta.
     """
     print(response.url)
     movies = Selector(response=response).xpath('//div[@class="hd"]')
     for movie in movies:
         # Hoist the link extraction: the original called
         # extract_first() twice on the same selector.
         link = movie.xpath('./a/@href').extract_first()
         movie_item = MoviespiderItem()
         movie_item['title'] = movie.xpath('./a/span/text()').extract_first()
         movie_item['link'] = link
         yield scrapy.Request(url=link,
                              callback=self.parse_detail_by_selector,
                              meta={'item': movie_item})
Example #2
0
 def parse(self, response):
     """Parse a movie list page with BeautifulSoup.

     Extracts the title and detail-page link from every div.hd entry
     and schedules a request for the detail page, carrying the item in
     the request meta.
     """
     soup = BeautifulSoup(response.text, 'html.parser')
     title_list = soup.find_all('div', attrs={'class': 'hd'})
     for entry in title_list:
         movie_item = MoviespiderItem()
         anchor = entry.find('a')
         movie_item['title'] = anchor.find('span').text
         movie_item['link'] = anchor.get('href')
         yield scrapy.Request(url=movie_item['link'],
                              meta={'item': movie_item},
                              callback=self.parse_detail)
Example #3
0
    def parse(self, response):
        """Parse one page of the movie list and follow detail pages.

        Extracts title, link, rating etc. from each div.info row, then
        requests the detail page with the item in the request meta, and
        finally follows the "next page" link to recurse over the list.
        """
        movie_list_group = response.xpath('//div[@class="info"]')
        for movie_list in movie_list_group:
            item = MoviespiderItem()
            item['movie_title'] = movie_list.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            item['movie_other_title'] = movie_list.xpath('div[@class="hd"]/a/span[@class="other"]/text()').extract()
            # BUG FIX: the original used an absolute '//div[@class="hd"]/a/@href'
            # path here, which searches the WHOLE document and therefore
            # returned the first movie's link for every row. A path relative
            # to the current div.info node (like the other fields use) fixes
            # that, and the link only needs to be extracted once.
            link = movie_list.xpath('div[@class="hd"]/a/@href')[0].extract()
            item['movie_link'] = link
            item['movie_director_actor'] = movie_list.xpath('div[@class="bd"]/p[@class=""]/text()').extract()
            item['movie_star'] = movie_list.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0].extract()
            item['movie_quote'] = movie_list.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            yield scrapy.Request(link, callback=self.parse_movie_list_detail, dont_filter=True, meta={'item': item})

        # Pagination: follow the "next" link until there is none.
        next_page = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_page:
            yield scrapy.Request(self.main_url + next_page[0], callback=self.parse)
Example #4
0
 def parse_item(self, response):
     """Extract a single article page into a MoviespiderItem.

     Pulls the title, poster image, timestamps, category, body and the
     last h2 (the download/pan section) out of the article container.
     """
     article = response.css('div.article_container')
     item = MoviespiderItem()
     item['name'] = article.css('h1::text').extract_first()
     item['image'] = article.css('.context img::attr(src)').extract_first()
     item['link'] = response.url
     item['ctime'] = article.css('.article_info .info_date::text').extract_first()
     item['category'] = article.css(
         '.article_info .info_category a::text').extract_first()
     item['description'] = article.css('div[id=post_content]').extract_first()
     # The pan/download block is the last h2 in the page context.
     item['pan'] = response.css('.context h2').extract()[-1]
     yield item
Example #5
0
    def parselist(self, response):
        """Parse the movie list page.

        For every <li> in the list box, build an item holding the movie
        name, type and detail-page URL, then request the detail page
        with the item attached to the request meta.
        """
        li_list = response.xpath('//div[@class="listBox"]//ul/li')

        for li in li_list:
            # One item per movie.
            item = MoviespiderItem()

            # Movie name.
            name = li.xpath('.//h3/a/text()').extract_first("")
            # Movie type, formatted like "Label:value" -- keep the value.
            # Renamed from `type` to avoid shadowing the builtin.
            movie_type = li.xpath(".//p[2]/text()").extract_first("")
            # Detail-page URL.
            href = li.xpath('.//h3/a/@href').extract_first("")

            item["name"] = name
            item["type"] = movie_type.split(":")[1]
            item["href"] = href
            yield scrapy.Request(url=href,
                                 callback=self.parsedetail,
                                 meta={"item": item})
Example #6
0
    def parse(self, response):
        """Parse the movie index page.

        Each <table> under div.co_content8 holds one movie. Extract the
        title, date and summary, then follow the detail-page link to
        parse_next, carrying the item in the request meta.
        """
        # All movie entries on the index page.
        movie_list = response.xpath("//div[@class='co_content8']//table")
        for movie in movie_list:
            item = MoviespiderItem()
            # First-level (index page) fields.
            item["title"] = movie.xpath(".//a/text()").extract_first()
            item['date'] = movie.xpath(".//font/text()").extract_first()
            item['info'] = movie.xpath(
                ".//tr[last()]/td/text()").extract_first()

            # ROBUSTNESS FIX: extract_first() returns None when a table
            # has no link; the original crashed concatenating str + None.
            href = movie.xpath(".//a/@href").extract_first()
            if not href:
                continue
            next_url = "http://www.dytt8.net" + href
            # The detail data lives on a second-level page, so schedule
            # another download for it.
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_next,
                                 meta={"movie_item": item})
Example #7
0
    def parse_1(self, response):
        """Parse one movie detail page into a MoviespiderItem.

        The site puts all film metadata (director, cast, dates, ...) in
        one <p> tag -- usually the first one inside the article, but
        sometimes the second -- and the <p> tags that follow it hold
        the synopsis.
        """
        item = MoviespiderItem()
        article = "//div[@class='article-container post clearfix']/article"

        # Metadata normally sits in p[1]; if p[1] has no text, it is in
        # p[2]. The original computed the same XPath twice in the else
        # branch -- reuse the first result instead.
        tag_p = response.xpath(article + "/p[1]/text()").extract()
        info_index = 1
        if not tag_p:
            info_index = 2
            tag_p = response.xpath(article + "/p[2]/text()").extract()
        # Synopsis: every <p> after the metadata paragraph.
        intro_list = response.xpath(
            article + "/p[%d]/following-sibling::p/text()" % info_index
        ).extract()

        # Map the label in front of the colon to the item field; this
        # replaces the original 12-branch if/elif chain.
        field_map = {
            '导演': 'director',
            '编剧': 'writer',
            '主演': 'actor',
            '制片国家/地区': 'area',
            '语言': 'language',
            '又名': 'other_name',
            '上映日期': 'release_date',
            '片长': 'lenght',  # field name kept as declared on the item
            '首播': 'debut',
            '集数': 'cd',
            '单集片长': 'min',
            '类型': 'type',
        }
        for line in tag_p:
            # BUG FIX: split only on the FIRST colon so values that
            # themselves contain a colon are not truncated, and skip
            # colon-less lines instead of raising IndexError.
            parts = line.split(':', 1)
            if len(parts) != 2:
                continue
            field = field_map.get(parts[0].strip())
            if field:
                item[field] = parts[1].strip()

        # Title.
        title = response.xpath(
            "//div[@class='article-details']/h1/text()").extract()[0]
        # Rating: take the 3 characters after the opening parenthesis,
        # e.g. "... (7.5 ..." -> "7.5".
        grade = response.xpath(
            "//div[@class='post-ratings']/text()"
        ).extract()[1].split('(')[1].strip()[0:3]
        # Poster images (two of them), joined into one comma string.
        pic_list = response.xpath(article + "/p/a[1]/@href").extract()

        item['title'] = title
        item['intro'] = ','.join(intro_list)
        item['pic'] = ','.join(pic_list)
        item['grade'] = grade
        yield item