Example #1
def parse(self, response):
    # Extract the detail-page link and title of every book on the listing page
    urls = response.xpath("//h2[@class='block-title']/a/@href").extract()
    names = response.xpath("//h2[@class='block-title']/a/text()").extract()
    for url, name in zip(urls, names):
        item = BookItem()
        item['book_url'] = url
        item['book_name'] = name
        yield item
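
Every snippet in this listing is a parse callback cut out of its spider class. As context, here is a minimal skeleton that Example #1's method would plug into; the spider name, start URL, and import path are placeholders, not taken from the source:

import scrapy
from myproject.items import BookItem  # import path is an assumption

class BookSpider(scrapy.Spider):
    name = 'books'                              # placeholder
    start_urls = ['https://example.com/books']  # placeholder listing URL

    def parse(self, response):
        # Body of Example #1
        urls = response.xpath("//h2[@class='block-title']/a/@href").extract()
        names = response.xpath("//h2[@class='block-title']/a/text()").extract()
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = url
            item['book_name'] = name
            yield item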
Example #2
def parse(self, response):
    try:
        item = BookItem()
        item['book_url'] = response.url
        # The title lives in an <h3> with this site's specific class list
        item['book_name'] = response.xpath(
            "//h3[@class='article_detail_title article_detail_5_title bf ic']/text()"
        ).extract_first()
        yield item
    except Exception as e:
        print(e)
        return
Example #3
def parse(self, response):
    try:
        urls = response.xpath("//h3[@class='dytit']/a/@href").extract()
        names = response.xpath("//h3[@class='dytit']/a/text()").extract()
        # Pair each link with its title instead of indexing by position
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = url
            item['book_name'] = name
            yield item
    except Exception as e:
        print(e)
        return
Example #4
def douban_parse(self, response):
    # Parse a single Douban book detail page
    item = BookItem()
    item['book_url'] = response.url
    item['book_name'] = response.xpath("//h1/span/text()").extract_first()
    item['book_author'] = response.css('#info a::text').extract_first()
    # The trailing space inside the class value matches the site's markup
    book_score = response.xpath(
        "//strong[@class='ll rating_num ']/text()").extract_first()
    if book_score:
        item['book_score'] = book_score.strip()
    # item['book_desc'] = '\n'.join(response.css('#link-report p::text').extract())
    item['book_image'] = response.css(
        '#mainpic img::attr(src)').extract_first()
    return item
Example #5
def parse_item(self, response):
    try:
        item = BookItem()
        item['book_url'] = response.url
        # convert(...) normalizes the extracted text to simplified Chinese
        item['book_name'] = convert(
            response.xpath('//div/b/text()').extract()[0], 'zh-cn')
        item['book_desc'] = convert(
            response.xpath('//*[@id="desc_text"]/text()').extract()[0].strip(),
            'zh-cn')
        yield item
    except Exception as e:
        print(e)
        return
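
The convert helper in Example #5 is never defined in the snippet. Its signature matches convert(text, locale) from the zhconv package, which maps text between Chinese variants, so the missing import was presumably the following (an assumption, not shown in the source):

from zhconv import convert  # assumed origin of convert(); not shown in the snippet

convert('數據', 'zh-cn')  # -> '数据': traditional characters normalized to simplified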
Example #6
def parse(self, response):
    # 'https://example.com/path' -> netloc 'example.com', base 'https://example.com'
    domain = response.url.split('/', 3)[2]
    httpDomain = '/'.join(response.url.split('/', 3)[:3])
    ic(response.url, domain, httpDomain)  # icecream debug output
    # Look up the XPath expressions configured for this domain
    urls = response.xpath(self.xpath[domain]["urlsXpath"]).extract()
    names = response.xpath(self.xpath[domain]["namesXpath"]).extract()
    ic(len(urls))
    # Turn root-relative links into absolute URLs
    if len(urls) > 0 and urls[0].startswith('/'):
        urls = [httpDomain + i for i in urls]
    for url, name in zip(urls, names):
        item = BookItem()
        item['book_url'] = url
        item['book_name'] = name
        yield item
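
Example #6 drives one parse method across several sites: ic is the debug printer from the icecream package (from icecream import ic), and self.xpath is a per-domain table of XPath expressions that the snippet does not show. A minimal sketch of the shape that table would need; the domain key and expressions are placeholders:

# Hypothetical per-domain XPath table; keys must match response netlocs
xpath = {
    'www.example.com': {
        'urlsXpath': "//h2[@class='block-title']/a/@href",
        'namesXpath': "//h2[@class='block-title']/a/text()",
    },
}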
Example #7
def parse(self, response):
    try:
        # Follow every pagination link with this same callback
        pages = response.xpath("//div[@id='pagenavi']/a/@href").extract()
        for page in pages:
            yield scrapy.Request(url=page, callback=self.parse)
        urls = response.xpath("//dl[@id='plist']/dt/a/@href").extract()
        names = response.xpath("//dl[@id='plist']/dt/a/text()").extract()
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = url
            item['book_name'] = name
            yield item
    except Exception as e:
        print(e)
        return
Example #8
def parse(self, response):
    try:
        # Pagination links are site-relative, so prepend the domain
        pages = response.xpath("//a[@class='fed-btns-info fed-rims-info']/@href").extract()
        for page in pages:
            yield scrapy.Request(url=f'https://www.itsck.com{page}', callback=self.parse)
        urls = response.xpath("//a[@class='fed-list-title fed-font-xiv fed-text-center "
                              "fed-text-sm-left fed-visible fed-part-eone']/@href").extract()
        names = response.xpath("//a[@class='fed-list-title fed-font-xiv fed-text-center "
                               "fed-text-sm-left fed-visible fed-part-eone']/text()").extract()
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = f'https://www.itsck.com{url}'
            item['book_name'] = name
            yield item
    except Exception as e:
        print(e)
        return
Example #9
def parse(self, response):
    try:
        pages = response.xpath("//div[@class='pages']/a/@href").extract()
        for page in pages:
            yield scrapy.Request(url=f'https://www.pianku.tv{page}',
                                 callback=self.parse)
        urls = response.xpath("//div[@class='li-bottom']/h3/a/@href").extract()
        names = response.xpath(
            "//div[@class='li-bottom']/h3/a/text()").extract()
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = f'https://www.pianku.tv{url}'
            item['book_name'] = name
            yield item
    except Exception as e:
        print(e)
        return
Example #10
def parse(self, response):
    try:
        pages = response.xpath("//ul[@class='pagelist']/li/a/@href").extract()
        for page in pages:
            if response.url.endswith('html'):
                # On a page like .../list_23_2.html, replace the last path segment
                yield scrapy.Request(url=f'{response.url.rsplit("/", 1)[0]}/{page}', callback=self.parse)
            else:
                # Otherwise assume a directory-style URL and append the page name
                yield scrapy.Request(url=f'{response.url}{page}', callback=self.parse)
        urls = response.xpath("//div[@class='listbox']/ul[@class='e2']/li/a[@class='title']/@href").extract()
        names = response.xpath("//div[@class='listbox']/ul[@class='e2']/li/a[@class='title']/text()").extract()
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = f'http://www.java1234.com{url}'
            item['book_name'] = name
            yield item
    except Exception as e:
        print(e)
        return
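
Examples #8 through #10 join relative hrefs to a hard-coded domain by hand. Scrapy's response objects can do that resolution themselves, which also copes with page-relative links; a sketch of the same pagination loop using the built-in helpers:

for href in response.xpath("//ul[@class='pagelist']/li/a/@href").extract():
    # response.urljoin() resolves the href against the current page URL
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)

# response.follow() combines the join and the Request in one call,
# and accepts selectors directly as well as strings
for href in response.xpath("//ul[@class='pagelist']/li/a/@href"):
    yield response.follow(href, callback=self.parse)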
Example #11
def parse(self, response):
    try:
        pages = response.xpath(
            "//div[@class='pagination pagination-multi']/ul/li/a/@href").extract()
        for page in pages:
            yield scrapy.Request(url=f'https://dvdhd.me{page}',
                                 callback=self.parse)
        urls = response.xpath(
            "//div[@class='m-movies clearfix']/article[@class='u-movie']/a/@href"
        ).extract()
        names = response.xpath(
            "//div[@class='m-movies clearfix']/article[@class='u-movie']/a/h2/text()"
        ).extract()
        for url, name in zip(urls, names):
            item = BookItem()
            item['book_url'] = f'https://dvdhd.me{url}'
            item['book_name'] = name
            yield item
    except Exception as e:
        print(e)
        return
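
All eleven examples populate a BookItem whose definition never appears in this listing. A minimal sketch for the project's items.py covering every field the snippets assign; the layout is an assumption reconstructed from usage:

import scrapy

class BookItem(scrapy.Item):
    book_url = scrapy.Field()     # set by every example
    book_name = scrapy.Field()    # set by every example
    book_author = scrapy.Field()  # Example #4
    book_score = scrapy.Field()   # Example #4
    book_image = scrapy.Field()   # Example #4
    book_desc = scrapy.Field()    # Examples #4 (commented out) and #5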