Example #1
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Every entry of the update panel that contains a link
        produces a request for the manga page, handled by
        `parse_collection`.  Entries without a link are skipped.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today; it is not used to
        filter the entries (see the TODO below).

        @url http://kissmanga.com/
        @returns items 0 0
        @returns request 5 10
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # XXX TODO - we ignore the `until` date, and make a full parse
        # of the initial scroll panel (that contain old entries)
        xp = '//div[@class="items"]/div'
        for update in response.xpath(xp):
            # URL
            xp = './/a/@href'
            url = update.xpath(xp).extract_first()
            if not url:
                # Without a link there is nothing to request; skip
                # the entry instead of failing on an empty result.
                continue

            manga = Manga()
            # Name
            xp = './/a/text()'
            manga['name'] = update.xpath(xp).extract()
            manga['url'] = response.urljoin(url)

            # Parse the manga issues list
            request = scrapy.Request(manga['url'],
                                     self.parse_collection,
                                     meta={'manga': manga})
            yield request
Example #2
0
 def _parse_latest(self, response):
     """Follow the first `list-link` anchor of the page.

     The href is made absolute and stored in a new `Manga` item,
     that travels in the meta of the returned request.  The request
     is handled by `parse_collection`.
     """
     href = response.xpath('//a[@class="list-link"]/@href').extract_first()
     target = response.urljoin(href)
     return response.follow(target, self.parse_collection,
                            meta={'manga': Manga(url=target)})
Example #3
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Each timeline entry that contains a link produces a request
        for the manga page, handled by `parse_collection`.  Entries
        without a link are skipped.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today; it is not used to
        filter the entries.

        @url http://submanga.org
        @returns items 0 0
        @returns request 5 10
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        xp = '//div[@class="timeline-entry"]'
        for update in response.xpath(xp):
            manga = Manga()
            # Name
            xp = './/span[@class="text-mint"]/text()'
            manga['name'] = update.xpath(xp).extract()
            # URL
            xp = './/a/@href'
            manga['url'] = update.xpath(xp).extract()
            if not manga['url']:
                # No link in this entry: indexing the empty list
                # below would raise IndexError, so skip it.
                continue

            # There is not link to the issue, nor date.  We parse the
            # full manga.
            meta = {'manga': manga}
            # `urljoin` keeps absolute URLs untouched and resolves the
            # relative ones, that `scrapy.Request` does not accept.
            request = scrapy.Request(response.urljoin(manga['url'][0]),
                                     self.parse_collection,
                                     meta=meta)
            yield request
Example #4
0
    def parse_catalog(self, response):
        """Walk a catalog page and request every manga found on it.

        One `parse_collection` request is yielded per result entry,
        carrying a `Manga` item with the URL, the rank and the rank
        order.  When the page has a ">" pagination link, a request
        for the next catalog page is yielded too.

        @url https://www.mangareader.net/popular/3660
        @returns items 0
        @returns request 30 40
        """

        url_xp = './/div[@class="manga_name"]//a/@href'
        rank_xp = './/div[@class="c1"]/text()'
        for entry in response.xpath('//div[@class="mangaresultitem"]'):
            manga = Manga()
            manga['url'] = response.urljoin(
                entry.xpath(url_xp).extract_first())
            manga['rank'] = entry.xpath(rank_xp).re(r'(\d+).')
            manga['rank_order'] = 'ASC'
            yield response.follow(manga['url'],
                                  self.parse_collection,
                                  meta={'manga': manga})

        # Pagination
        pager_xp = '//div[@id="sp"]/a[contains(., ">")]/@href'
        following = response.xpath(pager_xp).extract_first()
        if not following:
            return
        yield response.follow(following, self.parse_catalog)
Example #5
0
    def parse_latest(self, response, until=None):
        """Generate the list of new manga until a date

        Yield a `parse_collection` request for every series linked
        in the page.  The last date found in the page (expected to
        be the oldest update) is then compared with `until`; a page
        without dates ends the parse without raising.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today.

        @url http://mangaseeonline.us/
        @returns items 0
        @returns request 25 100
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`
        xp = '//a[@class="latestSeries"]/@href'
        for url in response.xpath(xp).extract():
            url = response.urljoin(url)
            manga = Manga(url=url)
            meta = {'manga': manga}
            request = scrapy.Request(url, self.parse_collection, meta=meta)
            yield request

        # Check the oldest update date
        xp = '//time[@class="timeago"]/@datetime'
        update_dates = response.xpath(xp).extract()
        if not update_dates:
            # Nothing to compare: `[-1]` on an empty list would raise
            # IndexError.
            return
        update_date = convert_to_date(update_dates[-1].strip())
        # No next-page request follows in this method, so this check
        # only marks the point where the parse stops.
        if update_date < until:
            return
Example #6
0
    def parse_catalog(self, response):
        """Walk a page of the manga listing table.

        The first cell of every row of the listing produces a
        `parse_collection` request with a `Manga` item in the meta.
        When the pager has a "Next" link, the next page is requested
        and handled again by this method.

        @url http://kissmanga.com/MangaList?page=200
        @returns items 0
        @returns request 25 60
        """

        for cell in response.xpath('//table[@class="listing"]/tr/td[1]'):
            manga = Manga()
            href = cell.xpath('a/@href').extract_first()
            manga['url'] = response.urljoin(href)
            yield scrapy.Request(manga['url'],
                                 self.parse_collection,
                                 meta={'manga': manga})

        # Pagination
        pager_xp = '//ul[@class="pager"]/li/a[contains(., "Next")]/@href'
        following = response.xpath(pager_xp).extract_first()
        if following:
            yield scrapy.Request(response.urljoin(following),
                                 self.parse_catalog)
Example #7
0
    def parse_catalog(self, response):
        """Generate the catalog (list of mangas) of the site.

        Yield one `parse_collection` request per manga block that
        has a link, carrying a `Manga` item with the URL, the rank
        and the rank order.  When the pagination has a "Next" link,
        a request for the next catalog page is yielded too.

        @url http://unionmangas.net/mangas/a-z/10
        @returns items 0
        @returns request 1 50
        """

        xp = '//div[contains(@class, "bloco-manga")]'
        for item in response.xpath(xp):
            # URL
            xp = 'a[2]/@href'
            url = item.xpath(xp).extract_first()
            if not url:
                # `scrapy.Request` cannot be built without an URL
                continue

            manga = Manga()
            # Absolute URLs are kept untouched by `urljoin`; relative
            # ones are resolved, as `scrapy.Request` needs a scheme.
            manga['url'] = response.urljoin(url)
            # Rank
            xp = 'div[@style="display: none"]/text()'
            manga['rank'] = item.xpath(xp).re(r'([\d.]+) views')
            # Rank order
            manga['rank_order'] = 'DESC'
            meta = {'manga': manga}
            request = scrapy.Request(manga['url'], self.parse_collection,
                                     meta=meta)
            yield request

        # Next page
        xp = '//ul[@class="pagination"]/li/a[contains(., "Next")]/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, self.parse_catalog)
Example #8
0
    def parse_catalog(self, response):
        """Generate the catalog (list of mangas) of the site.

        Yield one `parse_collection` request per entry of the
        directory list, carrying a `Manga` item with the URL, the
        rank and the rank order.  When there is a "next" link, a
        request for the next directory page is yielded too.

        @url http://mangafox.me/directory/
        @returns items 0
        @returns request 30 45
        """

        xp = '//ul[@class="list"]/li'
        for item in response.xpath(xp):
            manga = Manga()
            # URL
            xp = './/a[@class="title"]/@href'
            manga['url'] = item.xpath(xp).extract_first()
            # Rank
            xp = './/p[@class="info"]/label/text()'
            # Raw string: `\d` is an invalid escape sequence in a
            # plain string literal.
            manga['rank'] = item.xpath(xp).re(r'(\d+)')
            # Rank order
            manga['rank_order'] = 'ASC'
            meta = {'manga': manga}
            request = scrapy.Request(manga['url'],
                                     self.parse_collection,
                                     meta=meta)
            yield request

        # Next page
        xp = '//a[span[@class="next"]]/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            next_url = urljoin(response.url, next_url)
            yield scrapy.Request(next_url, self.parse_catalog)
Example #9
0
    def parse_catalog(self, response):
        """Walk a catalog page, ranking the mangas by position.

        The rank of a manga is its position in the whole catalog,
        computed from the `page` query parameter of the URL (zero
        when missing) and the position inside the page.  One
        `parse_collection` request is yielded per manga, plus a
        request for the next page when the pagination links to it.

        @url http://submanga.org/mangas
        @returns items 0
        @returns request 1 37
        """

        # Page number, taken from the query string of the URL
        query = urlparse.parse_qs(urlparse.urlparse(response.url).query)
        if 'page' in query:
            page = int(query['page'][0])
        else:
            page = 0

        # Rank of the first manga of this page (presumably a page
        # holds 6 x 6 mangas - TODO confirm)
        first_rank = page * 6 * 6 + 1

        entries = response.xpath('//div[contains(@class, "item_manga")]')
        for position, entry in enumerate(entries):
            manga = Manga()
            manga['url'] = entry.xpath('a/@href').extract_first()
            manga['rank'] = first_rank + position
            manga['rank_order'] = 'ASC'
            yield scrapy.Request(manga['url'],
                                 self.parse_collection,
                                 meta={'manga': manga})

        # Pagination
        pager_xp = '//ul[@class="pagination"]/li/a[@rel="next"]/@href'
        following = response.xpath(pager_xp).extract_first()
        if following:
            yield scrapy.Request(response.urljoin(following),
                                 self.parse_catalog)
Example #10
0
    def _parse_subscribe(self, response):
        """Set the manga rank from the `numSubscribe` element.

        The manga is taken from `response.meta` when present.  The
        value of the `alt` attribute of the element is stored as is
        in `rank`, and the manga is returned.
        """
        # Building the manga from the response is not correct at
        # all, but we can use this to allow the testing for this
        # contract
        manga = (response.meta['manga'] if 'manga' in response.meta
                 else Manga(url=response.url))

        rank_xp = '//span[@id="numSubscribe"]/@alt'
        manga['rank'] = response.xpath(rank_xp).extract_first()
        return manga
Example #11
0
    def parse_catalog(self, response):
        """Walk a page of the search results and request the mangas.

        One `parse_collection` request is yielded per result row,
        carrying a `Manga` item with the URL, a computed rank and
        the rank order.  When the "show more" row is present, the
        next page of results is requested too.

        @url http://bato.to/search_ajax?p=200
        @returns items 0
        @returns request 30 40
        """

        def first_or(values, fallback=0.0):
            """Return the first element of `values`, or `fallback`."""
            if values:
                return values[0]
            return fallback

        for row in response.xpath('//tr[not(@class) and not(@id)]'):
            manga = Manga()
            manga['url'] = row.xpath('./td[1]/strong/a/@href').extract_first()

            # In Batoto there is not rank, but a combination of
            # rating, viewers and followers.
            stars = row.xpath('./td[3]/div/@title').re(r'([.\d]+)/5')
            rating = float(first_or(stars))
            viewers = convert_to_number(
                row.xpath('./td[4]/text()').extract_first())
            followers = convert_to_number(
                row.xpath('./td[5]/text()').extract_first())
            manga['rank'] = (rating + 0.1) * viewers * followers
            manga['rank_order'] = 'DESC'

            # URL Hack to avoid a redirection. This is used because
            # the download_delay is also added to the redirector.
            #
            # This makes the spider a bit faster, but we still need
            # to update the real URL in the `parse_collection` side.
            slug = manga['url'].rpartition('_/')[2]
            target = 'http://bato.to/comic/_/comics/%s' % slug
            # The same URL goes in the Item, to avoid duplicates.
            manga['url'] = target
            yield scrapy.Request(target, self.parse_collection,
                                 meta={'manga': manga})

        # Pagination: the page number is an argument of the handler
        onclick = response.xpath('//tr[@id="show_more_row"]/td/input/@onclick')
        found = onclick.re(r'.*, (\d+)\)')
        if found:
            next_url = AJAX_SEARCH % (int(found[0]) + 1)
            yield scrapy.Request(next_url, self.parse_catalog)
Example #12
0
    def parse_catalog(self, response):
        """Generate the catalog (list of manga) of the site.

        Yield one `parse_collection` request per `ttip` link of the
        page, carrying a `Manga` item with the absolute URL.  No
        pagination is followed by this method.

        The bounds of a `returns` contract are two integers
        separated by spaces (minimum, then maximum).

        @url http://mangaseeonline.us/directory/
        @returns items 0
        @returns request 3500 4500
        """

        xp = '//a[@class="ttip"]/@href'
        for url in response.xpath(xp).extract():
            manga = Manga()
            # URL
            manga['url'] = response.urljoin(url)
            meta = {'manga': manga}
            yield response.follow(manga['url'],
                                  self.parse_collection,
                                  meta=meta)
Example #13
0
    def parse_catalog(self, response):
        """Request every manga linked in the manga list page.

        One `parse_collection` request is yielded per `manga_info`
        anchor, carrying a `Manga` item with the absolute URL.

        @url http://www.mangahere.cc/mangalist/
        @returns items 0
        @returns request 18000 22000
        """

        for anchor in response.xpath('//a[@class="manga_info"]'):
            manga = Manga()
            href = anchor.xpath('./@href').extract_first()
            manga['url'] = response.urljoin(href)
            yield response.follow(manga['url'], self.parse_collection,
                                  meta={'manga': manga})
Example #14
0
    def parse_latest(self, response, until=None):
        """Request every manga listed in the latest updates page.

        One `parse_collection` request is yielded per `link-titulo`
        anchor.  The `until` date is resolved from the argument,
        then from `response.meta`, then defaults to today; it is
        not used to filter the links.

        @url http://unionmangas.site/
        @returns items 0
        @returns request 10 100
        """

        if not until:
            until = (response.meta['until'] if 'until' in response.meta
                     else date.today())

        # All the mangas of the page are updated via
        # `parse_collection`
        for href in response.xpath('//a[@class="link-titulo"]/@href').extract():
            yield response.follow(href, self.parse_collection,
                                  meta={'manga': Manga(url=href)})
Example #15
0
    def parse_latest(self, response, until=None):
        """Request every manga linked in the update timeline.

        One `parse_collection` request is yielded per link found
        inside a timeline entry.  The `until` date is resolved from
        the argument, then from `response.meta`, then defaults to
        today; it is not used to filter the links.

        @url http://submanga.org
        @returns items 0
        @returns request 5 10
        """

        if not until:
            until = (response.meta['until'] if 'until' in response.meta
                     else date.today())

        # All the mangas of the page are updated via
        # `parse_collection`
        links_xp = '//div[@class="timeline-entry"]//a/@href'
        for href in response.xpath(links_xp).extract():
            yield scrapy.Request(href, self.parse_collection,
                                 meta={'manga': Manga(url=href)})
Example #16
0
    def parse_catalog(self, response):
        """Request every manga linked in the manga list page.

        One `parse_collection` request is yielded per `manga_info`
        anchor, carrying a `Manga` item with the name (text nodes of
        the anchor) and the absolute URL.

        @url http://www.mangahere.co/mangalist/
        @returns items 0 0
        @returns request 15000 20000
        """

        for anchor in response.xpath('//a[@class="manga_info"]'):
            manga = Manga()
            manga['name'] = anchor.xpath('./text()').extract()
            hrefs = anchor.xpath('./@href').extract()
            manga['url'] = response.urljoin(hrefs[0])
            yield scrapy.Request(manga['url'],
                                 self.parse_collection,
                                 meta={'manga': manga})
Example #17
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Yield a `parse_collection` request for every manga linked
        in the page.  The next page is requested (handled again by
        this method, with `until` in the meta) only while the last
        date of the page is not older than `until`.  A page without
        dates stops the pagination without raising.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today.

        @url http://www.mangahere.cc/latest/
        @returns items 0
        @returns request 25 200
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`
        xp = '//a[@class="manga_info"]/@href'
        for url in response.xpath(xp).extract():
            url = response.urljoin(url)
            manga = Manga(url=url)
            meta = {'manga': manga}
            request = scrapy.Request(url, self.parse_collection, meta=meta)
            yield request

        # Check the oldest update date
        xp = '//span[@class="time"]/text()'
        update_dates = response.xpath(xp).extract()
        if not update_dates:
            # Nothing to compare: `[-1]` on an empty list would raise
            # IndexError.
            return
        update_date = convert_to_date(update_dates[-1])
        if update_date < until:
            return

        # Next page
        xp = '//a[@class="next"]/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
Example #18
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Yield a `parse_collection` request for every manga linked
        in the page.  The next page is requested (handled again by
        this method, with `until` in the meta) only while the last
        date of the page is not older than `until`.  A page without
        dates stops the pagination without raising.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today.

        @url https://mangadex.org/
        @returns items 0
        @returns request 120
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`
        xp = '//a[@class="manga_title"]/@href'
        for url in response.xpath(xp).extract():
            url = response.urljoin(url)
            manga = Manga(url=url)
            meta = {'manga': manga}
            request = response.follow(url, self.parse_collection, meta=meta)
            yield request

        # Check the oldest update date
        xp = '//time/@datetime'
        update_dates = response.xpath(xp).extract()
        if not update_dates:
            # Nothing to compare: `[-1]` on an empty list would raise
            # IndexError.
            return
        update_date = convert_to_date(update_dates[-1].strip())
        if update_date < until:
            return

        # Next page
        xp = '//ul[@class="pagination"]/li[@class="active"]' \
            '/following-sibling::li[@class="paging"]/a/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            # The next page is another page of updates: it is parsed
            # by this same method and only needs the `until` date,
            # not the meta of the last manga of this page.
            meta = {'until': until}
            yield response.follow(next_url, self.parse_latest, meta=meta)
Example #19
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Yield a `parse_collection` request for every manga linked
        in the page.  The "Older Releases" page is requested
        (handled again by this method, with `until` in the meta)
        only while the last date of the page is not older than
        `until`.  A page without dates stops the pagination without
        raising.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today.

        @url http://bato.to
        @returns items 0
        @returns request 25 100
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`
        xp = '//td[@colspan="5"]/a[@style="font-weight:bold;"]/@href'
        for url in response.xpath(xp).extract():
            manga = Manga(url=url)
            meta = {'manga': manga}
            request = scrapy.Request(url, self.parse_collection, meta=meta)
            yield request

        # Check the oldest update date
        xp = '//td[contains(@style, "font-size: 11px")]/text()'
        update_dates = response.xpath(xp).extract()
        if not update_dates:
            # Nothing to compare: `[-1]` on an empty list would raise
            # IndexError.
            return
        update_date = convert_to_date(update_dates[-1].strip())
        if update_date < until:
            return

        # Next page
        xp = '//a[@title="Older Releases"]/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
Example #20
0
    def parse_catalog(self, response):
        """Walk a page of titles and request every manga found.

        One `parse_collection` request is yielded per cell of the
        rows, carrying a `Manga` item with the absolute URL.  When
        the pagination has a page after the active one, it is
        requested and handled again by this method.

        @url https://mangadex.org/titles/2/1
        @returns items 0
        @returns request 40
        """

        title_xp = './/a[@class="manga_title"]/@href'
        for cell in response.xpath('//div[@class="row"]/div'):
            manga = Manga()
            href = cell.xpath(title_xp).extract_first()
            manga['url'] = response.urljoin(href)
            yield response.follow(href, self.parse_collection,
                                  meta={'manga': manga})

        # Pagination
        pager_xp = '//ul[@class="pagination"]/li[@class="active"]' \
            '/following-sibling::li[@class="paging"]/a/@href'
        following = response.xpath(pager_xp).extract_first()
        if not following:
            return
        yield response.follow(following, self.parse_catalog)
Example #21
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Yield a `parse_collection` request for every `chapter` link
        of the page.  The next page is requested (handled again by
        this method, with `until` in the meta) only while the last
        date of the page is not older than `until`.  A page without
        dates stops the pagination without raising.

        The `until` date is resolved from the argument, then from
        `response.meta`, then defaults to today.

        @url https://www.mangareader.net/latest
        @returns items 0
        @returns request 25 100
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`
        xp = '//a[@class="chapter"]/@href'
        for url in response.xpath(xp).extract():
            url = response.urljoin(url)
            manga = Manga(url=url)
            meta = {'manga': manga}
            yield response.follow(url, self.parse_collection, meta=meta)

        # Check the oldest update date
        xp = '//td[@class="c1"]/text()'
        update_dates = response.xpath(xp).extract()
        if not update_dates:
            # Nothing to compare: `[-1]` on an empty list would raise
            # IndexError.
            return
        update_date = convert_to_date(update_dates[-1])
        if update_date < until:
            return

        # Next page
        xp = '//div[@id="latest"]/div[@id="sp"]/a[contains(., ">")]/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            meta = {'until': until}
            yield response.follow(next_url, self.parse_latest, meta=meta)
Example #22
0
    def parse_catalog(self, response):
        """Generate the catalog (list of mangas) of the site.

        Yield one `parse_collection` request per item of the series
        list, carrying a `Manga` item with the absolute URL.  The
        next page number is read from the last `onClickPage(...)`
        handler found in the page source; when no handler is found,
        no next page is requested.

        @url https://bato.to/browse?page=2
        @returns items 0
        @returns request 50 70
        """

        xp = '//div[@id="series-list"]//div[@class="item-text"]'
        for item in response.xpath(xp):
            manga = Manga()
            # URL
            xp = './a/@href'
            url = item.xpath(xp).extract_first()
            manga['url'] = response.urljoin(url)
            meta = {'manga': manga}
            yield response.follow(url, self.parse_collection, meta=meta)

        # Next page
        re_ = r'@click="onClickPage\((.*)\)"'
        # `response.text` is the decoded body (it replaces the
        # deprecated `body_as_unicode()`).
        pages = re.findall(re_, response.text)
        # A page without pagination handlers gives an empty list, and
        # `[-1]` would raise IndexError.
        next_page_number = pages[-1] if pages else None
        if next_page_number:
            next_url = NEXT_PAGE % next_page_number
            yield response.follow(next_url, self.parse_catalog)
Example #23
0
    def parse_collection(self, response, manga=None):
        """Fill the manga fields and request its list of chapters.

        The manga is taken from `response.meta` when present (the
        `manga` argument is not read), else a new one is built from
        the response URL.  Nothing is returned when the manga has
        no chapters; otherwise the request for the first page of
        chapters is returned, handled by `_parse_issues`, with the
        manga and the number of chapters in the meta.

        @url https://mangadex.org/manga/39/one-piece
        @returns items 0
        @returns request 1
        """

        manga = (response.meta['manga'] if 'manga' in response.meta
                 else Manga(url=response.url))

        # Template for the rows of the information table, selected
        # by the text of the header cell
        row_xp = '//th[contains(text(),"%s")]' \
            '/following-sibling::td/descendant-or-self::*/text()'

        def row(label):
            """Select the text nodes of the row with this label."""
            return response.xpath(row_xp % label)

        manga['url'] = response.url
        manga['name'] = response.xpath(
            '//h3[@class="panel-title"]/text()').extract()
        manga['alt_name'] = row('Alt name(s):').extract()
        manga['author'] = row('Author:').re(r'([^,]+),?')
        manga['artist'] = row('Artist:').re(r'([^,]+),?')
        manga['reading_direction'] = response.xpath(
            '//h3[@class="panel-title"]/img/@alt').extract_first()
        manga['status'] = row('Pub. status:').extract_first()
        # Genres: the demographic goes first
        manga['genres'] = (row('Demographic:').extract()
                           + row('Genres:').extract())
        rating = row('Rating:').extract_first()
        manga['rank'] = 100 * convert_to_number(rating)
        manga['rank_order'] = 'DESC'
        manga['description'] = row('Description:').extract()
        cover = response.xpath(
            '//img[@class="border-radius"]/@src').extract_first()
        manga['image_urls'] = [response.urljoin(cover)]

        # Number of chapters, needed to deduce the issue order: use
        # the counter of the page or, without it, count the rows
        counter = response.xpath(
            '//p[@class="text-center"]/text()').re_first(r'of (.*) chapters')
        if counter:
            chapters = convert_to_number(counter, as_int=True)
        else:
            chapters = len(response.xpath('//tr[contains(@id,"chapter_")]'))

        # If the manga is empty (is frequent in MangaDex), end the
        # processing
        if not chapters:
            return None

        # The issues are collected by `_parse_issues`
        manga['issues'] = []
        return response.follow(response.url + '/chapters/1',
                               self._parse_issues,
                               meta={'manga': manga, 'chapters': chapters})
Example #24
0
    def parse_collection(self, response, manga=None):
        """Fill a manga, with its list of issues, from the manga page.

        The manga is taken from `response.meta` when present (the
        `manga` argument is not read), else a new one is built from
        the response URL.  The manga is returned with one issue per
        row of the listing table.

        @url http://kissmanga.com/Manga/Naruto
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        manga = (response.meta['manga'] if 'manga' in response.meta
                 else Manga(url=response.url))

        # Templates for the labeled fields: the value can be in the
        # sibling links, in the next text node or in the next
        # paragraph
        link_xp = '//span[@class="info" and contains(text(), "%s")]' \
                  '/following-sibling::a/text()'
        text_xp = '//span[@class="info" and contains(text(), "%s")]' \
                  '/following-sibling::text()[1]'
        para_xp = '//p[span[@class="info" and contains(text(), "%s")]]'\
                  '/following-sibling::p[1]/text()'

        manga['url'] = response.url
        manga['name'] = response.xpath(
            '//div[@class="barContent"]//a[@class="bigChar"]/text()').extract()
        manga['alt_name'] = response.xpath(link_xp % 'Other name:').extract()
        manga['author'] = response.xpath(link_xp % 'Author:').extract()
        # The artist is set to the same value as the author
        manga['artist'] = manga['author']
        manga['reading_direction'] = 'RL'
        manga['genres'] = response.xpath(link_xp % 'Genres:').extract()
        manga['status'] = response.xpath(text_xp % 'Status:').extract()
        manga['rank'] = response.xpath(text_xp % 'Views:').re(r'(\d+).')
        manga['rank_order'] = 'DESC'
        manga['description'] = response.xpath(para_xp % 'Summary:').extract()
        cover = response.xpath('//div[@id="rightside"]//img/@src').extract_first()
        manga['image_urls'] = [response.urljoin(cover)]

        # Some examples that the number regex needs to address
        #   1/11 Vol.003 Ch.009.006: Omake 004-koma
        #   21st Century Boys 014
        #   Mob Psycho 100 Ch.099.001: Mob
        #   Mob Psycho 100 Ch.098.002
        #   Fantastic World Vol.001 Ch.002
        #   Black Clover 118 - Mage X
        #   Black Clover 099: Family
        number_re = (
            r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)(\d[.\d]+)'
            r'|(\d[.\d]+)[ :-]+'
            r'|(\d[.\d]+)$')

        # Issues: the order counts down from the number of rows
        manga['issues'] = []
        rows = response.xpath('//table[@class="listing"]/tr[td]')
        total = len(rows)
        for position, row in enumerate(rows):
            title = row.xpath('.//a/text()')
            issue = Issue(language='EN')
            issue['name'] = title.extract()
            issue['number'] = title.re(number_re)
            issue['order'] = total - position
            issue['release'] = row.xpath('./td[2]/text()').re(
                r'\d{1,2}/\d{1,2}/\d{4}')
            href = row.xpath('.//a/@href').extract_first()
            issue['url'] = response.urljoin(href)
            manga['issues'].append(issue)
        return manga
Example #25
0
    def parse_collection(self, response, manga=None):
        """Fill a manga, with its list of issues, from the manga page.

        The manga is taken from `response.meta` when present (the
        `manga` argument is not read), else a new one is built from
        the response URL.  Nothing is returned for error pages,
        licensed mangas and mangas without chapters; otherwise the
        manga is returned with one issue per line of the chapter
        list.

        @url http://www.mangahere.cc/manga/angel_densetsu/
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        manga = (response.meta['manga'] if 'manga' in response.meta
                 else Manga(url=response.url))

        # MangaHere returns 200 for 404 pages
        not_found_xp = '//div[@class="error_404"]'
        # Block present when the manga is licensed
        licensed_xp = \
            '//div[@class="detail_list"]/div[@class="mt10 color_ff00 mb10"]'
        if (response.xpath(not_found_xp).extract()
                or response.xpath(licensed_xp).extract()):
            return None

        # Templates for the labeled fields: the value can be in the
        # text of the item, in its links or in the `show` paragraph
        text_xp = '//li[label[contains(text(),"%s")]]/text()'
        link_xp = '//li[label[contains(text(),"%s")]]/a/text()'
        show_xp = '//li[label[contains(text(),"%s")]]/p[@id="show"]/text()'

        manga['url'] = response.url
        manga['name'] = response.xpath(
            '//meta[@property="og:title"]/@content').extract()
        manga['alt_name'] = response.xpath(
            text_xp % 'Alternative Name:').re(r'([^;]+)')
        manga['author'] = response.xpath(link_xp % 'Author(s):').extract()
        manga['artist'] = response.xpath(link_xp % 'Artist(s):').extract()
        manga['reading_direction'] = 'RL'
        manga['status'] = response.xpath(text_xp % 'Status:').extract_first()
        manga['genres'] = response.xpath(text_xp % 'Genre(s):').re(r'([^,]+)')
        manga['rank'] = response.xpath(text_xp % 'Rank:').extract()
        manga['rank_order'] = 'ASC'
        manga['description'] = response.xpath(show_xp % 'Summary:').extract()
        manga['image_urls'] = response.xpath('//img[@class="img"]/@src').extract()

        # Issues
        manga['issues'] = []
        rows = response.xpath('//div[@class="detail_list"]/ul[not(@class)]/li')
        total = len(rows)

        # A single line with this message means that there are no
        # chapters
        if total == 1 and 'No Manga Chapter' in rows[0].extract():
            return None

        # The order counts down from the number of lines
        for position, row in enumerate(rows):
            issue = Issue(language='EN')
            # The name is composed from three parts of the line
            link_text = row.xpath('.//a/text()').extract()
            extra_text = row.xpath('.//span[@class="mr6"]/text()').extract()
            left_text = row.xpath('.//span[@class="left"]/text()').extract()
            issue['name'] = link_text + extra_text + left_text
            issue['number'] = row.xpath('.//a/text()').re(
                r'([.\d]+)\s*$')
            issue['order'] = total - position
            issue['release'] = row.xpath(
                './/span[@class="right"]/text()').extract()
            href = row.xpath('.//a/@href').extract_first()
            issue['url'] = response.urljoin(href)
            manga['issues'].append(issue)
        return manga
Example #26
0
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        Rows are grouped by the first CSS class of each `tr`: a row
        whose class differs from the previous one opens a new manga,
        and the rows that repeat that class are its issues. The scan
        stops at the first issue released before `until`; the issues
        already collected for the current manga are still emitted.

        :param response: page listing the latest releases
        :param until: oldest release date to accept; when empty it is
            taken from ``response.meta['until']`` or, failing that,
            from ``date.today()``
        :returns: generator of `Manga` items, followed by a request
            for the "Older Releases" page when that link exists

        @url http://bato.to
        @returns items 1 100
        @returns request 0 1
        @scrapes url name issues
        """

        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        xp = './/tr[contains(@class, "row")]'
        last_row, manga = None, None
        for update in response.xpath(xp):
            # Only the first class token is used to tell groups apart
            row = update.xpath('@class').extract()[0].split()[0]
            if row != last_row:
                # A new group starts: emit the manga collected so far
                if manga:
                    yield manga
                manga = Manga(issues=[])
                # Name
                xp = 'td[2]/a[2]/text()'
                manga['name'] = update.xpath(xp).extract()
                # URL
                xp = 'td[2]/a[2]/@href'
                manga['url'] = update.xpath(xp).extract()
            else:
                issue = Issue()
                # Name
                xp = './/td/a[img/@style="vertical-align:middle;"]/text()'
                issue['name'] = update.xpath(xp).extract()
                # Number
                issue['number'] = update.xpath(xp).re(
                    r'Ch.(?:Story )?([.\d]+)')
                # Order
                # This is only an estimation for now
                issue['order'] = issue['number']
                # Language
                xp = './/td/div/@title'
                issue['language'] = update.xpath(xp).extract()
                # Release
                xp = './/td[last()]/text()'
                issue['release'] = update.xpath(xp).extract()
                # URL
                xp = './/td/a[img/@style="vertical-align:middle;"]/@href'
                url = update.xpath(xp).extract()[0]
                issue['url'] = response.urljoin(url)

                # Check if is a new update
                update_date = convert_to_date(issue['release'][0].strip())
                if update_date < until:
                    # Stop here, but do not lose the newer issues that
                    # were already attached to the current manga.
                    if manga['issues']:
                        yield manga
                    return

                manga['issues'].append(issue)
            last_row = row

        # Return the last manga
        if manga:
            yield manga

        # Next page
        xp = '//a[@title="Older Releases"]/@href'
        next_url = response.xpath(xp).extract()
        if next_url:
            next_url = response.urljoin(next_url[0])
            # Carry the limit date over to the next page
            meta = {'until': until}
            yield scrapy.Request(next_url, self.parse_latest, meta=meta)
Example #27
0
    def parse_collection(self, response, manga=None):
        """Fill a manga item, and its issues, from the comic page

        The item to fill is ``response.meta['manga']`` when the request
        carried one; otherwise a new `Manga` is created for
        ``response.url``. The `manga` argument itself is not read.

        @url http://bato.to/comic/_/comics/angel-densetsu-r460
        @returns items 1 1
        @returns request 0 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description issues
        """

        meta = response.meta
        manga = meta['manga'] if 'manga' in meta else Manga(url=response.url)

        # Each template selects content of the `td` cells that follow
        # the `td` holding the given label.
        cell = '//td[contains(text(),"%s")]/following-sibling::td'
        nested_text = cell + '/*/text()'
        own_text = cell + '/text()'
        genre_text = cell + '/a/span/text()'

        # Batoto can have different URLs for comics (_/comics/,
        # _/sp/), so the item always keeps the URL of the response.
        manga['url'] = response.url
        manga['name'] = response.xpath(
            '//h1[@class="ipsType_pagetitle"]/text()').extract()
        manga['alt_name'] = response.xpath(
            nested_text % 'Alt Names:').re(r'([^,;]+)')
        manga['author'] = response.xpath(nested_text % 'Author:').extract()
        manga['artist'] = response.xpath(nested_text % 'Artist:').extract()
        manga['reading_direction'] = 'RL'
        manga['status'] = response.xpath(own_text % 'Status:').extract()
        manga['genres'] = response.xpath(genre_text % 'Genres:').extract()
        manga['description'] = response.xpath(
            own_text % 'Description:').extract()
        manga['image_urls'] = response.xpath(
            '//div[@class="ipsBox"]/div/div/img/@src').extract()

        # Issues: rows whose class contains "chapter_row" but not
        # "chapter_row_expand"
        manga['issues'] = []
        rows = response.xpath(
            '//tr[contains(@class,"chapter_row")'
            ' and not(contains(@class,"chapter_row_expand"))]')
        total = len(rows)
        for index, row in enumerate(rows):
            issue = Issue()
            # Name and number come from the same link text
            title = row.xpath('./td[1]/a/text()')
            issue['name'] = title.extract()
            issue['number'] = title.re(r'Ch.(?:Story )?([.\d]+)')
            # The order counts down: the first row gets the highest value
            issue['order'] = total - index
            issue['language'] = row.xpath('./td[2]/div/@title').extract()
            issue['release'] = row.xpath('./td[5]/text()').extract()
            href = row.xpath('./td[1]/a/@href').extract()[0]
            issue['url'] = response.urljoin(href)
            manga['issues'].append(issue)
        yield manga
Example #28
0
    def parse_latest(self, response, until=None):
        """Yield the mangas updated on or after a date

        When `until` is empty it is read from ``response.meta['until']``
        or, if that key is missing, set to ``date.today()``. After the
        items, a request for the next page is produced when the page
        has a "next" link.

        @url http://www.mangahere.co/latest/
        @returns items 1 100
        @returns request 0 1
        @scrapes url name issues
        """

        if not until:
            until = (response.meta['until'] if 'until' in response.meta
                     else date.today())

        for entry in response.xpath('//div[@class="manga_updates"]/dl'):
            # An entry older than the limit ends the scan, next page
            # included
            stamp = entry.xpath('.//span[@class="time"]/text()').extract()
            update_date = convert_to_date(stamp[0])
            if update_date < until:
                return

            manga = Manga()
            manga['name'] = entry.xpath(
                './/a[@class="manga_info"]/text()').extract()
            link = entry.xpath('.//a[@class="manga_info"]/@href').extract()
            manga['url'] = response.urljoin(link[0])

            # One issue per `dd` of the entry
            manga['issues'] = []
            for chapter in entry.xpath('./dd'):
                issue = Issue(language='EN')
                # Name and number come from the same link text
                title = chapter.xpath('a/text()')
                issue['name'] = title.extract()
                issue['number'] = title.re(r'([.\d]+)\s*$')
                # This is only an estimation for now
                issue['order'] = issue['number']
                # Every issue shares the date of the entry
                issue['release'] = update_date
                href = chapter.xpath('a/@href').extract()
                issue['url'] = response.urljoin(href[0])
                manga['issues'].append(issue)
            yield manga

        # Follow the pagination, keeping the same limit date
        older = response.xpath('//a[@class="next"]/@href').extract()
        if older:
            yield scrapy.Request(response.urljoin(older[0]),
                                 self.parse_latest,
                                 meta={'until': until})
Example #29
0
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        :param response: page of a single manga
        :param manga: not read; the item comes from
            ``response.meta['manga']`` or is created for
            ``response.url`` when the meta has none
        :returns: the `Manga` item with its `issues` list filled

        @url http://unionmangas.net/manga/bleach
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description image_urls issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//div[@class="col-md-12"]/h2/text()'
        manga['name'] = response.xpath(xp).extract_first()
        # Alternate name
        manga['alt_name'] = manga['name']
        # Author
        xp = '//label[contains(text(), "%s")]/following-sibling::text()'
        manga['author'] = response.xpath(xp % 'Autor:').extract()
        # Artist
        manga['artist'] = response.xpath(xp % 'Artista:').extract()
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status (Ativo / Completo)
        xp = '//label[contains(text(), "Status:")]' \
             '/following-sibling::span/text()'
        manga['status'] = response.xpath(xp).extract()
        # Genres
        # The label is "Genero(s):" with a circumflex on the first "e"
        # (U+00EA). It is written as an escape so the literal does not
        # depend on the encoding used to read this file.
        xp = u'//label[contains(text(), "G\xeanero(s):")]' \
             u'/following-sibling::a/text()'
        manga['genres'] = response.xpath(xp).extract()
        # Description
        xp = '//div[@class="panel-body"]/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//img[@class="img-thumbnail"]/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//div[@class="col-xs-6 col-md-6"]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='PT')
            # Name
            xp = 'a/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            issue['number'] = line.xpath(xp).re(r'Cap. ([.\d]+)$')
            # Order
            # Counts down: the first line gets the highest value
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = 'span/text()'
            issue['release'] = line.xpath(xp).re(r'\d{2}/\d{2}/\d{4}')
            # URL
            xp = 'a/@href'
            issue['url'] = line.xpath(xp).extract()
            manga['issues'].append(issue)
        return manga
Example #30
0
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        Every row of the chapter table produces one issue per language
        found in its third column.

        :param response: page of a single manga
        :param manga: not read; the item comes from
            ``response.meta['manga']`` or is created for
            ``response.url`` when the meta has none
        :returns: the `Manga` item with its `issues` list filled

        @url http://submanga.org/bloody-cross
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres description image_urls issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # URL
        manga['url'] = response.url
        # Name
        xp = '//a[@class="btn-link text-semibold text-mint"][2]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//span[contains(text(), "%s")]/following-sibling::text()'
        # The label is "Titulos alternativos:" with an acute accent on
        # the first "i" (U+00ED). It is written as an escape so the
        # literal does not depend on the encoding used to read this
        # file.
        title = u'T\xedtulos alternativos:'
        manga['alt_name'] = response.xpath(xp % title).re(r'([^,;]+)')
        # Author
        manga['author'] = response.xpath(xp % 'Autor:').extract()
        # Artist
        manga['artist'] = manga['author']
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status (Finalizado / En curso)
        xp = '//span[@class="text-2x text-thin"]/text()'
        manga['status'] = response.xpath(xp).extract()
        # Genres
        xp = '//span[contains(text(), "Generos:")]/following-sibling::a/text()'
        manga['genres'] = response.xpath(xp).extract()
        # Description
        xp = '//p[@class="text-justify"]/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//img[@class="img-cover-m"]/@src'
        manga['image_urls'] = response.xpath(xp).extract()

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//table[@id="caps-list"]//tr'
        lines = response.xpath(xp)
        for line in lines:
            # Rows without language icons produce no issue at all
            xp = 'td[3]/a/img/@alt'
            langs = line.xpath(xp).extract()
            for lang in langs:
                issue = Issue(language=lang)
                # Name
                xp = 'td[2]/a/text()'
                issue['name'] = line.xpath(xp).extract()
                # Number
                xp = 'td[1]/a/text()'
                issue['number'] = line.xpath(xp).extract_first()
                # Order
                # NOTE(review): int() raises when the number is missing
                # or is not an integer (e.g. "10.5") -- confirm the
                # site never lists such chapters.
                issue['order'] = int(issue['number'])
                # Release
                xp = 'td[4]/a/span/text()'
                issue['release'] = line.xpath(xp).extract()
                # URL
                # The language is appended to the chapter link
                xp = 'td[1]/a/@href'
                url = line.xpath(xp).extract_first()
                issue['url'] = '%s/%s' % (url, lang)
                manga['issues'].append(issue)
        return manga